llama_cpp 0.5.3 → 0.7.0

@@ -24,12 +24,59 @@ typedef struct {
  int8_t qs[QK8_0]; // quants
  } block_q8_0;

+ // general-purpose kernel for addition of two tensors
+ // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
+ // cons: not very efficient
  kernel void kernel_add(
- device const float4 * src0,
- device const float4 * src1,
- device float4 * dst,
- uint tpig[[thread_position_in_grid]]) {
- dst[tpig] = src0[tpig] + src1[tpig];
+ device const char * src0,
+ device const char * src1,
+ device char * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant int64_t & ne03,
+ constant int64_t & nb00,
+ constant int64_t & nb01,
+ constant int64_t & nb02,
+ constant int64_t & nb03,
+ constant int64_t & ne10,
+ constant int64_t & ne11,
+ constant int64_t & ne12,
+ constant int64_t & ne13,
+ constant int64_t & nb10,
+ constant int64_t & nb11,
+ constant int64_t & nb12,
+ constant int64_t & nb13,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant int64_t & ne2,
+ constant int64_t & ne3,
+ constant int64_t & nb0,
+ constant int64_t & nb1,
+ constant int64_t & nb2,
+ constant int64_t & nb3,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]],
+ uint3 ntg[[threads_per_threadgroup]]) {
+ const int64_t i03 = tgpig.z;
+ const int64_t i02 = tgpig.y;
+ const int64_t i01 = tgpig.x;
+
+ const int64_t i13 = i03 % ne13;
+ const int64_t i12 = i02 % ne12;
+ const int64_t i11 = i01 % ne11;
+
+ device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
+ device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
+ device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
+
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+ ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0];
+
+ src0_ptr += ntg.x*nb00;
+ src1_ptr += ntg.x*nb10;
+ dst_ptr += ntg.x*nb0;
+ }
  }

  // assumption: src1 is a row
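For context, the new kernel_add addressing mirrors ggml's broadcast convention: dims 1 to 3 of src1 are indexed modulo src1's extents, while dim 0 is walked directly. A minimal CPU sketch of the same addressing, assuming float data and the byte strides named above (an illustration, not code from the release):

    #include <cstdint>

    // hedged CPU sketch of the kernel's indexing; parameter names mirror the kernel arguments
    static void add_broadcast_f32(const char * src0, const char * src1, char * dst,
                                  int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
                                  int64_t nb00, int64_t nb01, int64_t nb02, int64_t nb03,
                                  int64_t ne11, int64_t ne12, int64_t ne13,
                                  int64_t nb10, int64_t nb11, int64_t nb12, int64_t nb13,
                                  int64_t nb0,  int64_t nb1,  int64_t nb2,  int64_t nb3) {
        for (int64_t i03 = 0; i03 < ne03; i03++)
        for (int64_t i02 = 0; i02 < ne02; i02++)
        for (int64_t i01 = 0; i01 < ne01; i01++)
        for (int64_t i00 = 0; i00 < ne00; i00++) {
            // broadcast: wrap the src1 indices into its (possibly smaller) extents in dims 1..3
            const int64_t i13 = i03 % ne13;
            const int64_t i12 = i02 % ne12;
            const int64_t i11 = i01 % ne11;
            const float a = *(const float *)(src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
            const float b = *(const float *)(src1 + i13*nb13 + i12*nb12 + i11*nb11 + i00*nb10);
            *(float *)(dst + i03*nb3 + i02*nb2 + i01*nb1 + i00*nb0) = a + b;
        }
    }

On the GPU the same mapping is split across the grid: one threadgroup per (i01, i02, i03), with the threads of the group striding over dim 0.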
@@ -38,7 +85,7 @@ kernel void kernel_add_row(
  device const float4 * src0,
  device const float4 * src1,
  device float4 * dst,
- constant int64_t & nb,
+ constant int64_t & nb [[buffer(27)]],
  uint tpig[[thread_position_in_grid]]) {
  dst[tpig] = src0[tpig] + src1[tpig % nb];
  }
@@ -783,7 +830,9 @@ kernel void kernel_alibi_f32(
  constant uint64_t & nb1,
  constant uint64_t & nb2,
  constant uint64_t & nb3,
- constant float & m0,
+ constant float & m0,
+ constant float & m1,
+ constant int & n_heads_log2_floor,
  uint3 tgpig[[threadgroup_position_in_grid]],
  uint3 tpitg[[thread_position_in_threadgroup]],
  uint3 ntg[[threads_per_threadgroup]]) {
@@ -799,37 +848,73 @@ kernel void kernel_alibi_f32(
  const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);

  device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
- float m_k = pow(m0, i2 + 1);
+ float m_k;
+ if (i2 < n_heads_log2_floor) {
+ m_k = pow(m0, i2 + 1);
+ } else {
+ m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1);
+ }
  for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
  device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
  dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
  }
  }
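The new m1 and n_heads_log2_floor arguments let the slope schedule also cover head counts that are not a power of two, following the ALiBi formulation. A hedged sketch of how the host side typically derives these constants (max_bias is an assumption, commonly 8.0f; this is not the released host code):

    #include <cmath>

    // hedged sketch: ALiBi slope for head h out of n_head heads
    static float alibi_slope(int h, int n_head, float max_bias) {
        const int n_heads_log2_floor = 1 << (int) std::floor(std::log2((double) n_head));
        const float m0 = std::pow(2.0f, -max_bias          / n_heads_log2_floor);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
        // same branch as in the kernel above
        return h < n_heads_log2_floor
            ? std::pow(m0, (float) (h + 1))
            : std::pow(m1, (float) (2*(h - n_heads_log2_floor) + 1));
    }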
 
+ typedef void (rope_t)(
+ device const void * src0,
+ device const int32_t * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant int64_t & ne03,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant uint64_t & nb03,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant int64_t & ne2,
+ constant int64_t & ne3,
+ constant uint64_t & nb0,
+ constant uint64_t & nb1,
+ constant uint64_t & nb2,
+ constant uint64_t & nb3,
+ constant int & n_past,
+ constant int & n_dims,
+ constant int & mode,
+ constant float & freq_base,
+ constant float & freq_scale,
+ uint tiitg[[thread_index_in_threadgroup]],
+ uint3 tptg[[threads_per_threadgroup]],
+ uint3 tgpig[[threadgroup_position_in_grid]]);
+
+ template<typename T>
  kernel void kernel_rope(
- device const void * src0,
- device float * dst,
- constant int64_t & ne00,
- constant int64_t & ne01,
- constant int64_t & ne02,
- constant int64_t & ne03,
- constant uint64_t & nb00,
- constant uint64_t & nb01,
- constant uint64_t & nb02,
- constant uint64_t & nb03,
- constant int64_t & ne0,
- constant int64_t & ne1,
- constant int64_t & ne2,
- constant int64_t & ne3,
- constant uint64_t & nb0,
- constant uint64_t & nb1,
- constant uint64_t & nb2,
- constant uint64_t & nb3,
- constant int & n_past,
- constant int & n_dims,
- constant int & mode,
- constant float & freq_base,
- constant float & freq_scale,
+ device const void * src0,
+ device const int32_t * src1,
+ device float * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant int64_t & ne03,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant uint64_t & nb03,
+ constant int64_t & ne0,
+ constant int64_t & ne1,
+ constant int64_t & ne2,
+ constant int64_t & ne3,
+ constant uint64_t & nb0,
+ constant uint64_t & nb1,
+ constant uint64_t & nb2,
+ constant uint64_t & nb3,
+ constant int & n_past,
+ constant int & n_dims,
+ constant int & mode,
+ constant float & freq_base,
+ constant float & freq_scale,
  uint tiitg[[thread_index_in_threadgroup]],
  uint3 tptg[[threads_per_threadgroup]],
  uint3 tgpig[[threadgroup_position_in_grid]]) {
@@ -839,7 +924,9 @@ kernel void kernel_rope(

  const bool is_neox = mode & 2;

- const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
+ device const int32_t * pos = src1;
+
+ const int64_t p = pos[i2];

  const float theta_0 = freq_scale * (float)p;
  const float inv_ndims = -1.f/n_dims;
@@ -851,11 +938,11 @@ kernel void kernel_rope(
  const float cos_theta = cos(theta);
  const float sin_theta = sin(theta);

- device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

- const float x0 = src[0];
- const float x1 = src[1];
+ const T x0 = src[0];
+ const T x1 = src[1];

  dst_data[0] = x0*cos_theta - x1*sin_theta;
  dst_data[1] = x0*sin_theta + x1*cos_theta;
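With p = pos[i2] the rotation angle now comes from an explicit per-token position tensor (src1) instead of n_past + i2; the rotation itself is unchanged. A minimal CPU sketch of one pair rotation in the non-neox path, using the same theta construction as the kernel (illustrative only):

    #include <cmath>

    // hedged sketch: rotate the pair (x0, x1) at even dimension index i0 for token position p
    static void rope_rotate_pair(float & x0, float & x1, int64_t p, int64_t i0, int n_dims,
                                 float freq_base, float freq_scale) {
        const float theta = freq_scale * (float) p * std::pow(freq_base, -1.0f * i0 / n_dims);
        const float cos_theta = std::cos(theta);
        const float sin_theta = std::sin(theta);
        const float v0 = x0, v1 = x1;
        x0 = v0*cos_theta - v1*sin_theta;
        x1 = v0*sin_theta + v1*cos_theta;
    }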
@@ -870,8 +957,8 @@ kernel void kernel_rope(

  const int64_t i0 = ib*n_dims + ic/2;

- device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+ device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

  const float x0 = src[0];
  const float x1 = src[n_dims/2];
@@ -883,6 +970,9 @@ kernel void kernel_rope(
  }
  }

+ template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
+ template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
+
  kernel void kernel_cpy_f16_f16(
  device const half * src0,
  device half * dst,
@@ -1273,8 +1363,8 @@ kernel void kernel_mul_mat_q3_K_f32(

  float yl[32];

- const uint16_t kmask1 = 0x3030;
- const uint16_t kmask2 = 0x0f0f;
+ //const uint16_t kmask1 = 0x3030;
+ //const uint16_t kmask2 = 0x0f0f;

  const int tid = tiisg/4;
  const int ix = tiisg%4;
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8

  __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int n = tid / 32;
  const int l = tid - 32 * n;
  const int is = 8 * n + l / 16;

  const uint8_t q = x[i].qs[32 * n + l];
- __global float *y = yy + i * QK_K + 128 * n;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * n;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
  __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
  {
  int r = get_local_id(0) / 4;
- int i = get_group_id(0);
+ int i = get_group_id(0) + get_global_offset(0);
  int tid = r / 2;
  int is0 = r % 2;
  int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
  float d_all = vload_half(0, &x[i].d);
  float dl = d_all * (us - 32);

- __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
  const __global uint8_t *q = x[i].qs + 32 * n;
  const __global uint8_t *hm = x[i].hmask;

@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa

  __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int il = tid / 8;
  const int ir = tid % 8;
  const int is = 2 * il;
  const int n = 4;

- __global float *y = yy + i * QK_K + 64 * il + n * ir;
+ __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa

  __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int il = tid / 16;
  const int ir = tid % 16;
  const int is = 2 * il;

- __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+ __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa

  __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int ip = tid / 32;
  const int il = tid - 32 * ip;
  const int is = 8 * ip + il / 16;

- __global float *y = yy + i * QK_K + 128 * ip + il;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;

  const float d = vload_half(0, &x[i].d);

@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
  const uint qk = QUANT_K;
  const uint qr = QUANT_R;

- const int ib = i/qk; // block index
+ const int ib = i/qk + get_global_offset(0); // block index
  const int iqs = (i%qk)/qr; // quant index
  const int iybs = i - i%qk; // y block start index
  const int y_offset = qr == 1 ? 1 : qk/2;
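All of these kernels now add get_global_offset(0) to their block index, so the host can dequantize a sub-range of a larger device buffer by launching with a non-NULL global_work_offset (the updated clEnqueueNDRangeKernel call further down in this diff does exactly that). A hedged sketch of such a launch; only clEnqueueNDRangeKernel is the real API, the other names are illustrative:

    #include <CL/cl.h>

    // hedged sketch: launch a dequantize kernel over a sub-range of quant blocks;
    // kernel arguments are assumed to have been set already with clSetKernelArg
    static cl_int dequantize_slice(cl_command_queue queue, cl_kernel dequant_kernel,
                                   size_t first_block, size_t global_size, size_t local_size) {
        const size_t offset = first_block;   // read back in-kernel via get_global_offset(0)
        return clEnqueueNDRangeKernel(queue, dequant_kernel, 1,
                offset > 0 ? &offset : NULL,                 // global_work_offset
                &global_size,
                local_size > 0 ? &local_size : NULL,
                0, NULL, NULL);
    }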
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
  const enum ggml_type type = src->type;
  const size_t ts = ggml_type_size(type);
  const size_t bs = ggml_blck_size(type);
+ const uint64_t row_size = ts*ne0/bs;

- const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
- if (nb0 == ts && nb1 == ts*ne0/bs) {
- err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
- return err;
+ const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+ if (nb0 == ts && nb1 == row_size) {
+ return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
  }
  if (nb0 == ts) {
  const size_t buffer_origin[3] = { offset, 0, 0 };
  const size_t host_origin[3] = { 0, 0, 0 };
- const size_t region[3] = { ts*ne0/bs, ne1, 1 };
- err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
- return err;
+ const size_t region[3] = { row_size, ne1, 1 };
+ return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
  }
+ std::vector<cl_event> events;
+ if (ev && ne1>1) events.reserve(ne1-1);
  for (uint64_t i1 = 0; i1 < ne1; i1++) {
  // pretend the row is a matrix with cols=1
- const size_t buffer_origin[3] = { offset, i1, 0 };
+ const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
  const size_t host_origin[3] = { 0, 0, 0 };
- const size_t region[3] = { ts/bs, ne0, 1 };
- err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+ const size_t region[3] = { ts, ne0/bs, 1 };
+ // if an event is requested, make the last write wait for all previous writes to complete
+ if (ev && i1) {
+ events.push_back(*ev);
+ }
+ cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+ err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
  if (err != CL_SUCCESS) {
- break;
+ for (auto event : events) {
+ clReleaseEvent(event);
+ }
+ return err;
  }
  }
- return err;
+ for (auto event : events) {
+ CL_CHECK(clReleaseEvent(event));
+ }
+ return CL_SUCCESS;
  }

  static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
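For readers following the rewritten fallback path above: the rectangular copy now treats each quant block as one row of the transfer, so a strided host row is packed densely into the device buffer. The parameters of the clEnqueueWriteBufferRect call map as follows (a commented restatement of the call above, not new behaviour):

    // region[0]        = ts        -> each copied "row" is one quant block of ts bytes
    // region[1]        = ne0/bs    -> number of blocks in one tensor row
    // buffer_row_pitch = ts        -> blocks land back to back on the device (row_size bytes per row)
    // host_row_pitch   = nb0       -> blocks are read from the host with the tensor's dim-0 stride
    // source pointer   = x + i1*nb1 -> start of host row i1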
@@ -1476,10 +1488,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
  const float alpha = 1.0f;
  const float beta = 0.0f;
  const int x_ne = ne01 * ne00;
@@ -1498,13 +1515,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ size_t x_offset = 0;
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy data to device
- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend == GGML_BACKEND_GPU) {
+ x_offset = (i03 * ne02 + i02) * x_ne;
+ } else if (i02 != pi02 || i03 != pi03) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ pi02 = i02;
+ pi03 = i03;
  }
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

  CL_CHECK(clFinish(queue));

@@ -1514,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  clblast::Transpose::kYes, clblast::Transpose::kNo,
  ne01, ne11, ne10,
  alpha,
- d_X, 0, ne00,
+ d_X, x_offset, ne00,
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
@@ -1525,7 +1554,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  }

  // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
  }
  }
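The replacement loops implement broadcast over dims 2 and 3: src1/dst may have r2 = ne12/ne02 times more slices than src0 in dim 2 (and r3 in dim 3), and each src0 slice is reused for all of them, with the pi02/pi03 cache avoiding redundant uploads. A small C++ restatement of the mapping (not the exact ggml code):

    #include <cstdint>

    // hedged restatement of the new broadcast loop structure
    static void for_each_slice(int64_t ne02, int64_t ne03, int64_t ne12, int64_t ne13) {
        const int64_t r2 = ne12 / ne02;   // src1/dst slices per src0 slice in dim 2
        const int64_t r3 = ne13 / ne03;   // same for dim 3
        int64_t pi02 = -1, pi03 = -1;     // last src0 slice uploaded to the device
        for (int64_t i13 = 0; i13 < ne13; i13++) {
            const int64_t i03 = i13 / r3;
            for (int64_t i12 = 0; i12 < ne12; i12++) {
                const int64_t i02 = i12 / r2;
                if (i02 != pi02 || i03 != pi03) {
                    // upload src0 slice (i02, i03) once; it is then reused r2*r3 times
                    pi02 = i02;
                    pi03 = i03;
                }
                // ... run the GEMM against src1/dst slice (i12, i13) ...
            }
        }
    }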
@@ -1547,6 +1576,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb10 = src1->nb[0];
  const int nb11 = src1->nb[1];
@@ -1556,6 +1587,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
  const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
  const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
  const int x_ne = ne01 * ne00;
@@ -1577,32 +1611,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  bool src1_cont_rows = nb10 == sizeof(float);
  bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ size_t x_offset = 0;
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy src0 to device
- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend == GGML_BACKEND_GPU) {
+ x_offset = (i03 * ne02 + i02) * x_ne;
+ } else if (i02 != pi02 || i03 != pi03) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ pi02 = i02;
+ pi03 = i03;
  }

  // convert src1 to fp16
  // TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
- char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+ char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
  if (src1_cont_rows) {
  if (src1_cont_cols) {
  ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
  }
  else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
  }
  }
  }
  else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- for (int64_t i00 = 0; i00 < ne10; i00++) {
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
  // very slow due to no inlining
- tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+ tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
  }
  }
  }
@@ -1618,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  clblast::Transpose::kYes, clblast::Transpose::kNo,
  ne01, ne11, ne10,
  alpha,
- d_X, 0, ne00,
+ d_X, x_offset, ne00,
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
@@ -1631,7 +1677,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  // copy dst to host, then convert to float
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

  ggml_fp16_to_fp32_row(tmp, d, d_ne);
  }
@@ -1652,18 +1698,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];
  const ggml_type type = src0->type;
  const bool mul_mat_vec = ne11 == 1;

+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
  const float alpha = 1.0f;
  const float beta = 0.0f;
  const int x_ne = ne01 * ne00;
  const int y_ne = ne11 * ne10;
  const int d_ne = ne11 * ne01;
- const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+ const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+ const size_t q_sz = ggml_type_size(type) * x_bps;

  size_t x_size;
  size_t y_size;
@@ -1690,12 +1742,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  size_t ev_idx = 0;
  std::vector<cl_event> events;

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy src0 to device if necessary
  if (src0->backend == GGML_BACKEND_CPU) {
- events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ if (i02 != pi02 || i03 != pi03) {
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ pi02 = i02;
+ pi03 = i03;
+ }
  } else if (src0->backend == GGML_BACKEND_GPU) {
  d_Q = (cl_mem) src0->extra;
  } else {
@@ -1704,7 +1767,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
  // copy src1 to device
  events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

  // compute
  const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1720,12 +1783,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  } else { // general dequantization kernel + CLBlast matrix matrix multiplication
  // convert src0 to fp32 on device
  const size_t global = x_ne / global_denom;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

  // copy src1 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

  events.emplace_back();

@@ -1749,7 +1813,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  }

  // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
  for (auto *event : events) {
  clReleaseEvent(event);
@@ -1844,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
  const int64_t ne3 = tensor->ne[3];

  const ggml_type type = tensor->type;
- const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+ const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+ const size_t q_sz = s_sz * (size_t) (ne2 * ne3);

  size_t q_size;
  cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

  tensor->data = data;
  // copy tensor to device
+ size_t offset = 0;
  for (int64_t i3 = 0; i3 < ne3; i3++) {
  for (int64_t i2 = 0; i2 < ne2; i2++) {
- int i = i3*ne2 + i2;
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+ offset += s_sz;
  }
  }
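Here s_sz is the size of one 2D slice in bytes of quant blocks, and the upload offset now advances in those units rather than by the old element count i*ne0*ne1. A worked example under the assumption of Q4_0 blocks (32 elements and 18 bytes each: a 2-byte fp16 scale plus 16 bytes of 4-bit quants):

    // hedged example: one 2D slice of a 4096 x 4096 Q4_0 tensor
    const int64_t ne0 = 4096, ne1 = 4096;
    const size_t  blck_size = 32;                              // elements per block
    const size_t  type_size = 18;                              // bytes per block
    const size_t  s_sz = type_size * (ne0 * ne1 / blck_size);  // 9,437,184 bytes per slice
    // the upload offset advances by s_sz after each (i2, i3) slice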
1926