llama_cpp 0.6.0 → 0.7.1

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -19,7 +19,7 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

- #define CL_DMMV_BLOCK_SIZE 32
+ #define CL_DMMV_LOCAL_SIZE 32

  #ifndef K_QUANTS_PER_ITERATION
  #define K_QUANTS_PER_ITERATION 1
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8

  __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int n = tid / 32;
  const int l = tid - 32 * n;
  const int is = 8 * n + l / 16;

  const uint8_t q = x[i].qs[32 * n + l];
- __global float *y = yy + i * QK_K + 128 * n;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * n;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
  __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
  {
  int r = get_local_id(0) / 4;
- int i = get_group_id(0);
+ int i = get_group_id(0) + get_global_offset(0);
  int tid = r / 2;
  int is0 = r % 2;
  int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
  float d_all = vload_half(0, &x[i].d);
  float dl = d_all * (us - 32);

- __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
  const __global uint8_t *q = x[i].qs + 32 * n;
  const __global uint8_t *hm = x[i].hmask;

@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa

  __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int il = tid / 8;
  const int ir = tid % 8;
  const int is = 2 * il;
  const int n = 4;

- __global float *y = yy + i * QK_K + 64 * il + n * ir;
+ __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa

  __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int il = tid / 16;
  const int ir = tid % 16;
  const int is = 2 * il;

- __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+ __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa

  __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int ip = tid / 32;
  const int il = tid - 32 * ip;
  const int is = 8 * ip + il / 16;

- __global float *y = yy + i * QK_K + 128 * ip + il;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;

  const float d = vload_half(0, &x[i].d);

@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q2_K * x = xx + ib0;

@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q3_K * x = xx + ib0;

@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,

  const int row = get_group_id(0);
  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
  const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,

  const int row = get_group_id(0);
  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  const int tid = get_local_id(0)/2; // 0...15
  const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q6_K * x = xx + ib0;

@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
  const uint qk = QUANT_K;
  const uint qr = QUANT_R;

- const int ib = i/qk; // block index
+ const int ib = i/qk + get_global_offset(0); // block index
  const int iqs = (i%qk)/qr; // quant index
  const int iybs = i - i%qk; // y block start index
  const int y_offset = qr == 1 ? 1 : qk/2;
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {

  std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
  __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
- const int block_size = get_local_size(0);
+ const int local_size = get_local_size(0);
  const int row = get_group_id(0);
  const int tid = get_local_id(0);

  const uint qk = QUANT_K;
  const uint qr = QUANT_R;

+ const int col_step = local_size * 2;
  const int y_offset = qr == 1 ? 1 : qk/2;

+ x += get_global_offset(0);
+
  tmp[tid] = 0;

- for (int i = 0; i < ncols/block_size; i += 2) {
- const int col = i*block_size + 2*tid;
+ for (int col = tid*2; col < ncols; col += col_step) {
  const int ib = (row*ncols + col)/qk; // block index
  const int iqs = (col%qk)/qr; // quant index
  const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float

  // sum up partial sums and write back result
  barrier(CLK_LOCAL_MEM_FENCE);
- for (int s=block_size/2; s>0; s>>=1) {
+ for (int s=local_size/2; s>0; s>>=1) {
  if (tid < s) {
  tmp[tid] += tmp[tid + s];
  }
@@ -1349,30 +1351,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
  const enum ggml_type type = src->type;
  const size_t ts = ggml_type_size(type);
  const size_t bs = ggml_blck_size(type);
+ const uint64_t row_size = ts*ne0/bs;

- const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
- if (nb0 == ts && nb1 == ts*ne0/bs) {
- err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
- return err;
+ const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+ if (nb0 == ts && nb1 == row_size) {
+ return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
  }
  if (nb0 == ts) {
  const size_t buffer_origin[3] = { offset, 0, 0 };
  const size_t host_origin[3] = { 0, 0, 0 };
- const size_t region[3] = { ts*ne0/bs, ne1, 1 };
- err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
- return err;
+ const size_t region[3] = { row_size, ne1, 1 };
+ return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
  }
+ std::vector<cl_event> events;
+ if (ev && ne1>1) events.reserve(ne1-1);
  for (uint64_t i1 = 0; i1 < ne1; i1++) {
  // pretend the row is a matrix with cols=1
- const size_t buffer_origin[3] = { offset, i1, 0 };
+ const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
  const size_t host_origin[3] = { 0, 0, 0 };
- const size_t region[3] = { ts/bs, ne0, 1 };
- err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+ const size_t region[3] = { ts, ne0/bs, 1 };
+ // if an event is requested, make the last write wait for all previous writes to complete
+ if (ev && i1) {
+ events.push_back(*ev);
+ }
+ cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+ err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
  if (err != CL_SUCCESS) {
- break;
+ for (auto event : events) {
+ clReleaseEvent(event);
+ }
+ return err;
  }
  }
- return err;
+ for (auto event : events) {
+ CL_CHECK(clReleaseEvent(event));
+ }
+ return CL_SUCCESS;
  }

  static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
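Note on the hunk above: the rewritten per-row path in ggml_cl_h2d_tensor_2d collects the events of the earlier writes and makes the final clEnqueueWriteBufferRect wait on all of them, so a caller that waits only on *ev observes every row copy as complete. Below is a minimal sketch of that event-chaining pattern, using plain clEnqueueWriteBuffer and made-up names (write_rows, row_bytes, nrows are illustrative, not part of the package):

// Sketch only: chain events so that waiting on the last write's event
// guarantees all previous writes have finished.
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include <vector>

static cl_int write_rows(cl_command_queue queue, cl_mem dst, const char *src,
                         size_t row_bytes, size_t nrows, cl_event *ev_out) {
    std::vector<cl_event> events;
    if (ev_out && nrows > 1) events.reserve(nrows - 1);
    cl_int err = CL_SUCCESS;
    for (size_t i = 0; i < nrows; ++i) {
        if (ev_out && i > 0) {
            events.push_back(*ev_out);          // keep the previous write's event
        }
        // only the last write waits on the accumulated events
        const cl_uint nev = (i == nrows - 1) ? (cl_uint) events.size() : 0;
        err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, i*row_bytes, row_bytes,
                                   src + i*row_bytes,
                                   nev, nev ? events.data() : nullptr, ev_out);
        if (err != CL_SUCCESS) {
            break;
        }
    }
    for (cl_event e : events) {
        clReleaseEvent(e);                      // the runtime holds its own references
    }
    return err;
}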
@@ -1476,10 +1490,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
  const float alpha = 1.0f;
  const float beta = 0.0f;
  const int x_ne = ne01 * ne00;
@@ -1498,13 +1517,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ size_t x_offset = 0;
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy data to device
- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend == GGML_BACKEND_GPU) {
+ x_offset = (i03 * ne02 + i02) * x_ne;
+ } else if (i02 != pi02 || i03 != pi03) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ pi02 = i02;
+ pi03 = i03;
  }
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

  CL_CHECK(clFinish(queue));

@@ -1514,7 +1545,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  clblast::Transpose::kYes, clblast::Transpose::kNo,
  ne01, ne11, ne10,
  alpha,
- d_X, 0, ne00,
+ d_X, x_offset, ne00,
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
@@ -1525,7 +1556,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  }

  // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
  }
  }
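Note on the hunks above: the new i13/i12 loops broadcast src0 across the batch dimensions of src1; r2 = ne12/ne02 and r3 = ne13/ne03 are the broadcast ratios, integer division maps each src1 slice to the src0 slice it reuses, and pi02/pi03 let the code skip the host-to-device copy of src0 when that slice has not changed. A small standalone illustration of the mapping (the concrete sizes are made up for the example):

// Sketch only: how i12 maps back to i02 when src1 has more batch slices
// than src0 (here ne02 = 2, ne12 = 6, so r2 = 3). The pi02 check mirrors
// the trick the diff uses to avoid re-uploading an unchanged slice.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne12 = 6;
    const int64_t r2 = ne12 / ne02;             // broadcast ratio = 3
    int64_t pi02 = -1;
    for (int64_t i12 = 0; i12 < ne12; ++i12) {
        const int64_t i02 = i12 / r2;           // 0,0,0,1,1,1
        const bool upload = (i02 != pi02);      // copy src0 slice only when it changes
        std::printf("i12=%lld -> i02=%lld%s\n", (long long) i12, (long long) i02,
                    upload ? "  (copy src0 slice)" : "");
        pi02 = i02;
    }
    return 0;
}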
@@ -1547,6 +1578,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb10 = src1->nb[0];
  const int nb11 = src1->nb[1];
@@ -1556,6 +1589,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
  const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
  const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
  const int x_ne = ne01 * ne00;
@@ -1577,32 +1613,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  bool src1_cont_rows = nb10 == sizeof(float);
  bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ size_t x_offset = 0;
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy src0 to device
- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend == GGML_BACKEND_GPU) {
+ x_offset = (i03 * ne02 + i02) * x_ne;
+ } else if (i02 != pi02 || i03 != pi03) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ pi02 = i02;
+ pi03 = i03;
  }

  // convert src1 to fp16
  // TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
- char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+ char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
  if (src1_cont_rows) {
  if (src1_cont_cols) {
  ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
  }
  else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
  }
  }
  }
  else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- for (int64_t i00 = 0; i00 < ne10; i00++) {
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
  // very slow due to no inlining
- tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+ tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
  }
  }
  }
@@ -1618,7 +1666,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  clblast::Transpose::kYes, clblast::Transpose::kNo,
  ne01, ne11, ne10,
  alpha,
- d_X, 0, ne00,
+ d_X, x_offset, ne00,
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
@@ -1631,7 +1679,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  // copy dst to host, then convert to float
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

  ggml_fp16_to_fp32_row(tmp, d, d_ne);
  }
@@ -1652,18 +1700,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];
  const ggml_type type = src0->type;
- const bool mul_mat_vec = ne11 == 1;
+ const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
+
+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;

  const float alpha = 1.0f;
  const float beta = 0.0f;
  const int x_ne = ne01 * ne00;
  const int y_ne = ne11 * ne10;
  const int d_ne = ne11 * ne01;
- const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+ const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+ const size_t q_sz = ggml_type_size(type) * x_bps;

  size_t x_size;
  size_t y_size;
@@ -1685,17 +1739,28 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  GGML_ASSERT(to_fp32_cl != nullptr);

  const size_t global_denom = ggml_cl_global_denom(type);
- const size_t local = ggml_cl_local_size(type);
+ const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);

  size_t ev_idx = 0;
  std::vector<cl_event> events;

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy src0 to device if necessary
  if (src0->backend == GGML_BACKEND_CPU) {
- events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ if (i02 != pi02 || i03 != pi03) {
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ pi02 = i02;
+ pi03 = i03;
+ }
  } else if (src0->backend == GGML_BACKEND_GPU) {
  d_Q = (cl_mem) src0->extra;
  } else {
@@ -1704,11 +1769,11 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
  // copy src1 to device
  events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

  // compute
- const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
- const size_t local = CL_DMMV_BLOCK_SIZE;
+ const size_t global = ne01 * local;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  const cl_int ncols = ne00;
  events.emplace_back();
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1716,16 +1781,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
  CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
  CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
  } else { // general dequantization kernel + CLBlast matrix matrix multiplication
  // convert src0 to fp32 on device
  const size_t global = x_ne / global_denom;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

  // copy src1 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

  events.emplace_back();

@@ -1749,7 +1815,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  }

  // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
  for (auto *event : events) {
  clReleaseEvent(event);
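Note on the hunks above: ggml_cl_mul_mat_q_f32 now passes a non-NULL global work offset to clEnqueueNDRangeKernel, which is what makes get_global_offset(0) non-zero inside the kernels patched earlier in this diff; get_group_id(0) is not shifted by that offset, which is why the kernels add it explicitly. A self-contained sketch of the mechanism, with made-up kernel and variable names and minimal error handling:

// Sketch only: launch a 1-D kernel with a non-NULL global work offset and
// read it back in the kernel with get_global_offset(0).
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include <cstdio>

static const char *ksrc =
    "__kernel void fill_block_id(__global int *out) {\n"
    "    // get_group_id(0) is not shifted by the offset, so add it explicitly\n"
    "    const size_t block = get_group_id(0) + get_global_offset(0);\n"
    "    out[get_global_id(0)] = (int) block;\n"
    "}\n";

int main() {
    cl_platform_id plat; cl_device_id dev;
    clGetPlatformIDs(1, &plat, NULL);
    clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);
    cl_command_queue queue = clCreateCommandQueue(ctx, dev, 0, NULL);

    cl_program prog = clCreateProgramWithSource(ctx, 1, &ksrc, NULL, NULL);
    clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
    cl_kernel kernel = clCreateKernel(prog, "fill_block_id", NULL);

    int host[128] = {0};
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(host), NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);

    // 64 work-items in groups of 32, starting at global offset 64:
    // global ids run 64..127, get_global_offset(0) == 64, group ids are 0 and 1.
    const size_t offset = 64, global = 64, local = 32;
    clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global, &local, 0, NULL, NULL);
    clEnqueueReadBuffer(queue, buf, CL_TRUE, 0, sizeof(host), host, 0, NULL, NULL);

    std::printf("out[64]=%d out[96]=%d\n", host[64], host[96]);  // prints 64 and 65

    clReleaseMemObject(buf); clReleaseKernel(kernel); clReleaseProgram(prog);
    clReleaseCommandQueue(queue); clReleaseContext(ctx);
    return 0;
}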
@@ -1844,17 +1910,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
  const int64_t ne3 = tensor->ne[3];

  const ggml_type type = tensor->type;
- const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+ const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+ const size_t q_sz = s_sz * (size_t) (ne2 * ne3);

  size_t q_size;
  cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

  tensor->data = data;
  // copy tensor to device
+ size_t offset = 0;
  for (int64_t i3 = 0; i3 < ne3; i3++) {
  for (int64_t i2 = 0; i2 < ne2; i2++) {
- int i = i3*ne2 + i2;
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+ offset += s_sz;
  }
  }