llama_cpp 0.6.0 → 0.7.1

@@ -19,7 +19,7 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

- #define CL_DMMV_BLOCK_SIZE 32
+ #define CL_DMMV_LOCAL_SIZE 32

  #ifndef K_QUANTS_PER_ITERATION
  #define K_QUANTS_PER_ITERATION 1
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8

  __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int n = tid / 32;
  const int l = tid - 32 * n;
  const int is = 8 * n + l / 16;

  const uint8_t q = x[i].qs[32 * n + l];
- __global float *y = yy + i * QK_K + 128 * n;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * n;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
  __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
  {
  int r = get_local_id(0) / 4;
- int i = get_group_id(0);
+ int i = get_group_id(0) + get_global_offset(0);
  int tid = r / 2;
  int is0 = r % 2;
  int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
  float d_all = vload_half(0, &x[i].d);
  float dl = d_all * (us - 32);

- __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
  const __global uint8_t *q = x[i].qs + 32 * n;
  const __global uint8_t *hm = x[i].hmask;

@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa

  __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int il = tid / 8;
  const int ir = tid % 8;
  const int is = 2 * il;
  const int n = 4;

- __global float *y = yy + i * QK_K + 64 * il + n * ir;
+ __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa

  __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int il = tid / 16;
  const int ir = tid % 16;
  const int is = 2 * il;

- __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+ __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;

  const float dall = vload_half(0, &x[i].d);
  const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa

  __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
  {
- const int i = get_group_id(0);
+ const int i = get_group_id(0) + get_global_offset(0);
  const int tid = get_local_id(0);
  const int ip = tid / 32;
  const int il = tid - 32 * ip;
  const int is = 8 * ip + il / 16;

- __global float *y = yy + i * QK_K + 128 * ip + il;
+ __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;

  const float d = vload_half(0, &x[i].d);

@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q2_K * x = xx + ib0;

@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q3_K * x = xx + ib0;

@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,

  const int row = get_group_id(0);
  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
  const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,

  const int row = get_group_id(0);
  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  const int tid = get_local_id(0)/2; // 0...15
  const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
  const int row = get_group_id(0);

  const int num_blocks_per_row = ncols / QK_K;
- const int ib0 = row*num_blocks_per_row;
+ const int ib0 = row*num_blocks_per_row + get_global_offset(0);

  __global const struct block_q6_K * x = xx + ib0;

@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
  const uint qk = QUANT_K;
  const uint qr = QUANT_R;

- const int ib = i/qk; // block index
+ const int ib = i/qk + get_global_offset(0); // block index
  const int iqs = (i%qk)/qr; // quant index
  const int iybs = i - i%qk; // y block start index
  const int y_offset = qr == 1 ? 1 : qk/2;
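
Note: the kernel-side changes above all read the enqueue-time global work offset via get_global_offset(0), so the host can point a kernel at a sub-slice of a GPU-resident quantized buffer, while get_group_id(0) keeps the destination index 0-based. A minimal host-side sketch of that pattern (illustration only; queue, kernel, n_rows and block_offset are placeholders, not identifiers from this file):

#include <CL/cl.h>

// Illustration only: enqueue a 1-D range so that get_global_offset(0) inside the
// kernel returns block_offset, while get_group_id(0) still counts groups from 0.
static cl_int launch_with_block_offset(cl_command_queue queue, cl_kernel kernel,
                                       size_t n_rows, size_t block_offset) {
    const size_t local  = 32;              // work-group size (one group per output row)
    const size_t global = n_rows * local;  // total number of work-items
    const size_t offset = block_offset;    // consumed by the kernel only via get_global_offset(0)
    return clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global, &local,
                                  0, NULL, NULL);
}
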
@@ -745,19 +745,21 @@
 
  std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
  __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
- const int block_size = get_local_size(0);
+ const int local_size = get_local_size(0);
  const int row = get_group_id(0);
  const int tid = get_local_id(0);

  const uint qk = QUANT_K;
  const uint qr = QUANT_R;

+ const int col_step = local_size * 2;
  const int y_offset = qr == 1 ? 1 : qk/2;

+ x += get_global_offset(0);
+
  tmp[tid] = 0;

- for (int i = 0; i < ncols/block_size; i += 2) {
- const int col = i*block_size + 2*tid;
+ for (int col = tid*2; col < ncols; col += col_step) {
  const int ib = (row*ncols + col)/qk; // block index
  const int iqs = (col%qk)/qr; // quant index
  const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float

  // sum up partial sums and write back result
  barrier(CLK_LOCAL_MEM_FENCE);
- for (int s=block_size/2; s>0; s>>=1) {
+ for (int s=local_size/2; s>0; s>>=1) {
  if (tid < s) {
  tmp[tid] += tmp[tid + s];
  }
@@ -1349,30 +1351,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
  const enum ggml_type type = src->type;
  const size_t ts = ggml_type_size(type);
  const size_t bs = ggml_blck_size(type);
+ const uint64_t row_size = ts*ne0/bs;

- const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
- if (nb0 == ts && nb1 == ts*ne0/bs) {
- err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
- return err;
+ const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+ if (nb0 == ts && nb1 == row_size) {
+ return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
  }
  if (nb0 == ts) {
  const size_t buffer_origin[3] = { offset, 0, 0 };
  const size_t host_origin[3] = { 0, 0, 0 };
- const size_t region[3] = { ts*ne0/bs, ne1, 1 };
- err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
- return err;
+ const size_t region[3] = { row_size, ne1, 1 };
+ return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
  }
+ std::vector<cl_event> events;
+ if (ev && ne1>1) events.reserve(ne1-1);
  for (uint64_t i1 = 0; i1 < ne1; i1++) {
  // pretend the row is a matrix with cols=1
- const size_t buffer_origin[3] = { offset, i1, 0 };
+ const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
  const size_t host_origin[3] = { 0, 0, 0 };
- const size_t region[3] = { ts/bs, ne0, 1 };
- err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+ const size_t region[3] = { ts, ne0/bs, 1 };
+ // if an event is requested, make the last write wait for all previous writes to complete
+ if (ev && i1) {
+ events.push_back(*ev);
+ }
+ cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+ err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
  if (err != CL_SUCCESS) {
- break;
+ for (auto event : events) {
+ clReleaseEvent(event);
+ }
+ return err;
  }
  }
- return err;
+ for (auto event : events) {
+ CL_CHECK(clReleaseEvent(event));
+ }
+ return CL_SUCCESS;
  }

  static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
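
Note: the rewritten ggml_cl_h2d_tensor_2d above picks the cheapest upload path: a single clEnqueueWriteBuffer when the whole slice is contiguous, a single clEnqueueWriteBufferRect when only the rows are contiguous, and otherwise one rect write per element row, chaining events so the caller's event completes after the last write. A minimal sketch of how the rect-write pitches describe a strided 2-D copy (illustration only; the names are placeholders, not identifiers from this file):

#include <CL/cl.h>

// Illustration only: copy `rows` rows of `row_bytes` bytes each, spaced `host_stride`
// bytes apart on the host, into a tightly packed region of `dst` starting at `dst_offset`.
static cl_int copy_strided_rows(cl_command_queue queue, cl_mem dst, size_t dst_offset,
                                const void * src, size_t row_bytes, size_t rows,
                                size_t host_stride, cl_event * ev) {
    const size_t buffer_origin[3] = { dst_offset, 0, 0 };   // byte offset into the device buffer
    const size_t host_origin[3]   = { 0, 0, 0 };
    const size_t region[3]        = { row_bytes, rows, 1 }; // bytes per row, row count, one slice
    return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region,
                                    row_bytes,   // buffer_row_pitch: rows are packed on the device
                                    0,           // buffer_slice_pitch
                                    host_stride, // host_row_pitch: stride between source rows
                                    0,           // host_slice_pitch
                                    src, 0, NULL, ev);
}
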
@@ -1476,10 +1490,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
  const float alpha = 1.0f;
  const float beta = 0.0f;
  const int x_ne = ne01 * ne00;
@@ -1498,13 +1517,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ size_t x_offset = 0;
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy data to device
- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend == GGML_BACKEND_GPU) {
+ x_offset = (i03 * ne02 + i02) * x_ne;
+ } else if (i02 != pi02 || i03 != pi03) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ pi02 = i02;
+ pi03 = i03;
  }
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

  CL_CHECK(clFinish(queue));

@@ -1514,7 +1545,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  clblast::Transpose::kYes, clblast::Transpose::kNo,
  ne01, ne11, ne10,
  alpha,
- d_X, 0, ne00,
+ d_X, x_offset, ne00,
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
@@ -1525,7 +1556,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  }

  // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
  }
  }
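
Note: the restructured loops above iterate over src1's higher dimensions (ne12, ne13) and map each (i12, i13) back to a src0 slice through the broadcast ratios r2 = ne12/ne02 and r3 = ne13/ne03, re-uploading src0 only when the mapped slice changes (pi02/pi03 remember the previous one). A minimal CPU-side sketch of that mapping (illustration only; upload stands in for the host-to-device copy):

#include <stdint.h>

// Illustration only: broadcast-aware slice iteration, assuming ne12 % ne02 == 0 and
// ne13 % ne03 == 0. upload(i02, i03) represents copying that src0 slice to the device.
static void for_each_slice(int64_t ne02, int64_t ne03, int64_t ne12, int64_t ne13,
                           void (*upload)(int64_t i02, int64_t i03)) {
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;
    int64_t pi02 = -1;
    int64_t pi03 = -1;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;          // src0 slice that broadcasts to src1 slice i13
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;      // src0 slice that broadcasts to src1 slice i12
            if (i02 != pi02 || i03 != pi03) {  // upload only when the slice actually changed
                upload(i02, i03);
                pi02 = i02;
                pi03 = i03;
            }
            // ... the GEMM for slice pair (i02/i03, i12/i13) runs here ...
        }
    }
}
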
@@ -1547,6 +1578,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb10 = src1->nb[0];
  const int nb11 = src1->nb[1];
@@ -1556,6 +1589,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];

+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;
+
  const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
  const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
  const int x_ne = ne01 * ne00;
@@ -1577,32 +1613,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  bool src1_cont_rows = nb10 == sizeof(float);
  bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ size_t x_offset = 0;
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy src0 to device
- if (src0->backend != GGML_BACKEND_GPU) {
+ if (src0->backend == GGML_BACKEND_GPU) {
+ x_offset = (i03 * ne02 + i02) * x_ne;
+ } else if (i02 != pi02 || i03 != pi03) {
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ pi02 = i02;
+ pi03 = i03;
  }

  // convert src1 to fp16
  // TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
- char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+ char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
  if (src1_cont_rows) {
  if (src1_cont_cols) {
  ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
  }
  else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
  }
  }
  }
  else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- for (int64_t i00 = 0; i00 < ne10; i00++) {
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
  // very slow due to no inlining
- tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+ tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
  }
  }
  }
@@ -1618,7 +1666,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  clblast::Transpose::kYes, clblast::Transpose::kNo,
  ne01, ne11, ne10,
  alpha,
- d_X, 0, ne00,
+ d_X, x_offset, ne00,
  d_Y, 0, ne10,
  beta,
  d_D, 0, ne01,
@@ -1631,7 +1679,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  // copy dst to host, then convert to float
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

  ggml_fp16_to_fp32_row(tmp, d, d_ne);
  }
@@ -1652,18 +1700,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
+ const int64_t ne12 = src1->ne[2];
+ const int64_t ne13 = src1->ne[3];

  const int nb2 = dst->nb[2];
  const int nb3 = dst->nb[3];
  const ggml_type type = src0->type;
- const bool mul_mat_vec = ne11 == 1;
+ const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
+
+ const int64_t r2 = ne12 / ne02;
+ const int64_t r3 = ne13 / ne03;

  const float alpha = 1.0f;
  const float beta = 0.0f;
  const int x_ne = ne01 * ne00;
  const int y_ne = ne11 * ne10;
  const int d_ne = ne11 * ne01;
- const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+ const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+ const size_t q_sz = ggml_type_size(type) * x_bps;

  size_t x_size;
  size_t y_size;
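
Note: splitting the old q_sz expression into x_bps (quantized blocks per 2-D slice) and q_sz (bytes per slice) is what lets the later code form per-slice offsets such as (i03 * ne02 + i02) * x_bps for GPU-resident tensors. A small worked sketch, assuming a block size of 32 quants at 18 bytes per block (typical Q4_0-style values, used only for illustration):

#include <stddef.h>
#include <stdint.h>

// Illustration only: bytes occupied by one quantized 2-D slice.
static size_t slice_bytes(int64_t ne00, int64_t ne01, int64_t blck_size, size_t type_size) {
    const int64_t x_ne  = ne01 * ne00;       // elements per 2-D slice
    const int64_t x_bps = x_ne / blck_size;  // quantized blocks per slice
    return type_size * (size_t) x_bps;       // bytes per slice
}
// e.g. ne00 = ne01 = 4096, blck_size = 32, type_size = 18:
// x_bps = 524288 blocks and the slice occupies 9437184 bytes (= 9 MiB).
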
@@ -1685,17 +1739,28 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  GGML_ASSERT(to_fp32_cl != nullptr);

  const size_t global_denom = ggml_cl_global_denom(type);
- const size_t local = ggml_cl_local_size(type);
+ const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);

  size_t ev_idx = 0;
  std::vector<cl_event> events;

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
+ int64_t pi02 = -1;
+ int64_t pi03 = -1;
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ int64_t i03 = i13 / r3;
+
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ int64_t i02 = i12 / r2;
+
  // copy src0 to device if necessary
  if (src0->backend == GGML_BACKEND_CPU) {
- events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ if (i02 != pi02 || i03 != pi03) {
+ events.emplace_back();
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+ pi02 = i02;
+ pi03 = i03;
+ }
  } else if (src0->backend == GGML_BACKEND_GPU) {
  d_Q = (cl_mem) src0->extra;
  } else {
@@ -1704,11 +1769,11 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
  // copy src1 to device
  events.emplace_back();
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

  // compute
- const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
- const size_t local = CL_DMMV_BLOCK_SIZE;
+ const size_t global = ne01 * local;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  const cl_int ncols = ne00;
  events.emplace_back();
  CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1716,16 +1781,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
  CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
  CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
  } else { // general dequantization kernel + CLBlast matrix matrix multiplication
  // convert src0 to fp32 on device
  const size_t global = x_ne / global_denom;
+ const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
  CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
- CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

  // copy src1 to device
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

  events.emplace_back();

@@ -1749,7 +1815,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  }

  // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
  CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
  for (auto *event : events) {
  clReleaseEvent(event);
@@ -1844,17 +1910,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
  const int64_t ne3 = tensor->ne[3];

  const ggml_type type = tensor->type;
- const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+ const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+ const size_t q_sz = s_sz * (size_t) (ne2 * ne3);

  size_t q_size;
  cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

  tensor->data = data;
  // copy tensor to device
+ size_t offset = 0;
  for (int64_t i3 = 0; i3 < ne3; i3++) {
  for (int64_t i2 = 0; i2 < ne2; i2++) {
- int i = i3*ne2 + i2;
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+ offset += s_sz;
  }
  }