llama_cpp 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
 
 __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int n = tid / 32;
     const int l = tid - 32 * n;
     const int is = 8 * n + l / 16;
 
     const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy + i * QK_K + 128 * n;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
 __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
 {
     int r = get_local_id(0) / 4;
-    int i = get_group_id(0);
+    int i = get_group_id(0) + get_global_offset(0);
     int tid = r / 2;
     int is0 = r % 2;
     int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
     float d_all = vload_half(0, &x[i].d);
     float dl = d_all * (us - 32);
 
-    __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
     const __global uint8_t *q = x[i].qs + 32 * n;
     const __global uint8_t *hm = x[i].hmask;
 
@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
 
 __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 8;
     const int ir = tid % 8;
     const int is = 2 * il;
     const int n = 4;
 
-    __global float *y = yy + i * QK_K + 64 * il + n * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
 
 __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 16;
     const int ir = tid % 16;
     const int is = 2 * il;
 
-    __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
 
 __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int ip = tid / 32;
     const int il = tid - 32 * ip;
     const int is = 8 * ip + il / 16;
 
-    __global float *y = yy + i * QK_K + 128 * ip + il;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
 
     const float d = vload_half(0, &x[i].d);
 
@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
-    const int ib = i/qk; // block index
+    const int ib = i/qk + get_global_offset(0); // block index
     const int iqs = (i%qk)/qr; // quant index
     const int iybs = i - i%qk; // y block start index
     const int y_offset = qr == 1 ? 1 : qk/2;
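
Note: the kernel-side changes above all follow one pattern: the index used to read the quantized input x now adds get_global_offset(0), while the output pointer y is still derived from get_group_id(0) alone. get_global_offset(0) returns the global work offset passed as the fourth argument of clEnqueueNDRangeKernel, so the host can point a launch at one 2D slice of a larger GPU-resident quantized buffer while the dequantized floats go into a buffer holding only that slice; the host-side counterpart appears further down in this diff, in ggml_cl_mul_mat_q_f32. The following is only a minimal host-side sketch of that pattern; the function, kernel and buffer names are illustrative, not taken from the diff.

// Sketch only: launch a 1-D dequantize-style kernel over one slice of a
// larger quantized buffer by passing a non-zero global work offset.
// queue/kernel/buffer setup is assumed to exist elsewhere.
#include <CL/cl.h>

static cl_int dequantize_one_slice(cl_command_queue queue, cl_kernel kernel,
                                   cl_mem d_q, cl_mem d_x,
                                   size_t blocks_per_slice, size_t slice_index,
                                   size_t global_size, size_t local_size) {
    cl_int err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_q);
    if (err != CL_SUCCESS) return err;
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_x);
    if (err != CL_SUCCESS) return err;

    // The kernel can read this value back with get_global_offset(0) and use it
    // to shift the indices it reads from d_q, while its writes to d_x start at 0.
    const size_t offset = slice_index * blocks_per_slice;

    return clEnqueueNDRangeKernel(queue, kernel, 1,
                                  offset > 0 ? &offset : NULL,
                                  &global_size,
                                  local_size > 0 ? &local_size : NULL,
                                  0, NULL, NULL);
}

A NULL global_work_offset is equivalent to an offset of zero, which is why the host code in this diff only passes &offset when the offset is non-zero.
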
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     const enum ggml_type type = src->type;
     const size_t ts = ggml_type_size(type);
     const size_t bs = ggml_blck_size(type);
+    const uint64_t row_size = ts*ne0/bs;
 
-    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
-        return err;
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == row_size) {
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
     }
     if (nb0 == ts) {
         const size_t buffer_origin[3] = { offset, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts*ne0/bs, ne1, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
-        return err;
+        const size_t region[3] = { row_size, ne1, 1 };
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
     }
+    std::vector<cl_event> events;
+    if (ev && ne1>1) events.reserve(ne1-1);
     for (uint64_t i1 = 0; i1 < ne1; i1++) {
         // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset, i1, 0 };
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts/bs, ne0, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+        const size_t region[3] = { ts, ne0/bs, 1 };
+        // if an event is requested, make the last write wait for all previous writes to complete
+        if (ev && i1) {
+            events.push_back(*ev);
+        }
+        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
         if (err != CL_SUCCESS) {
-            break;
+            for (auto event : events) {
+                clReleaseEvent(event);
+            }
+            return err;
         }
     }
-    return err;
+    for (auto event : events) {
+        CL_CHECK(clReleaseEvent(event));
+    }
+    return CL_SUCCESS;
 }
 
 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
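
Note: the rewritten fallback path of ggml_cl_h2d_tensor_2d copies the tensor row by row with clEnqueueWriteBufferRect and, when the caller asked for an event, makes the last write wait on the events produced by all earlier writes; the single event left in ev therefore signals completion of the whole copy, and the intermediate events are released on both the error and the success path. Below is a condensed sketch of that chaining idea using plain clEnqueueWriteBuffer; the layout parameters are illustrative, not the diff's.

// Sketch only: enqueue several asynchronous writes and hand back one event
// that completes only after all of them, mirroring the pattern in the diff.
#include <CL/cl.h>
#include <vector>

static cl_int write_rows(cl_command_queue queue, cl_mem dst, const char * src,
                         size_t nrows, size_t row_bytes, size_t src_stride,
                         cl_event * ev) {
    std::vector<cl_event> events;
    if (ev && nrows > 1) events.reserve(nrows - 1);

    cl_int err = CL_SUCCESS;
    for (size_t i = 0; i < nrows; i++) {
        if (ev && i > 0) {
            events.push_back(*ev); // event of the previous write
        }
        // only the final write gets the accumulated wait list
        const cl_uint nevents = (i == nrows - 1) ? (cl_uint) events.size() : 0;
        err = clEnqueueWriteBuffer(queue, dst, CL_FALSE,
                                   i * row_bytes, row_bytes, src + i * src_stride,
                                   nevents, nevents ? events.data() : NULL, ev);
        if (err != CL_SUCCESS) break;
    }
    for (cl_event e : events) {
        clReleaseEvent(e); // drop intermediate events; the caller owns *ev
    }
    return err;
}
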
@@ -1476,10 +1488,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1498,13 +1515,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
             CL_CHECK(clFinish(queue));
 
@@ -1514,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                 clblast::Transpose::kYes, clblast::Transpose::kNo,
                 ne01, ne11, ne10,
                 alpha,
-                d_X, 0, ne00,
+                d_X, x_offset, ne00,
                 d_Y, 0, ne10,
                 beta,
                 d_D, 0, ne01,
@@ -1525,7 +1554,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
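
Note: the loops in ggml_cl_mul_mat_f32 (and, below, the f16 and quantized paths) now walk src1's third and fourth dimensions and map every src1 slice (i12, i13) back to a src0 slice via i02 = i12 / r2 and i03 = i13 / r3, with r2 = ne12/ne02 and r3 = ne13/ne03; this is how src0 is broadcast when src1 carries more batch slices than src0. The pi02/pi03 bookkeeping then skips re-uploading src0 when consecutive iterations land on the same slice. A small, self-contained illustration of the mapping with made-up slice counts:

// Sketch only: the broadcast index mapping used by the new loops, with
// made-up sizes (ne02 = 2 src0 slices broadcast over ne12 = 6 src1 slices).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 1; // src0 slices (illustrative)
    const int64_t ne12 = 6, ne13 = 1; // src1 slices (illustrative)
    const int64_t r2 = ne12 / ne02;   // = 3
    const int64_t r3 = ne13 / ne03;   // = 1

    int64_t pi02 = -1, pi03 = -1;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;
            const bool upload = (i02 != pi02 || i03 != pi03); // same check as the diff
            std::printf("src1 slice (%lld,%lld) -> src0 slice (%lld,%lld)%s\n",
                        (long long) i12, (long long) i13,
                        (long long) i02, (long long) i03,
                        upload ? "  [upload src0]" : "");
            if (upload) { pi02 = i02; pi03 = i03; }
        }
    }
    return 0;
}

With these sizes src0 is uploaded only twice (once for i02 = 0 and once for i02 = 1) instead of once per src1 slice.
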
@@ -1547,6 +1576,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1587,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1611,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1618,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                 clblast::Transpose::kYes, clblast::Transpose::kNo,
                 ne01, ne11, ne10,
                 alpha,
-                d_X, 0, ne00,
+                d_X, x_offset, ne00,
                 d_Y, 0, ne10,
                 beta,
                 d_D, 0, ne01,
@@ -1631,7 +1677,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
@@ -1652,18 +1698,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
-    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+    const size_t q_sz = ggml_type_size(type) * x_bps;
 
     size_t x_size;
     size_t y_size;
@@ -1690,12 +1742,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1767,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1720,12 +1783,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne / global_denom;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1749,7 +1813,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
@@ -1844,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne3 = tensor->ne[3];
 
     const ggml_type type = tensor->type;
-    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
 
     size_t q_size;
     cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
     tensor->data = data;
     // copy tensor to device
+    size_t offset = 0;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-            int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+            offset += s_sz;
         }
     }
 
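
Note: ggml_cl_transform_tensor now derives the device buffer size from a per-slice byte count s_sz (one ne0 × ne1 plane of quantized blocks) and advances the destination offset by s_sz per 2D slice, so the offset handed to ggml_cl_h2d_tensor_2d is always a byte offset rather than the old element-based i*ne0*ne1 value. A small worked example of the arithmetic, assuming Q4_0's layout (32 elements per 18-byte block) and a made-up shape:

// Sketch only: per-slice byte sizes for a quantized tensor, following the
// s_sz / q_sz formulas from the diff. The constants assume Q4_0
// (32 elements per 18-byte block); the shape is illustrative.
#include <cstdint>
#include <cstdio>

int main() {
    const size_t  type_size = 18;  // bytes per Q4_0 block
    const int64_t blck_size = 32;  // elements per Q4_0 block
    const int64_t ne0 = 4096, ne1 = 4096, ne2 = 8, ne3 = 1;

    const size_t s_sz = type_size * (size_t) (ne0 * ne1 / blck_size); // one 2D slice
    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);                  // whole tensor

    size_t offset = 0;
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            std::printf("slice (%lld,%lld) -> byte offset %zu\n",
                        (long long) i2, (long long) i3, offset);
            offset += s_sz;
        }
    }
    std::printf("s_sz = %zu bytes per slice, q_sz = %zu bytes total\n", s_sz, q_sz);
    return 0;
}

For this shape each 4096 x 4096 slice occupies 18 * 524288 = 9437184 bytes, and the eight slices are packed back to back in the device buffer.
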