llama_cpp 0.6.0 → 0.7.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +122 -72
- data/ext/llama_cpp/src/ggml-metal.m +4 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +755 -320
- data/ext/llama_cpp/src/ggml.h +13 -0
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +779 -113
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-opencl.cpp

@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
 
 __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int n = tid / 32;
     const int l = tid - 32 * n;
     const int is = 8 * n + l / 16;
 
     const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy + i * QK_K + 128 * n;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
 __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
 {
     int r = get_local_id(0) / 4;
-    int i = get_group_id(0);
+    int i = get_group_id(0) + get_global_offset(0);
     int tid = r / 2;
     int is0 = r % 2;
     int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
     float d_all = vload_half(0, &x[i].d);
     float dl = d_all * (us - 32);
 
-    __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
     const __global uint8_t *q = x[i].qs + 32 * n;
     const __global uint8_t *hm = x[i].hmask;
 
@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
 
 __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 8;
     const int ir = tid % 8;
     const int is = 2 * il;
     const int n = 4;
 
-    __global float *y = yy + i * QK_K + 64 * il + n * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
 
 __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 16;
     const int ir = tid % 16;
     const int is = 2 * il;
 
-    __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
 
 __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int ip = tid / 32;
     const int il = tid - 32 * ip;
     const int is = 8 * ip + il / 16;
 
-    __global float *y = yy + i * QK_K + 128 * ip + il;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
 
     const float d = vload_half(0, &x[i].d);
 
@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
-    const int ib = i/qk; // block index
+    const int ib = i/qk + get_global_offset(0); // block index
     const int iqs = (i%qk)/qr; // quant index
     const int iybs = i - i%qk; // y block start index
     const int y_offset = qr == 1 ? 1 : qk/2;
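All of the dequantize kernels above now read their block index as get_group_id(0) + get_global_offset(0) while still writing their output relative to get_group_id(0) alone, so the host can dequantize one 2D slice of a larger resident tensor simply by enqueuing the kernel with a non-zero global work offset. The following is a minimal, stand-alone C++ model of that index mapping, not the actual OpenCL host code; blocks_per_slice and slice are made-up names used only for illustration.

    #include <cstddef>
    #include <cstdio>

    // Model of the new kernel indexing: the input block index carries the global
    // work offset, the output index does not, so a launch with a non-zero
    // global_work_offset dequantizes one slice of x into a slice-sized y buffer.
    int main() {
        const std::size_t blocks_per_slice = 4;  // illustrative: blocks in one 2D slice
        const std::size_t slice            = 2;  // illustrative: which slice to dequantize
        const std::size_t global_offset    = slice * blocks_per_slice;

        for (std::size_t group_id = 0; group_id < blocks_per_slice; group_id++) {
            const std::size_t src_block = group_id + global_offset; // where x[i] is read
            const std::size_t dst_block = group_id;                 // where y is written
            std::printf("work-group %zu reads block %zu, writes output block %zu\n",
                        group_id, src_block, dst_block);
        }
        return 0;
    }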
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     const enum ggml_type type = src->type;
     const size_t ts = ggml_type_size(type);
     const size_t bs = ggml_blck_size(type);
+    const uint64_t row_size = ts*ne0/bs;
 
-    const
-    if (nb0 == ts && nb1 ==
-
-        return err;
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == row_size) {
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
     }
     if (nb0 == ts) {
         const size_t buffer_origin[3] = { offset, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = {
-
-        return err;
+        const size_t region[3] = { row_size, ne1, 1 };
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
     }
+    std::vector<cl_event> events;
+    if (ev && ne1>1) events.reserve(ne1-1);
     for (uint64_t i1 = 0; i1 < ne1; i1++) {
         // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts
-
+        const size_t region[3] = { ts, ne0/bs, 1 };
+        // if an event is requested, make the last write wait for all previous writes to complete
+        if (ev && i1) {
+            events.push_back(*ev);
+        }
+        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
         if (err != CL_SUCCESS) {
-
+            for (auto event : events) {
+                clReleaseEvent(event);
+            }
+            return err;
         }
     }
-
+    for (auto event : events) {
+        CL_CHECK(clReleaseEvent(event));
+    }
+    return CL_SUCCESS;
 }
 
 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
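The rewritten upload path leans on clEnqueueWriteBufferRect, which copies a 2D region with independent host and buffer row pitches, to gather strided host rows into a packed device buffer. Below is a plain C++ model of that pitch arithmetic with no OpenCL involved; the byte sizes and pitches are invented purely to make the gather visible.

    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Model of a 2D "rect" copy: width bytes per row, height rows, with separate
    // source (host) and destination (buffer) row pitches, as the driver does it.
    static void rect_copy(char *dst, std::size_t dst_pitch,
                          const char *src, std::size_t src_pitch,
                          std::size_t width, std::size_t height) {
        for (std::size_t row = 0; row < height; row++) {
            std::memcpy(dst + row*dst_pitch, src + row*src_pitch, width);
        }
    }

    int main() {
        // Illustrative layout: 4 "blocks" of 2 bytes each, spaced 3 bytes apart on
        // the host side (pitch 3), packed tightly on the device side (pitch 2).
        const std::size_t width = 2, height = 4, src_pitch = 3, dst_pitch = 2;
        std::vector<char> host(src_pitch*height), device(dst_pitch*height);
        for (std::size_t i = 0; i < host.size(); i++) host[i] = (char)('a' + i);

        rect_copy(device.data(), dst_pitch, host.data(), src_pitch, width, height);

        for (char c : device) std::printf("%c", c); // prints the gathered bytes: abdeghjk
        std::printf("\n");
        return 0;
    }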
@@ -1476,10 +1488,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1498,13 +1515,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
-
-
+    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
-            if (src0->backend
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
             CL_CHECK(clFinish(queue));
 
@@ -1514,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                                    clblast::Transpose::kYes, clblast::Transpose::kNo,
                                    ne01, ne11, ne10,
                                    alpha,
-                                   d_X,
+                                   d_X, x_offset, ne00,
                                    d_Y, 0, ne10,
                                    beta,
                                    d_D, 0, ne01,
@@ -1525,7 +1554,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data +
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
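The two new outer loops walk src1's batch dimensions and map them back onto src0 with the broadcast ratios r2 = ne12/ne02 and r3 = ne13/ne03, while pi02/pi03 remember the last src0 slice sent to the device so an unchanged slice is not re-uploaded. A small stand-alone sketch of that mapping follows; the shapes are made up for illustration only.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Illustrative shapes: src0 has 2 slices in dim 2, src1 has 8.
        const int64_t ne02 = 2, ne03 = 1;   // src0 batch dims
        const int64_t ne12 = 8, ne13 = 1;   // src1 batch dims
        const int64_t r2 = ne12 / ne02;     // broadcast ratio in dim 2
        const int64_t r3 = ne13 / ne03;     // broadcast ratio in dim 3

        int64_t pi02 = -1, pi03 = -1;       // last src0 slice copied to the device
        for (int64_t i13 = 0; i13 < ne13; i13++) {
            const int64_t i03 = i13 / r3;
            for (int64_t i12 = 0; i12 < ne12; i12++) {
                const int64_t i02 = i12 / r2;
                const bool upload = (i02 != pi02 || i03 != pi03);
                std::printf("src1 slice (%lld,%lld) -> src0 slice (%lld,%lld)%s\n",
                            (long long)i13, (long long)i12, (long long)i03, (long long)i02,
                            upload ? "  [upload src0 slice]" : "");
                if (upload) { pi02 = i02; pi03 = i03; }
            }
        }
        return 0;
    }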
@@ -1547,6 +1576,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1587,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1611,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
-
-
+    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (
-            char * src1i = (char *) src1->data +
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t
-                        ggml_fp32_to_fp16_row((float *) (src1i +
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t
-                    for (int64_t
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1618,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                                    clblast::Transpose::kYes, clblast::Transpose::kNo,
                                    ne01, ne11, ne10,
                                    alpha,
-                                   d_X,
+                                   d_X, x_offset, ne00,
                                    d_Y, 0, ne10,
                                    beta,
                                    d_D, 0, ne01,
@@ -1631,7 +1677,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data +
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
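In the fp16 path, each (i13, i12) slice of src1 gets its own staging area inside wdata at element offset (ne11*ne10)*(i13*ne12 + i12); the contiguity flags only decide whether the fp32-to-fp16 conversion can run row by row or has to go element by element. A stand-alone sketch of the staging-offset arithmetic is below; all sizes are invented for illustration.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Illustrative src1 shape: ne10 x ne11 per slice, ne12 x ne13 slices.
        const int64_t ne10 = 4096, ne11 = 32, ne12 = 8, ne13 = 1;
        const int64_t slice_elems = ne11 * ne10;   // fp16 elements staged per slice

        for (int64_t i13 = 0; i13 < ne13; i13++) {
            for (int64_t i12 = 0; i12 < ne12; i12++) {
                // Offset of this slice's staging area inside wdata, in fp16 elements.
                const int64_t tmp_offset = slice_elems * (i13 * ne12 + i12);
                std::printf("slice (%lld,%lld): tmp = wdata + %lld\n",
                            (long long)i13, (long long)i12, (long long)tmp_offset);
            }
        }
        // Total staging space needed: slice_elems * ne12 * ne13 fp16 values.
        std::printf("total staging elements: %lld\n",
                    (long long)(slice_elems * ne12 * ne13));
        return 0;
    }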
@@ -1652,18 +1698,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
-    const
+    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+    const size_t q_sz = ggml_type_size(type) * x_bps;
 
     size_t x_size;
     size_t y_size;
@@ -1690,12 +1742,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-
-
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-
-
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1767,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1720,12 +1783,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne / global_denom;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1749,7 +1813,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data +
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
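When the quantized weights already live on the GPU, the whole tensor sits in a single buffer, so the dequantization kernel is launched with a global work offset of (i03*ne02 + i02)*x_bps blocks to pick out the current 2D slice; this is the offset the kernels above consume through get_global_offset(0). A small sketch of that offset arithmetic follows, with made-up shapes; the block size of 256 assumes the default k-quant QK_K value.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Illustrative weight tensor: ne00 x ne01 per slice, ne02 x ne03 slices,
        // quantized in blocks of 256 values (the default k-quant block size).
        const int64_t ne00 = 4096, ne01 = 4096, ne02 = 2, ne03 = 1;
        const int64_t blck  = 256;
        const int64_t x_ne  = ne01 * ne00;   // values per 2D slice
        const int64_t x_bps = x_ne / blck;   // blocks per 2D slice

        for (int64_t i03 = 0; i03 < ne03; i03++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                // Global work offset (in blocks) handed to the dequantize kernel
                // when the whole tensor is resident on the device.
                const int64_t offset = (i03 * ne02 + i02) * x_bps;
                std::printf("slice (%lld,%lld): kernel offset = %lld blocks\n",
                            (long long)i03, (long long)i02, (long long)offset);
            }
        }
        return 0;
    }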
@@ -1844,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne3 = tensor->ne[3];
 
     const ggml_type type = tensor->type;
-    const size_t
+    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
 
     size_t q_size;
     cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
     tensor->data = data;
     // copy tensor to device
+    size_t offset = 0;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-
-
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+            offset += s_sz;
         }
     }
 
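ggml_cl_transform_tensor now sizes the device buffer from a per-slice byte count s_sz and advances a running byte offset as it uploads one 2D slice per (i3, i2) pair. A quick worked example of that sizing is sketched below; the tensor shape is made up, and the 256-values/144-bytes figures assume the usual Q4_K block layout.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Illustrative tensor: 4096 x 4096 x 2 x 1, quantized as Q4_K
        // (assumed layout: blocks of 256 values, 144 bytes per block).
        const int64_t ne0 = 4096, ne1 = 4096, ne2 = 2, ne3 = 1;
        const size_t  type_size = 144;   // bytes per block (assumption)
        const int64_t blck_size = 256;   // values per block (assumption)

        const size_t s_sz = type_size * (size_t)(ne0 * ne1 / blck_size); // bytes per 2D slice
        const size_t q_sz = s_sz * (size_t)(ne2 * ne3);                  // whole tensor on the device

        std::printf("per-slice size: %zu bytes, total buffer: %zu bytes\n", s_sz, q_sz);

        // Upload loop mirrors the new code: one 2D slice at a time, at a running offset.
        size_t offset = 0;
        for (int64_t i3 = 0; i3 < ne3; i3++) {
            for (int64_t i2 = 0; i2 < ne2; i2++) {
                std::printf("slice (%lld,%lld) written at byte offset %zu\n",
                            (long long)i3, (long long)i2, offset);
                offset += s_sz;
            }
        }
        return 0;
    }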