llama_cpp 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +122 -72
- data/ext/llama_cpp/src/ggml-metal.m +4 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +755 -320
- data/ext/llama_cpp/src/ggml.h +13 -0
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +779 -113
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +3 -2
data/ext/llama_cpp/src/ggml-opencl.cpp

@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8

 __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int n = tid / 32;
     const int l = tid - 32 * n;
     const int is = 8 * n + l / 16;

     const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy +
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;

     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
 __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
 {
     int r = get_local_id(0) / 4;
-    int i = get_group_id(0);
+    int i = get_group_id(0) + get_global_offset(0);
     int tid = r / 2;
     int is0 = r % 2;
     int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
     float d_all = vload_half(0, &x[i].d);
     float dl = d_all * (us - 32);

-    __global float *y = yy +
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
     const __global uint8_t *q = x[i].qs + 32 * n;
     const __global uint8_t *hm = x[i].hmask;

@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa

 __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 8;
     const int ir = tid % 8;
     const int is = 2 * il;
     const int n = 4;

-    __global float *y = yy +
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;

     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa

 __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 16;
     const int ir = tid % 16;
     const int is = 2 * il;

-    __global float *y = yy +
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;

     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa

 __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int ip = tid / 32;
     const int il = tid - 32 * ip;
     const int is = 8 * ip + il / 16;

-    __global float *y = yy +
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;

     const float d = vload_half(0, &x[i].d);

@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;

-    const int ib = i/qk; // block index
+    const int ib = i/qk + get_global_offset(0); // block index
     const int iqs = (i%qk)/qr; // quant index
     const int iybs = i - i%qk; // y block start index
     const int y_offset = qr == 1 ? 1 : qk/2;
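Note (illustrative, not part of the diff): the kernels above now add get_global_offset(0) to the group/block index, so the host can dequantize a single 2D slice out of a larger quantized buffer by enqueuing the kernel with a non-zero global work offset. A minimal host-side sketch of such a launch, reusing the variable names that appear in ggml_cl_mul_mat_q_f32 further down in this diff:

    // Hypothetical excerpt: start the to_fp32 dequantize kernel at the first block of
    // slice (i02, i03); the kernels read this start via get_global_offset(0).
    const size_t offset = (i03 * ne02 + i02) * x_bps; // block index of the slice start
    const size_t global = x_ne / global_denom;        // work-items covering one slice
    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
    // the 4th argument of clEnqueueNDRangeKernel is the global work offset
    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1,
             offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, 0, NULL, NULL));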
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     const enum ggml_type type = src->type;
     const size_t ts = ggml_type_size(type);
     const size_t bs = ggml_blck_size(type);
+    const uint64_t row_size = ts*ne0/bs;

-    const
-    if (nb0 == ts && nb1 ==
-
-        return err;
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == row_size) {
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
     }
     if (nb0 == ts) {
         const size_t buffer_origin[3] = { offset, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = {
-
-        return err;
+        const size_t region[3] = { row_size, ne1, 1 };
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
     }
+    std::vector<cl_event> events;
+    if (ev && ne1>1) events.reserve(ne1-1);
     for (uint64_t i1 = 0; i1 < ne1; i1++) {
         // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts
-
+        const size_t region[3] = { ts, ne0/bs, 1 };
+        // if an event is requested, make the last write wait for all previous writes to complete
+        if (ev && i1) {
+            events.push_back(*ev);
+        }
+        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
         if (err != CL_SUCCESS) {
-
+            for (auto event : events) {
+                clReleaseEvent(event);
+            }
+            return err;
         }
     }
-
+    for (auto event : events) {
+        CL_CHECK(clReleaseEvent(event));
+    }
+    return CL_SUCCESS;
 }

 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
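Note (illustrative, not part of the diff): with the rewritten strided path, ggml_cl_h2d_tensor_2d issues one clEnqueueWriteBufferRect per row but makes the final write wait on all earlier ones, so the single event handed back through ev completes only once the whole slice is on the device. A hedged caller-side sketch, assuming the CL_CHECK macro and queue already defined in ggml-opencl.cpp:

    // Hypothetical usage: one returned event is enough to synchronize the whole copy.
    cl_event ev_copy;
    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev_copy));
    CL_CHECK(clWaitForEvents(1, &ev_copy));  // every per-row write has finished here
    CL_CHECK(clReleaseEvent(ev_copy));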
@@ -1476,10 +1488,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];

     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1498,13 +1515,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

-
-
+    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
-            if (src0->backend
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

             CL_CHECK(clFinish(queue));

@@ -1514,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                 clblast::Transpose::kYes, clblast::Transpose::kNo,
                 ne01, ne11, ne10,
                 alpha,
-                d_X,
+                d_X, x_offset, ne00,
                 d_Y, 0, ne10,
                 beta,
                 d_D, 0, ne01,
@@ -1525,7 +1554,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
            }

            // copy dst to host
-           float * d = (float *) ((char *) dst->data +
+           float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
        }
    }
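Note (illustrative, not part of the diff): the new i12/i13 loops broadcast src0 over the batch dimensions of src1. Each src0 slice (i02, i03) serves r2 = ne12/ne02 consecutive src1 slices along dimension 2 and r3 = ne13/ne03 along dimension 3, and the pi02/pi03 trackers skip re-uploading a slice that is already on the device. A small self-contained sketch of the index mapping, with made-up batch sizes:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne02 = 2, ne03 = 1;  // hypothetical src0 batch dims
        const int64_t ne12 = 4, ne13 = 1;  // hypothetical src1 batch dims
        const int64_t r2 = ne12 / ne02;    // each src0 slice reused r2 times along dim 2
        const int64_t r3 = ne13 / ne03;    // and r3 times along dim 3
        for (int64_t i13 = 0; i13 < ne13; i13++) {
            for (int64_t i12 = 0; i12 < ne12; i12++) {
                std::printf("src1 slice (%lld,%lld) -> src0 slice (%lld,%lld)\n",
                            (long long) i12, (long long) i13,
                            (long long) (i12 / r2), (long long) (i13 / r3));
            }
        }
        return 0;
    }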
@@ -1547,6 +1576,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];

     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1587,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1611,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

-
-
+    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }

             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (
-            char * src1i = (char *) src1->data +
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
            if (src1_cont_rows) {
                if (src1_cont_cols) {
                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                }
                else {
-                    for (int64_t
-                        ggml_fp32_to_fp16_row((float *) (src1i +
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                    }
                }
            }
            else {
-                for (int64_t
-                    for (int64_t
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                        // very slow due to no inlining
-                        tmp[
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                    }
                }
            }
@@ -1618,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                 clblast::Transpose::kYes, clblast::Transpose::kNo,
                 ne01, ne11, ne10,
                 alpha,
-                d_X,
+                d_X, x_offset, ne00,
                 d_Y, 0, ne10,
                 beta,
                 d_D, 0, ne01,
@@ -1631,7 +1677,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
            // copy dst to host, then convert to float
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

-           float * d = (float *) ((char *) dst->data +
+           float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

            ggml_fp16_to_fp32_row(tmp, d, d_ne);
        }
@@ -1652,18 +1698,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];

     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;

+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
-    const
+    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+    const size_t q_sz = ggml_type_size(type) * x_bps;

     size_t x_size;
     size_t y_size;
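Note (illustrative, not part of the diff): x_bps counts the quantization blocks in one 2D slice of src0, and q_sz is the matching byte size used for the device buffer and, further down, for the per-slice offset into a prequantized GPU buffer. A worked example with made-up dimensions, assuming Q4_0 (32 weights per block, 18 bytes per block in ggml):

    #include <cstdio>

    int main() {
        const int ne00 = 4096, ne01 = 4096;   // hypothetical weight matrix
        const int blck = 32;                  // ggml_blck_size(GGML_TYPE_Q4_0)
        const size_t tsize = 18;              // ggml_type_size(GGML_TYPE_Q4_0): 2-byte scale + 16 packed bytes
        const int x_ne  = ne01 * ne00;
        const int x_bps = x_ne / blck;        // blocks per 2D slice: 524288
        const size_t q_sz = tsize * x_bps;    // bytes per slice: 9437184
        std::printf("x_bps = %d, q_sz = %zu bytes\n", x_bps, q_sz);
        return 0;
    }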
@@ -1690,12 +1742,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;

-
-
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-
-
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1767,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
|
1720
1783
|
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
1721
1784
|
// convert src0 to fp32 on device
|
1722
1785
|
const size_t global = x_ne / global_denom;
|
1786
|
+
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
1723
1787
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
1724
1788
|
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
1725
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
1789
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
1726
1790
|
|
1727
1791
|
// copy src1 to device
|
1728
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1,
|
1792
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
1729
1793
|
|
1730
1794
|
events.emplace_back();
|
1731
1795
|
|
@@ -1749,7 +1813,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }

             // copy dst to host
-            float * d = (float *) ((char *) dst->data +
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
@@ -1844,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne3 = tensor->ne[3];

     const ggml_type type = tensor->type;
-    const size_t
+    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);

     size_t q_size;
     cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

     tensor->data = data;
     // copy tensor to device
+    size_t offset = 0;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-
-
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+            offset += s_sz;
         }
     }
