llama_cpp 0.6.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +622 -150
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +358 -131
- data/ext/llama_cpp/src/ggml-metal.metal +137 -47
- data/ext/llama_cpp/src/ggml-opencl.cpp +136 -68
- data/ext/llama_cpp/src/ggml.c +812 -365
- data/ext/llama_cpp/src/ggml.h +25 -7
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +2387 -421
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +5 -2
The hunks below are all from data/ext/llama_cpp/src/ggml-opencl.cpp (+136 -68); the other files listed above are not shown in this excerpt.

@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32
 
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
 
 __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int n = tid / 32;
     const int l = tid - 32 * n;
     const int is = 8 * n + l / 16;
 
     const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy + i * QK_K + 128 * n;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
 
 __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
 {
     int r = get_local_id(0) / 4;
-    int i = get_group_id(0);
+    int i = get_group_id(0) + get_global_offset(0);
     int tid = r / 2;
     int is0 = r % 2;
     int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
     float d_all = vload_half(0, &x[i].d);
     float dl = d_all * (us - 32);
 
-    __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
     const __global uint8_t *q = x[i].qs + 32 * n;
     const __global uint8_t *hm = x[i].hmask;
 
@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
 
 __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 8;
     const int ir = tid % 8;
     const int is = 2 * il;
     const int n = 4;
 
-    __global float *y = yy + i * QK_K + 64 * il + n * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
 
 __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 16;
     const int ir = tid % 16;
     const int is = 2 * il;
 
-    __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
 
 __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int ip = tid / 32;
     const int il = tid - 32 * ip;
     const int is = 8 * ip + il / 16;
 
-    __global float *y = yy + i * QK_K + 128 * ip + il;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
 
     const float d = vload_half(0, &x[i].d);
 
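A note on the pattern above: every dequantize_block_* kernel now computes its source block index as get_group_id(0) + get_global_offset(0), while the destination pointer keeps using the plain get_group_id(0). The host can therefore pass the starting block of the 2-D slice it wants processed as the global work offset and have the kernel read from the middle of a GPU-resident tensor while still writing a compact, launch-relative output. A minimal OpenCL C sketch of that indexing convention (the toy block layout and kernel name are illustrative, not the gem's code):

```c
// Illustrative block layout: one scale plus 32 quantized bytes.
typedef struct {
    float d;
    uchar qs[32];
} toy_block;

// One work-group of 32 threads dequantizes one block. With a non-zero global
// work offset, get_global_offset(0) shifts which source block is read, while
// the destination index stays relative to the launch, so y only has to cover
// the blocks this launch actually processes.
__kernel void dequantize_toy(__global const toy_block * x, __global float * y) {
    const int i   = get_group_id(0) + get_global_offset(0); // absolute source block
    const int tid = get_local_id(0);                        // 0..31

    __global float * dst = y + get_group_id(0) * 32;        // launch-relative output
    dst[tid] = x[i].d * (float) x[i].qs[tid];
}
```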
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q2_K * x = xx + ib0;
 
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q3_K * x = xx + ib0;
 
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
     const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/2; // 0...15
     const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q6_K * x = xx + ib0;
 
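The dequantize_mul_mat_vec_*_K hunks use the same channel for a block offset: ib0 now starts get_global_offset(0) blocks into the quantized tensor, so a GPU-resident matrix can be processed one 2-D slice at a time without re-uploading it. A hedged sketch of the host-side launch this implies (function and parameter names are illustrative; the gem's actual launch code appears further down in this diff):

```c
#include <CL/cl.h>
#include <stddef.h>

// Enqueue one work-group of `local_size` threads per output row. The starting
// block index of the slice being processed is smuggled in as the global work
// offset; inside the kernel it comes back out of get_global_offset(0) and is
// added to ib0. Work-group IDs themselves are not shifted by the offset.
static cl_int enqueue_dmmv_slice(cl_command_queue queue, cl_kernel dmmv_kernel,
                                 size_t n_rows, size_t local_size,
                                 size_t slice_block_offset) {
    const size_t offset = slice_block_offset;
    const size_t global = n_rows * local_size;
    return clEnqueueNDRangeKernel(queue, dmmv_kernel, 1,
                                  &offset, &global, &local_size,
                                  0, NULL, NULL);
}
```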
@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
-    const int ib = i/qk; // block index
+    const int ib = i/qk + get_global_offset(0); // block index
     const int iqs = (i%qk)/qr; // quant index
     const int iybs = i - i%qk; // y block start index
     const int y_offset = qr == 1 ? 1 : qk/2;
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
     const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
+    const int col_step = local_size * 2;
     const int y_offset = qr == 1 ? 1 : qk/2;
 
+    x += get_global_offset(0);
+
     tmp[tid] = 0;
 
-    for (int …
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
         const int ib = (row*ncols + col)/qk; // block index
         const int iqs = (col%qk)/qr; // quant index
         const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
     // sum up partial sums and write back result
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }
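In the generic dequant_mul_mat_vec template the compile-time block size is gone: the work-group size is read at run time with get_local_size(0), the column loop strides by local_size * 2, the quantized pointer is advanced by get_global_offset(0) up front, and the per-thread partial sums are folded with the usual tree reduction in local memory. A self-contained OpenCL C sketch of that reduction pattern (kernel and argument names are illustrative, assuming a power-of-two work-group size):

```c
// Each thread accumulates a strided partial sum into local memory, then the
// active thread count is halved every step until tmp[0] holds the total.
__kernel void partial_sum_reduce(__global const float * x,
                                 __local  float       * tmp,
                                 __global float       * out,
                                 const int n) {
    const int local_size = get_local_size(0);
    const int tid        = get_local_id(0);

    float acc = 0.0f;
    for (int i = tid; i < n; i += local_size) {
        acc += x[i];
    }
    tmp[tid] = acc;

    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s = local_size / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0) {
        out[get_group_id(0)] = tmp[0];
    }
}
```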
@@ -1349,30 +1351,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     const enum ggml_type type = src->type;
     const size_t ts = ggml_type_size(type);
     const size_t bs = ggml_blck_size(type);
+    const uint64_t row_size = ts*ne0/bs;
 
-    const …
-    if (nb0 == ts && nb1 == …
-    …
-        return err;
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == row_size) {
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
     }
     if (nb0 == ts) {
         const size_t buffer_origin[3] = { offset, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { …
-        …
-        return err;
+        const size_t region[3] = { row_size, ne1, 1 };
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
     }
+    std::vector<cl_event> events;
+    if (ev && ne1>1) events.reserve(ne1-1);
     for (uint64_t i1 = 0; i1 < ne1; i1++) {
         // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset …
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts …
-        …
+        const size_t region[3] = { ts, ne0/bs, 1 };
+        // if an event is requested, make the last write wait for all previous writes to complete
+        if (ev && i1) {
+            events.push_back(*ev);
+        }
+        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
         if (err != CL_SUCCESS) {
-            …
+            for (auto event : events) {
+                clReleaseEvent(event);
+            }
+            return err;
         }
     }
-    …
+    for (auto event : events) {
+        CL_CHECK(clReleaseEvent(event));
+    }
+    return CL_SUCCESS;
 }
 
 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
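ggml_cl_h2d_tensor_2d now precomputes row_size, returns directly from the non-blocking clEnqueueWriteBuffer / clEnqueueWriteBufferRect fast paths, and in the per-row fallback chains the intermediate events so that the single event handed back to the caller only fires after every row has been written. A hedged host-side sketch of the rectangular-copy idea for a pitched source (names are illustrative and error handling is trimmed):

```c
#include <CL/cl.h>

// Upload `n_rows` rows of `row_bytes` bytes each. On the device the rows are
// packed back to back starting at `dst_offset`; on the host they are separated
// by `src_row_pitch` bytes. A single rectangular write covers the whole slice
// and the returned event signals its completion.
static cl_int upload_strided_rows(cl_command_queue queue, cl_mem dst, size_t dst_offset,
                                  const char * src, size_t row_bytes, size_t n_rows,
                                  size_t src_row_pitch, cl_event * ev) {
    if (src_row_pitch == row_bytes) {
        // fully contiguous source: one plain write is enough
        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, dst_offset,
                                    n_rows * row_bytes, src, 0, NULL, ev);
    }
    const size_t buffer_origin[3] = { dst_offset, 0, 0 };
    const size_t host_origin[3]   = { 0, 0, 0 };
    const size_t region[3]        = { row_bytes, n_rows, 1 };
    return clEnqueueWriteBufferRect(queue, dst, CL_FALSE,
                                    buffer_origin, host_origin, region,
                                    row_bytes, 0,       /* device rows are packed */
                                    src_row_pitch, 0,   /* host rows are pitched  */
                                    src, 0, NULL, ev);
}
```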
@@ -1476,10 +1490,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1498,13 +1517,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
-            if (src0->backend …
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
             CL_CHECK(clFinish(queue));
 
@@ -1514,7 +1545,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                 clblast::Transpose::kYes, clblast::Transpose::kNo,
                 ne01, ne11, ne10,
                 alpha,
-                d_X, 0, ne00,
+                d_X, x_offset, ne00,
                 d_Y, 0, ne10,
                 beta,
                 d_D, 0, ne01,
@@ -1525,7 +1556,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
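The f32 matrix-multiply path now walks src1's third and fourth dimensions and maps each (i12, i13) pair back to a src0 slice through the broadcast ratios r2 = ne12/ne02 and r3 = ne13/ne03, re-uploading src0 only when that slice actually changes (pi02/pi03 remember the last one, and x_offset selects the slice directly when src0 is already on the GPU). A stripped-down C sketch of the loop bookkeeping, with the device calls hidden behind a hypothetical upload_slice callback:

```c
#include <stdint.h>

// `upload_slice` stands in for the real host-to-device copy of one src0 slice;
// only the broadcast index arithmetic and the re-upload cache are shown.
typedef void (*upload_slice_fn)(int64_t i03, int64_t i02, void * ctx);

static void for_each_broadcast_slice(int64_t ne02, int64_t ne03,
                                     int64_t ne12, int64_t ne13,
                                     upload_slice_fn upload_slice, void * ctx) {
    const int64_t r2 = ne12 / ne02;  // src1 slices sharing one src0 slice along dim 2
    const int64_t r3 = ne13 / ne03;  // same along dim 3

    int64_t pi02 = -1;               // last uploaded src0 slice
    int64_t pi03 = -1;

    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;

        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;

            if (i02 != pi02 || i03 != pi03) {
                upload_slice(i03, i02, ctx);  // src0 slice changed: copy it once
                pi02 = i02;
                pi03 = i03;
            }
            // ... per-(i12, i13) work: copy the src1 slice, run the GEMM, read dst ...
        }
    }
}
```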
@@ -1547,6 +1578,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1589,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1613,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    size_t x_offset = 0;
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend …
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t …
-                        ggml_fp32_to_fp16_row((float *) (src1i + …
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t …
-                    for (int64_t …
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[…
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1618,7 +1666,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                 clblast::Transpose::kYes, clblast::Transpose::kNo,
                 ne01, ne11, ne10,
                 alpha,
-                d_X, 0, ne00,
+                d_X, x_offset, ne00,
                 d_Y, 0, ne10,
                 beta,
                 d_D, 0, ne01,
@@ -1631,7 +1679,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
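The fp16 path applies the same broadcast loop and additionally keeps the three src1 layouts apart: fully contiguous, contiguous rows with a row pitch, and fully strided, gathering each slice into a packed scratch buffer before upload. A hedged C sketch of that stride handling using plain floats (the real code converts to ggml's fp16 as it copies; names are illustrative):

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

// Gather an ne10 x ne11 slice whose element stride is nb10 bytes and whose row
// stride is nb11 bytes into a tightly packed destination buffer.
static void gather_slice_f32(const char * src, float * dst,
                             int64_t ne10, int64_t ne11,
                             size_t nb10, size_t nb11) {
    const bool cont_rows = nb10 == sizeof(float);
    const bool cont_cols = nb11 == (size_t) ne10 * sizeof(float);

    if (cont_rows && cont_cols) {
        memcpy(dst, src, (size_t) (ne10 * ne11) * sizeof(float));   // one shot
    } else if (cont_rows) {
        for (int64_t i11 = 0; i11 < ne11; i11++) {                  // row by row
            memcpy(dst + i11 * ne10, src + i11 * nb11,
                   (size_t) ne10 * sizeof(float));
        }
    } else {
        for (int64_t i11 = 0; i11 < ne11; i11++) {                  // element by element
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                dst[i11 * ne10 + i10] =
                    *(const float *) (src + i11 * nb11 + i10 * nb10);
            }
        }
    }
}
```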
@@ -1652,18 +1700,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
+
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
 
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
-    const …
+    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+    const size_t q_sz = ggml_type_size(type) * x_bps;
 
     size_t x_size;
     size_t y_size;
@@ -1685,17 +1739,28 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     GGML_ASSERT(to_fp32_cl != nullptr);
 
     const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
 
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,11 +1769,11 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
-                const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-                const size_t …
+                const size_t global = ne01 * local;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                 const cl_int ncols = ne00;
                 events.emplace_back();
                 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1716,16 +1781,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne / global_denom;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1749,7 +1815,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
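In the quantized path the new x_bps value ("blocks per 2D slice") is what connects these host loops to the kernels: when src0 already lives on the GPU, the block index at which slice (i02, i03) begins is handed to the dequantize kernels as the global work offset. A small hedged sketch of that arithmetic (a hypothetical helper mirroring the expressions in the diff):

```c
#include <stddef.h>
#include <stdint.h>

// A 2-D slice of src0 holds x_ne = ne01*ne00 elements, i.e. x_bps = x_ne/blck
// quantized blocks, so slice (i02, i03) starts (i03*ne02 + i02) * x_bps blocks
// into the tensor. That block index is passed as the global work offset and
// read back in-kernel via get_global_offset(0).
static size_t slice_block_offset(int64_t i02, int64_t i03,
                                 int64_t ne00, int64_t ne01, int64_t ne02,
                                 int64_t blck_size) {
    const int64_t x_ne  = ne01 * ne00;       // elements per 2-D slice
    const int64_t x_bps = x_ne / blck_size;  // quantized blocks per 2-D slice
    return (size_t) ((i03 * ne02 + i02) * x_bps);
}
```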
@@ -1844,17 +1910,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne3 = tensor->ne[3];
 
     const ggml_type type = tensor->type;
-    const size_t …
+    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
 
     size_t q_size;
     cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
     tensor->data = data;
     // copy tensor to device
+    size_t offset = 0;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-            …
-            …
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+            offset += s_sz;
         }
     }
 
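Finally, ggml_cl_transform_tensor now fills the device buffer slice by slice, advancing a running byte offset by the size of one 2-D slice (s_sz) after each copy instead of recomputing the destination position. A hedged sketch of that loop shape, with the device copy abstracted behind a callback:

```c
#include <stddef.h>
#include <stdint.h>

// `copy_slice` stands in for the real ggml_cl_h2d_tensor_2d call; a non-zero
// return value is treated as an error and propagated.
typedef int (*copy_slice_fn)(size_t dst_offset, int64_t i3, int64_t i2, void * ctx);

static int upload_all_slices(int64_t ne2, int64_t ne3, size_t slice_bytes,
                             copy_slice_fn copy_slice, void * ctx) {
    size_t offset = 0;
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int err = copy_slice(offset, i3, i2, ctx);
            if (err != 0) {
                return err;
            }
            offset += slice_bytes;  // next slice lands right after this one
        }
    }
    return 0;
}
```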