llama_cpp 0.7.0 → 0.7.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +354 -126
- data/ext/llama_cpp/src/ggml-metal.metal +128 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +17 -15
- data/ext/llama_cpp/src/ggml.c +58 -46
- data/ext/llama_cpp/src/ggml.h +12 -7
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1360 -60
- data/lib/llama_cpp/version.rb +2 -2
- metadata +4 -2
data/ext/llama_cpp/src/ggml-metal.metal:

@@ -13,8 +13,8 @@ typedef struct {
 
 #define QK4_1 32
 typedef struct {
-    half d;
-    half m;
+    half d;          // delta
+    half m;          // min
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 
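For reference, d and m are the per-block scale and minimum of the Q4_1
format: each 4-bit quant q in [0, 15] decodes as x = d*q + m. A CPU-side
sketch of the decode (hypothetical helper, not part of this diff; layout
follows ggml's low-nibbles-first convention):

    // decode one block_q4_1 (32 weights) into floats: x = d*q + m
    static void dequantize_block_q4_1(const block_q4_1 * b, float * x) {
        const float d = (float) b->d; // delta (scale)
        const float m = (float) b->m; // min
        for (int i = 0; i < QK4_1/2; ++i) {
            const uint8_t q = b->qs[i];
            x[i]           = d * (q & 0x0F) + m; // low nibble
            x[i + QK4_1/2] = d * (q >> 4)   + m; // high nibble
        }
    }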
@@ -132,6 +132,13 @@ kernel void kernel_relu(
     dst[tpig] = max(0.0f, src0[tpig]);
 }
 
+kernel void kernel_sqr(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src0[tpig];
+}
+
 constant float GELU_COEF_A     = 0.044715f;
 constant float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
 
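kernel_sqr is a new element-wise op (dst[i] = src0[i]^2), one thread per
element. Its serial CPU equivalent, as a sketch:

    void sqr_f32(const float * src0, float * dst, int n) {
        for (int i = 0; i < n; ++i) {
            dst[i] = src0[i] * src0[i];
        }
    }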
@@ -338,10 +345,11 @@ kernel void kernel_rms_norm(
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
-    device const float * x_scalar = (device const float *) x;
-    float4 sumf=0;
-    float all_sum=0;
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float * x_scalar = (device const float *) x;
+
+    float4 sumf = 0;
+    float all_sum = 0;
 
     // parallel sum
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -354,6 +362,7 @@ kernel void kernel_rms_norm(
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
+
     // broadcast, simd group number is ntg / 32
     for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
         if (tpitg < i) {
@@ -361,7 +370,9 @@ kernel void kernel_rms_norm(
         }
     }
     if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i];
+        }
         sum[0] /= ne00;
     }
 
@@ -376,7 +387,9 @@ kernel void kernel_rms_norm(
         y[i00] = x[i00] * scale;
     }
     if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+            y_scalar[i00] = x_scalar[i00] * scale;
+        }
     }
 }
 
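Note: the three kernel_rms_norm hunks above mostly reformat the scalar
"tail" handling. The kernel reduces in float4 vectors, so when ne00 is not
a multiple of 4 the last ne00 % 4 elements are accumulated and scaled one
float at a time through x_scalar/y_scalar. The pattern, in plain C:

    /* vectorized body plus scalar tail over n floats (sketch) */
    float sum = 0.0f;
    for (int i = 0; i + 4 <= n; i += 4) {
        sum += x[i] + x[i+1] + x[i+2] + x[i+3]; /* 4 at a time */
    }
    for (int i = 4 * (n / 4); i < n; ++i) {
        sum += x[i];                            /* 0..3 leftovers */
    }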
@@ -416,8 +429,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
 }
 
 // putting them in the kernel cause a significant performance penalty
-#define N_DST 4
-#define N_SIMDGROUP 2
+#define N_DST 4 // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 //Note: This is a template, but strictly speaking it only applies to
 //      quantizations where the block size is 32. It also does not
@@ -428,18 +441,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
                     int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
                     uint3 tgpig, uint tiisg, uint sgitg) {
     const int nb = ne00/QK4_0;
+
     const int r0 = tgpig.x;
     const int r1 = tgpig.y;
     const int im = tgpig.z;
+
     const int first_row = (r0 * nsg + sgitg) * nr;
+
     const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
+
     device const block_q_type * x = (device const block_q_type *) src0 + offset0;
     device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
-    float yl[16];       // src1 vector cache
-    float sumf[nr]={0.f};
 
-    const int ix = tiisg/2;
-    const int il = 8*(tiisg%2);
+    float yl[16]; // src1 vector cache
+    float sumf[nr] = {0.f};
+
+    const int ix = (tiisg/2);
+    const int il = (tiisg%2)*8;
 
     device const float * yb = y + ix * QK4_0 + il;
 
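The inserted blank lines are cosmetic; the row mapping itself is
unchanged. With nr = N_DST = 4 and nsg = N_SIMDGROUP = 2, first_row =
(r0*nsg + sgitg)*nr means threadgroup 0 computes rows 0..3 (simdgroup 0)
and 4..7 (simdgroup 1), threadgroup 1 computes rows 8..11 and 12..15,
and so on: 8 output rows per threadgroup.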
@@ -450,6 +468,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
             sumy += yb[i] + yb[i+1];
             yl[i+0] = yb[i+ 0];
             yl[i+1] = yb[i+ 1]/256.f;
+
             sumy += yb[i+16] + yb[i+17];
             yl[i+8] = yb[i+16]/16.f;
             yl[i+9] = yb[i+17]/4096.f;
@@ -465,12 +484,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
     for (int row = 0; row < nr; ++row) {
         const float tot = simd_sum(sumf[row]);
         if (tiisg == 0 && first_row + row < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
         }
     }
 }
 
-kernel void kernel_mul_mat_q4_0_f32(
+kernel void kernel_mul_mv_q4_0_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -483,12 +502,12 @@ kernel void kernel_mul_mat_q4_0_f32(
     constant int64_t & ne1[[buffer(16)]],
     constant uint & gqa[[buffer(17)]],
     uint3 tgpig[[threadgroup_position_in_grid]],
-    uint tiisg[[thread_index_in_simdgroup]],
-    uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    uint  tiisg[[thread_index_in_simdgroup]],
+    uint  sgitg[[simdgroup_index_in_threadgroup]]) {
     mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
-kernel void kernel_mul_mat_q4_1_f32(
+kernel void kernel_mul_mv_q4_1_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -508,7 +527,7 @@ kernel void kernel_mul_mat_q4_1_f32(
 
 #define NB_Q8_0 8
 
-kernel void kernel_mul_mat_q8_0_f32(
+kernel void kernel_mul_mv_q8_0_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -572,7 +591,7 @@ kernel void kernel_mul_mat_q8_0_f32(
 
 #define N_F32_F32 4
 
-kernel void kernel_mul_mat_f32_f32(
+kernel void kernel_mul_mv_f32_f32(
     device const char * src0,
     device const char * src1,
     device float * dst,
@@ -643,7 +662,7 @@ kernel void kernel_mul_mat_f32_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32_1row(
+kernel void kernel_mul_mv_f16_f32_1row(
     device const char * src0,
     device const char * src1,
     device float * dst,
@@ -662,7 +681,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
     constant int64_t & ne0,
     constant int64_t & ne1,
     uint3 tgpig[[threadgroup_position_in_grid]],
-    uint tiisg[[thread_index_in_simdgroup]]) {
+    uint  tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -697,7 +716,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
 
 #define N_F16_F32 4
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mv_f16_f32(
     device const char * src0,
     device const char * src1,
     device float * dst,
@@ -769,7 +788,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 
 // Assumes row size (ne00) is a multiple of 4
-kernel void kernel_mul_mat_f16_f32_l4(
+kernel void kernel_mul_mv_f16_f32_l4(
     device const char * src0,
     device const char * src1,
     device float * dst,
@@ -1098,6 +1117,62 @@ kernel void kernel_cpy_f32_f32(
     }
 }
 
+kernel void kernel_concat(
+    device  const char * src0,
+    device  const char * src1,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne10,
+    constant   int64_t & ne11,
+    constant   int64_t & ne12,
+    constant   int64_t & ne13,
+    constant  uint64_t & nb10,
+    constant  uint64_t & nb11,
+    constant  uint64_t & nb12,
+    constant  uint64_t & nb13,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        if (i02 < ne02) {
+            ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
+            src0_ptr += ntg.x*nb00;
+        } else {
+            ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
+            src1_ptr += ntg.x*nb10;
+        }
+        dst_ptr += ntg.x*nb0;
+    }
+}
+
 //============================================ k-quants ======================================================
 
 #ifndef QK_K
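kernel_concat joins two tensors along dimension 2: output positions with
i02 < ne02 read from src0, the rest from src1 (the i1x = i0x % ne1x
indexing re-bases the coordinate into src1 and doubles as a broadcast).
A contiguous-f32 CPU sketch of the common case, assuming ne2 = ne02 +
ne12 and all other dims equal (hypothetical helper):

    void concat_dim2(const float * src0, const float * src1, float * dst,
                     int ne0, int ne1, int ne02, int ne12, int ne3) {
        const int ne2 = ne02 + ne12;
        for (int i3 = 0; i3 < ne3; ++i3)
        for (int i2 = 0; i2 < ne2; ++i2)
        for (int i1 = 0; i1 < ne1; ++i1)
        for (int i0 = 0; i0 < ne0; ++i0) {
            const size_t id = (((size_t)i3*ne2 + i2)*ne1 + i1)*ne0 + i0;
            if (i2 < ne02) {
                dst[id] = src0[(((size_t)i3*ne02 + i2)*ne1 + i1)*ne0 + i0];
            } else {
                dst[id] = src1[(((size_t)i3*ne12 + (i2 - ne02))*ne1 + i1)*ne0 + i0];
            }
        }
    }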
@@ -1190,7 +1265,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
 
 //====================================== dot products =========================
 
-kernel void kernel_mul_mat_q2_K_f32(
+kernel void kernel_mul_mv_q2_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1334,7 +1409,7 @@ kernel void kernel_mul_mat_q2_K_f32(
 }
 
 #if QK_K == 256
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1486,7 +1561,7 @@ kernel void kernel_mul_mat_q3_K_f32(
     }
 }
 #else
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1557,7 +1632,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 #endif
 
 #if QK_K == 256
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1663,7 +1738,7 @@ kernel void kernel_mul_mat_q4_K_f32(
     }
 }
 #else
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1752,7 +1827,7 @@ kernel void kernel_mul_mat_q4_K_f32(
 }
 #endif
 
-kernel void kernel_mul_mat_q5_K_f32(
+kernel void kernel_mul_mv_q5_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1925,7 +2000,7 @@ kernel void kernel_mul_mat_q5_K_f32(
 
 }
 
-kernel void kernel_mul_mat_q6_K_f32(
+kernel void kernel_mul_mv_q6_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -2263,7 +2338,7 @@ kernel void kernel_get_rows(
 }
 
 #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
 #define BLOCK_SIZE_K 32
 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
 #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
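These constants describe the kernel_mul_mm tiling: each threadgroup
produces a 64x32 block of the output, built from 8x8 simdgroup matrices
(64 = 8 tiles of rows from A, 32 = 4 tiles of columns from B). The four
simdgroups split that block 2x2 (the 32*(sgitg&1) and 16*(sgitg>>1)
offsets below), so each simdgroup accumulates THREAD_MAT_M x
THREAD_MAT_N = 4*2 = 8 of the 8x8 products, matching the c_res[8]
accumulators.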
@@ -2300,9 +2375,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
     const uint r0 = tgpig.y;
     const uint r1 = tgpig.x;
     const uint im = tgpig.z;
+
     // if this block is of 64x32 shape or smaller
     short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
     short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+
     // a thread shouldn't load data outside of the matrix
     short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
     short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
@@ -2326,26 +2403,30 @@ kernel void kernel_mul_mm(device const uchar * src0,
         + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
     for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
-        //load data and store to threadgroup memory
+        // load data and store to threadgroup memory
         half4x4 temp_a;
         dequantize_func(x, il, temp_a);
         threadgroup_barrier(mem_flags::mem_threadgroup);
+
         #pragma unroll(16)
         for (int i = 0; i < 16; i++) {
             *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
-            + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
+            + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+            + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
         }
-
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
+        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
         il = (il + 2 < nl) ? il + 2 : il % 2;
         x = (il < 2) ? x + (2+nl-1)/nl : x;
         y += BLOCK_SIZE_K;
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        //load matrices from threadgroup memory and conduct outer products
+
+        // load matrices from threadgroup memory and conduct outer products
         threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
         threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+
         #pragma unroll(4)
         for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
             #pragma unroll(4)
@@ -2360,6 +2441,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
 
             lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
             lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+
             #pragma unroll(8)
             for (int i = 0; i < 8; i++){
                 simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
@@ -2368,25 +2450,26 @@ kernel void kernel_mul_mm(device const uchar * src0,
     }
 
     if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
-        device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
-                        + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
+        device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
+                               + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
         for (int i = 0; i < 8; i++) {
             simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
         }
     } else {
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
         for (int i = 0; i < 8; i++) {
             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
         }
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-        if (sgitg==0) {
+
+        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+        if (sgitg == 0) {
             for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
+                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
                     *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
                 }
             }
data/ext/llama_cpp/src/ggml-opencl.cpp:

@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32
 
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q2_K * x = xx + ib0;
 
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q3_K * x = xx + ib0;
 
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
     const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/2; // 0...15
     const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q6_K * x = xx + ib0;
 
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
     const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
+    const int col_step = local_size * 2;
     const int y_offset = qr == 1 ? 1 : qk/2;
 
+    x += get_global_offset(0);
+
     tmp[tid] = 0;
 
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
         const int ib = (row*ncols + col)/qk; // block index
         const int iqs = (col%qk)/qr; // quant index
         const int iybs = col - col%qk; // y block start index
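Two changes combine in this kernel template: x += get_global_offset(0)
lets the host point the kernel at a sub-tensor without copying, and the
column loop is rewritten from block-strided indexing to a direct stride,
so ncols no longer has to line up with the work-group size. Both loop
forms visit the same columns; sketched for local_size = 32:

    /* old: for (int i = 0; i < ncols/32; i += 2) col = i*32 + 2*tid */
    /* new, equivalent but correct for any even ncols:               */
    for (int col = 2*tid; col < ncols; col += 64) {
        /* each iteration handles the quant pair at col, col+1 */
    }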
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
     // sum up partial sums and write back result
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }
@@ -1704,7 +1706,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
 
     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;
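The added ne00%2 == 0 guard matches the rewritten kernel loop above: each
work item consumes columns in pairs, so tensors with an odd row length
take the general dequantize-plus-CLBlast branch instead of the fast
matrix-vector path.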
@@ -1737,7 +1739,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     GGML_ASSERT(to_fp32_cl != nullptr);
 
     const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
 
     size_t ev_idx = 0;
     std::vector<cl_event> events;
@@ -1770,8 +1772,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
         // compute
-        const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-        const size_t local = CL_DMMV_BLOCK_SIZE;
+        const size_t global = ne01 * local;
+        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
         const cl_int ncols = ne00;
         events.emplace_back();
         CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1781,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
         CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
         CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
     } else { // general dequantization kernel + CLBlast matrix matrix multiplication
         // convert src0 to fp32 on device
         const size_t global = x_ne / global_denom;