llama_cpp 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +354 -126
- data/ext/llama_cpp/src/ggml-metal.metal +128 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +17 -15
- data/ext/llama_cpp/src/ggml.c +58 -46
- data/ext/llama_cpp/src/ggml.h +12 -7
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1360 -60
- data/lib/llama_cpp/version.rb +2 -2
- metadata +4 -2
data/ext/llama_cpp/src/ggml-metal.metal

@@ -13,8 +13,8 @@ typedef struct {
 
 #define QK4_1 32
 typedef struct {
-    half d;
-    half m;
+    half d;          // delta
+    half m;          // min
     uint8_t qs[QK4_1 / 2]; // nibbles / quants
 } block_q4_1;
 
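The two new comments encode the q4_1 dequantization rule: each 4-bit quant q maps back to d*q + m. A minimal CPU sketch of that mapping, assuming the `half` fields have been widened to `float` and using a hypothetical standalone struct rather than ggml's real `block_q4_1`:

```cpp
// Hypothetical reference for block_q4_1 dequantization: d is the scale
// (delta), m the block minimum. Low nibbles fill the first 16 outputs,
// high nibbles the last 16, matching ggml's row layout for this format.
struct block_q4_1_ref { float d, m; unsigned char qs[16]; }; // QK4_1/2 bytes
void dequantize_q4_1(const block_q4_1_ref & b, float * out /* 32 floats */) {
    for (int i = 0; i < 16; ++i) {
        out[i +  0] = b.d * (b.qs[i] & 0x0F) + b.m; // low nibble
        out[i + 16] = b.d * (b.qs[i] >>   4) + b.m; // high nibble
    }
}
```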
@@ -132,6 +132,13 @@ kernel void kernel_relu(
     dst[tpig] = max(0.0f, src0[tpig]);
 }
 
+kernel void kernel_sqr(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src0[tpig];
+}
+
 constant float GELU_COEF_A    = 0.044715f;
 constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 
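The new `kernel_sqr` is a plain element-wise kernel: one GPU thread per output index, squaring its element. A serial C++ equivalent, as a sketch only (hypothetical helper, not part of ggml's API):

```cpp
#include <cstddef>
// What kernel_sqr computes, serially: the Metal version runs one thread
// per index tpig instead of this loop.
void sqr_reference(const float * src0, float * dst, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        dst[i] = src0[i] * src0[i];
    }
}
```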
@@ -338,10 +345,11 @@ kernel void kernel_rms_norm(
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
-    device const float  * x_scalar = (device const float *) x;
-    float4 sumf=0;
-    float all_sum=0;
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float * x_scalar = (device const float *) x;
+
+    float4 sumf = 0;
+    float all_sum = 0;
 
     // parallel sum
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -354,6 +362,7 @@ kernel void kernel_rms_norm(
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
+
     // broadcast, simd group number is ntg / 32
     for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
         if (tpitg < i) {
@@ -361,7 +370,9 @@ kernel void kernel_rms_norm(
         }
     }
     if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i];
+        }
         sum[0] /= ne00;
     }
 
@@ -376,7 +387,9 @@ kernel void kernel_rms_norm(
         y[i00] = x[i00] * scale;
     }
     if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+            y_scalar[i00] = x_scalar[i00] * scale;
+        }
     }
 }
 
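The `kernel_rms_norm` changes are mostly about the scalar tail: the kernel sums and scales in `float4` chunks, then lets thread 0 pick up the remaining `ne00 % 4` elements through `x_scalar`/`y_scalar`. A serial sketch of the overall computation (hypothetical reference, not ggml code):

```cpp
#include <cmath>
// RMS norm over one row: mean of squares, then scale by 1/sqrt(mean + eps).
// The Metal kernel splits both loops into a float4 part plus a scalar tail;
// the result is the same as this plain loop.
void rms_norm_reference(const float * x, float * y, int ne00, float eps) {
    float mean = 0.0f;
    for (int i = 0; i < ne00; ++i) {
        mean += x[i] * x[i];
    }
    mean /= ne00;
    const float scale = 1.0f / std::sqrt(mean + eps);
    for (int i = 0; i < ne00; ++i) {
        y[i] = x[i] * scale;
    }
}
```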
|
@@ -416,8 +429,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
|
|
416
429
|
}
|
417
430
|
|
418
431
|
// putting them in the kernel cause a significant performance penalty
|
419
|
-
#define N_DST 4
|
420
|
-
#define N_SIMDGROUP 2
|
432
|
+
#define N_DST 4 // each SIMD group works on 4 rows
|
433
|
+
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
421
434
|
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
422
435
|
//Note: This is a template, but strictly speaking it only applies to
|
423
436
|
// quantizations where the block size is 32. It also does not
|
@@ -428,18 +441,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
|
428
441
|
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
|
429
442
|
uint3 tgpig, uint tiisg, uint sgitg) {
|
430
443
|
const int nb = ne00/QK4_0;
|
444
|
+
|
431
445
|
const int r0 = tgpig.x;
|
432
446
|
const int r1 = tgpig.y;
|
433
447
|
const int im = tgpig.z;
|
448
|
+
|
434
449
|
const int first_row = (r0 * nsg + sgitg) * nr;
|
450
|
+
|
435
451
|
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
|
452
|
+
|
436
453
|
device const block_q_type * x = (device const block_q_type *) src0 + offset0;
|
437
454
|
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
438
|
-
float yl[16]; // src1 vector cache
|
439
|
-
float sumf[nr]={0.f};
|
440
455
|
|
441
|
-
|
442
|
-
|
456
|
+
float yl[16]; // src1 vector cache
|
457
|
+
float sumf[nr] = {0.f};
|
458
|
+
|
459
|
+
const int ix = (tiisg/2);
|
460
|
+
const int il = (tiisg%2)*8;
|
443
461
|
|
444
462
|
device const float * yb = y + ix * QK4_0 + il;
|
445
463
|
|
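The new comments and spacing make the work split in `mul_vec_q_n_f32` easier to follow: each of the `N_SIMDGROUP` SIMD groups in a threadgroup owns `N_DST` consecutive rows, and within a SIMD group lanes pair up on quant blocks via `ix`/`il`. A standalone sketch of that mapping (illustrative only; it assumes `r0 = 0` and prints just the first lanes):

```cpp
#include <cstdio>
// Prints which rows/blocks the first lanes of each SIMD group would touch,
// mirroring first_row, ix and il from mul_vec_q_n_f32.
int main() {
    const int nsg = 2, nr = 4;                    // N_SIMDGROUP, N_DST
    for (int sgitg = 0; sgitg < nsg; ++sgitg) {
        for (int tiisg = 0; tiisg < 4; ++tiisg) { // first 4 of 32 lanes
            const int first_row = (0 * nsg + sgitg) * nr;   // r0 = 0
            const int ix = tiisg / 2;        // quant block for this lane
            const int il = (tiisg % 2) * 8;  // 8-value half within the block
            std::printf("sg %d lane %d: rows %d..%d, block %d, offset %d\n",
                        sgitg, tiisg, first_row, first_row + nr - 1, ix, il);
        }
    }
    return 0;
}
```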
@@ -450,6 +468,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
         sumy += yb[i] + yb[i+1];
         yl[i+0] = yb[i+ 0];
         yl[i+1] = yb[i+ 1]/256.f;
+
         sumy += yb[i+16] + yb[i+17];
         yl[i+8] = yb[i+16]/16.f;
         yl[i+9] = yb[i+17]/4096.f;
@@ -465,12 +484,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
     for (int row = 0; row < nr; ++row) {
         const float tot = simd_sum(sumf[row]);
         if (tiisg == 0 && first_row + row < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
         }
     }
 }
 
-kernel void kernel_mul_mat_q4_0_f32(
+kernel void kernel_mul_mv_q4_0_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -483,12 +502,12 @@ kernel void kernel_mul_mat_q4_0_f32(
     constant int64_t & ne1[[buffer(16)]],
     constant uint & gqa[[buffer(17)]],
     uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    uint  tiisg[[thread_index_in_simdgroup]],
+    uint  sgitg[[simdgroup_index_in_threadgroup]]) {
     mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
 
-kernel void kernel_mul_mat_q4_1_f32(
+kernel void kernel_mul_mv_q4_1_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -508,7 +527,7 @@ kernel void kernel_mul_mat_q4_1_f32(
 
 #define NB_Q8_0 8
 
-kernel void kernel_mul_mat_q8_0_f32(
+kernel void kernel_mul_mv_q8_0_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -572,7 +591,7 @@ kernel void kernel_mul_mat_q8_0_f32(
 
 #define N_F32_F32 4
 
-kernel void kernel_mul_mat_f32_f32(
+kernel void kernel_mul_mv_f32_f32(
     device const char * src0,
     device const char * src1,
     device float * dst,
@@ -643,7 +662,7 @@ kernel void kernel_mul_mat_f32_f32(
     }
 }
 
-kernel void kernel_mul_mat_f16_f32_1row(
+kernel void kernel_mul_mv_f16_f32_1row(
     device const char * src0,
     device const char * src1,
     device float * dst,
@@ -662,7 +681,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
     constant int64_t & ne0,
     constant int64_t & ne1,
     uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
+    uint  tiisg[[thread_index_in_simdgroup]]) {
 
     const int64_t r0 = tgpig.x;
     const int64_t r1 = tgpig.y;
@@ -697,7 +716,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
 
 #define N_F16_F32 4
 
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mv_f16_f32(
     device const char * src0,
     device const char * src1,
     device float * dst,
@@ -769,7 +788,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 
 // Assumes row size (ne00) is a multiple of 4
-kernel void kernel_mul_mat_f16_f32_l4(
+kernel void kernel_mul_mv_f16_f32_l4(
     device const char * src0,
     device const char * src1,
     device float * dst,
@@ -1098,6 +1117,62 @@ kernel void kernel_cpy_f32_f32(
     }
 }
 
+kernel void kernel_concat(
+    device  const char * src0,
+    device  const char * src1,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne10,
+    constant   int64_t & ne11,
+    constant   int64_t & ne12,
+    constant   int64_t & ne13,
+    constant  uint64_t & nb10,
+    constant  uint64_t & nb11,
+    constant  uint64_t & nb12,
+    constant  uint64_t & nb13,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        if (i02 < ne02) {
+            ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
+            src0_ptr += ntg.x*nb00;
+        } else {
+            ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
+            src1_ptr += ntg.x*nb10;
+        }
+        dst_ptr += ntg.x*nb0;
+    }
+}
+
 //============================================ k-quants ======================================================
 
 #ifndef QK_K
@@ -1190,7 +1265,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
 
 //====================================== dot products =========================
 
-kernel void kernel_mul_mat_q2_K_f32(
+kernel void kernel_mul_mv_q2_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1334,7 +1409,7 @@ kernel void kernel_mul_mat_q2_K_f32(
 }
 
 #if QK_K == 256
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1486,7 +1561,7 @@ kernel void kernel_mul_mat_q3_K_f32(
     }
 }
 #else
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1557,7 +1632,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 #endif
 
 #if QK_K == 256
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1663,7 +1738,7 @@ kernel void kernel_mul_mat_q4_K_f32(
     }
 }
 #else
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1752,7 +1827,7 @@ kernel void kernel_mul_mat_q4_K_f32(
 }
 #endif
 
-kernel void kernel_mul_mat_q5_K_f32(
+kernel void kernel_mul_mv_q5_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -1925,7 +2000,7 @@ kernel void kernel_mul_mat_q5_K_f32(
 
 }
 
-kernel void kernel_mul_mat_q6_K_f32(
+kernel void kernel_mul_mv_q6_K_f32(
     device const void * src0,
     device const float * src1,
     device float * dst,
@@ -2263,7 +2338,7 @@ kernel void kernel_get_rows(
 }
 
 #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
 #define BLOCK_SIZE_K 32
 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
 #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@@ -2300,9 +2375,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
     const uint r0 = tgpig.y;
     const uint r1 = tgpig.x;
     const uint im = tgpig.z;
+
     // if this block is of 64x32 shape or smaller
     short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
     short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+
     // a thread shouldn't load data outside of the matrix
     short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
     short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
@@ -2326,26 +2403,30 @@ kernel void kernel_mul_mm(device const uchar * src0,
                        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
 
     for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
-        //load data and store to threadgroup memory
+        // load data and store to threadgroup memory
         half4x4 temp_a;
         dequantize_func(x, il, temp_a);
        threadgroup_barrier(mem_flags::mem_threadgroup);
+
         #pragma unroll(16)
         for (int i = 0; i < 16; i++) {
             *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-                + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
-                + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
+            + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+            + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
         }
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
-                = *((device float2x4 *)y);
+
+        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
         il = (il + 2 < nl) ? il + 2 : il % 2;
         x = (il < 2) ? x + (2+nl-1)/nl : x;
         y += BLOCK_SIZE_K;
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        //load matrices from threadgroup memory and conduct outer products
+
+        // load matrices from threadgroup memory and conduct outer products
         threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
         threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+
         #pragma unroll(4)
         for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
             #pragma unroll(4)
@@ -2360,6 +2441,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
 
         lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
         lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+
         #pragma unroll(8)
         for (int i = 0; i < 8; i++){
             simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
@@ -2368,25 +2450,26 @@ kernel void kernel_mul_mm(device const uchar * src0,
     }
 
     if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
-        device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
-                + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
+        device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
+                               + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
         for (int i = 0; i < 8; i++) {
             simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
         }
     } else {
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
         for (int i = 0; i < 8; i++) {
             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
         }
 
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-        if (sgitg == 0) {
+
+        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+        if (sgitg == 0) {
             for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
+                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
                     *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
                 }
             }
data/ext/llama_cpp/src/ggml-opencl.cpp

@@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32
 
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q2_K * x = xx + ib0;
 
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q3_K * x = xx + ib0;
 
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...15
     const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
 
     const int row = get_group_id(0);
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     const int tid = get_local_id(0)/2;  // 0...15
     const int ix = get_local_id(0)%2;
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int row = get_group_id(0);
 
     const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
 
     __global const struct block_q6_K * x = xx + ib0;
 
@@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
     const int row = get_group_id(0);
     const int tid = get_local_id(0);
 
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
+    const int col_step = local_size * 2;
     const int y_offset = qr == 1 ? 1 : qk/2;
 
+    x += get_global_offset(0);
+
     tmp[tid] = 0;
 
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
+    for (int col = tid*2; col < ncols; col += col_step) {
         const int ib = (row*ncols + col)/qk; // block index
         const int iqs = (col%qk)/qr; // quant index
         const int iybs = col - col%qk; // y block start index
@@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 
     // sum up partial sums and write back result
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
         if (tid < s) {
             tmp[tid] += tmp[tid + s];
         }
@@ -1704,7 +1706,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
     const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
 
     const int64_t r2 = ne12 / ne02;
     const int64_t r3 = ne13 / ne03;
@@ -1737,7 +1739,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     GGML_ASSERT(to_fp32_cl != nullptr);
 
     const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
 
     size_t ev_idx = 0;
     std::vector<cl_event> events;
@@ -1770,8 +1772,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
             // compute
-            const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
-            const size_t local = CL_DMMV_BLOCK_SIZE;
+            const size_t global = ne01 * local;
+            const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
             const cl_int ncols = ne00;
             events.emplace_back();
             CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@@ -1779,7 +1781,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
             CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
             CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-            CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
         } else { // general dequantization kernel + CLBlast matrix matrix multiplication
             // convert src0 to fp32 on device
             const size_t global = x_ne / global_denom;