@fugood/llama.node 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
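The hunks below are from package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp (+14 -286); the other arch/*/quants.c and arch/*/repack.cpp entries above follow the same shape. In each function, the hand-written scalar fallback is deleted and the #else / post-#endif tail now calls a shared *_generic implementation, so the scalar code lives in one place instead of being duplicated per architecture. A minimal sketch of the resulting dispatch, with a hypothetical gemv_q4_0_4x4_q8_0_neon() standing in for the inline NEON kernel that the real file keeps unchanged:

void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
                             const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy,
                             int nr, int nc) {
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    // architecture-specific fast path (unchanged by this release)
    gemv_q4_0_4x4_q8_0_neon(n, s, bs, vx, vy, nr, nc); // hypothetical stand-in
    return;
#endif
    // scalar fallback: one shared definition replaces the per-arch copies
    ggml_gemv_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}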
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -86,35 +86,9 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
         }
     }
 #else
-    // scalar
-    const int blck_size_interleave = 4;
-    float srcv[4][QK8_0];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
-                amax = MAX(amax, fabsf(srcv[row_iter][j]));
-            }
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < QK8_0 * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-
-            float x0 = srcv[src_id][src_offset] * id[src_id];
-            y[i].qs[j] = roundf(x0);
-        }
-    }
+    UNUSED(nb);
+    UNUSED(y);
+    ggml_quantize_mat_q8_0_4x4_generic(x, vy, k);
 #endif
 }
 
@@ -205,35 +179,9 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
     }
 
 #else
-    // scalar
-    const int blck_size_interleave = 8;
-    float srcv[4][QK8_0];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
-                amax = MAX(amax, fabsf(srcv[row_iter][j]));
-            }
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < QK8_0 * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-
-            float x0 = srcv[src_id][src_offset] * id[src_id];
-            y[i].qs[j] = roundf(x0);
-        }
-    }
+    UNUSED(nb);
+    UNUSED(y);
+    ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
 #endif
 }
 
@@ -295,29 +243,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
-    float sumf[4];
-    int sumi;
-
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-    }
+    ggml_gemv_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -383,29 +309,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
-    float sumf[4];
-    int sumi;
-
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-    }
+    ggml_gemv_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -497,31 +401,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif // #if defined(__ARM_FEATURE_SVE)
 
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-    {
-        float sumf[8];
-        int sumi;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                    }
-                }
-            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-        }
-    }
+    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -591,31 +471,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
     }
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4];
-        int sumi;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
-
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                    }
-                }
-            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-        }
-    }
+    ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1096,40 +952,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     );
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4][4];
-        int sumi;
-
-        for (int y = 0; y < nr / 4; y++) {
-            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-            for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-                }
-                for (int l = 0; l < nb; l++) {
-                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                        for (int m = 0; m < 4; m++) {
-                            for (int j = 0; j < ncols_interleaved; j++) {
-                                sumi = 0;
-                                for (int i = 0; i < blocklen; ++i) {
-                                    const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                    const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                                }
-                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                            }
-                        }
-                    }
-                }
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++)
-                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-                }
-            }
-        }
-    }
+    ggml_gemm_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1550,38 +1373,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     );
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    float sumf[4][4];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
+    ggml_gemm_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -2019,38 +1811,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
 
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-    float sumf[4][8];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
+    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -2126,38 +1887,5 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
     }
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4][4];
-        int sumi;
-
-        for (int y = 0; y < nr / 4; y++) {
-            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-            for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-                }
-                for (int l = 0; l < nb; l++) {
-                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                        for (int m = 0; m < 4; m++) {
-                            for (int j = 0; j < ncols_interleaved; j++) {
-                                sumi = 0;
-                                for (int i = 0; i < blocklen; ++i) {
-                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
-                                }
-                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                            }
-                        }
-                    }
-                }
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++)
-                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-                }
-            }
-        }
-    }
+    ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
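For reference, the scalar fallback deleted above (and now provided once by the *_generic routines) quantizes each 32-float block the same way in every copy: take the block's absolute maximum, derive the scale d so that amax maps to 127, store d as fp16, and round each value against the inverse scale. A condensed restatement of the removed lines, not the upstream source:

#include <math.h>
#include <stdint.h>

#define QK8_0 32

// Quantize one block of 32 floats to q8_0: int8 values plus one scale.
static void quantize_block_q8_0(const float * x, int8_t * qs, float * d_out) {
    float amax = 0.0f;                       // absolute max of the block
    for (int j = 0; j < QK8_0; j++) {
        amax = fmaxf(amax, fabsf(x[j]));
    }
    const float d  = amax / ((1 << 7) - 1);  // scale: amax maps to 127
    const float id = d ? 1.0f / d : 0.0f;    // inverse scale; 0 for an all-zero block
    for (int j = 0; j < QK8_0; j++) {
        qs[j] = (int8_t) roundf(x[j] * id);  // round-to-nearest against the scale
    }
    *d_out = d;                              // stored as fp16 in the real block struct
}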