@fugood/llama.node 1.1.1 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +13 -0
- package/lib/index.js +3 -0
- package/lib/index.ts +6 -0
- package/package.json +14 -14
- package/src/LlamaCompletionWorker.cpp +3 -2
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +50 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
- package/src/tts_utils.h +3 -3
|
@@ -172,24 +172,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
172
172
|
|
|
173
173
|
sumf = acc[0] + acc[1] + acc[2] + acc[3];
|
|
174
174
|
|
|
175
|
-
#endif
|
|
176
|
-
for (; ib < nb; ++ib) {
|
|
177
|
-
int sumi0 = 0;
|
|
178
|
-
int sumi1 = 0;
|
|
179
|
-
|
|
180
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
181
|
-
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
|
|
182
|
-
const int v1 = (x[ib].qs[j] >> 4) - 8;
|
|
183
|
-
|
|
184
|
-
sumi0 += (v0 * y[ib].qs[j]);
|
|
185
|
-
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
int sumi = sumi0 + sumi1;
|
|
189
|
-
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
|
190
|
-
}
|
|
191
|
-
|
|
192
175
|
*s = sumf;
|
|
176
|
+
#else
|
|
177
|
+
UNUSED(nb);
|
|
178
|
+
UNUSED(x);
|
|
179
|
+
UNUSED(y);
|
|
180
|
+
UNUSED(ib);
|
|
181
|
+
UNUSED(sumf);
|
|
182
|
+
ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
183
|
+
#endif
|
|
193
184
|
}
|
|
194
185
|
|
|
195
186
|
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -239,24 +230,15 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
239
230
|
|
|
240
231
|
sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
|
|
241
232
|
|
|
242
|
-
#endif
|
|
243
|
-
for (; ib < nb; ++ib) {
|
|
244
|
-
int sumi0 = 0;
|
|
245
|
-
int sumi1 = 0;
|
|
246
|
-
|
|
247
|
-
for (int j = 0; j < qk/2; ++j) {
|
|
248
|
-
const int v0 = (x[ib].qs[j] & 0x0F);
|
|
249
|
-
const int v1 = (x[ib].qs[j] >> 4);
|
|
250
|
-
|
|
251
|
-
sumi0 += (v0 * y[ib].qs[j]);
|
|
252
|
-
sumi1 += (v1 * y[ib].qs[j + qk/2]);
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
int sumi = sumi0 + sumi1;
|
|
256
|
-
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
|
257
|
-
}
|
|
258
|
-
|
|
259
233
|
*s = sumf;
|
|
234
|
+
#else
|
|
235
|
+
UNUSED(nb);
|
|
236
|
+
UNUSED(x);
|
|
237
|
+
UNUSED(y);
|
|
238
|
+
UNUSED(ib);
|
|
239
|
+
UNUSED(sumf);
|
|
240
|
+
ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
241
|
+
#endif
|
|
260
242
|
}
|
|
261
243
|
|
|
262
244
|
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -298,18 +280,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
298
280
|
|
|
299
281
|
sumf = acc[0] + acc[1] + acc[2] + acc[3];
|
|
300
282
|
|
|
301
|
-
#endif
|
|
302
|
-
for (; ib < nb; ++ib) {
|
|
303
|
-
int sumi = 0;
|
|
304
|
-
|
|
305
|
-
for (int j = 0; j < qk; j++) {
|
|
306
|
-
sumi += x[ib].qs[j]*y[ib].qs[j];
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
|
310
|
-
}
|
|
311
|
-
|
|
312
283
|
*s = sumf;
|
|
284
|
+
#else
|
|
285
|
+
UNUSED(nb);
|
|
286
|
+
UNUSED(x);
|
|
287
|
+
UNUSED(y);
|
|
288
|
+
UNUSED(ib);
|
|
289
|
+
UNUSED(sumf);
|
|
290
|
+
ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
291
|
+
#endif
|
|
313
292
|
}
|
|
314
293
|
|
|
315
294
|
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -442,70 +421,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
442
421
|
*s = sum;
|
|
443
422
|
|
|
444
423
|
#else
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
// write vectorized versions for AVX, ARM_NEON, etc.
|
|
452
|
-
|
|
453
|
-
int8_t aux8[QK_K];
|
|
454
|
-
int16_t aux16[8];
|
|
455
|
-
float sums [8];
|
|
456
|
-
int32_t aux32[8];
|
|
457
|
-
memset(sums, 0, 8*sizeof(float));
|
|
458
|
-
|
|
459
|
-
uint32_t auxs[4];
|
|
460
|
-
const int8_t * scales = (const int8_t*)auxs;
|
|
461
|
-
|
|
462
|
-
float sumf = 0;
|
|
463
|
-
for (int i = 0; i < nb; ++i) {
|
|
464
|
-
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
465
|
-
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
|
466
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
467
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
468
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
469
|
-
uint8_t m = 1;
|
|
470
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
471
|
-
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
|
472
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
473
|
-
a += 32; m <<= 1;
|
|
474
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
|
475
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
476
|
-
a += 32; m <<= 1;
|
|
477
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
|
478
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
479
|
-
a += 32; m <<= 1;
|
|
480
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
|
481
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
482
|
-
a += 32; m <<= 1;
|
|
483
|
-
q3 += 32;
|
|
484
|
-
}
|
|
485
|
-
a = aux8;
|
|
486
|
-
|
|
487
|
-
memcpy(auxs, x[i].scales, 12);
|
|
488
|
-
uint32_t tmp = auxs[2];
|
|
489
|
-
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
|
490
|
-
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
|
491
|
-
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
|
492
|
-
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
|
493
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
494
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
495
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
496
|
-
q8 += 8; a += 8;
|
|
497
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
498
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
499
|
-
q8 += 8; a += 8;
|
|
500
|
-
}
|
|
501
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
502
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
503
|
-
}
|
|
504
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
505
|
-
*s = sumf;
|
|
506
|
-
|
|
424
|
+
UNUSED(kmask1);
|
|
425
|
+
UNUSED(kmask2);
|
|
426
|
+
UNUSED(x);
|
|
427
|
+
UNUSED(y);
|
|
428
|
+
UNUSED(nb);
|
|
429
|
+
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
507
430
|
#endif
|
|
508
|
-
|
|
509
431
|
}
|
|
510
432
|
|
|
511
433
|
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -600,61 +522,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
600
522
|
*s = sumf;
|
|
601
523
|
|
|
602
524
|
#else
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
memset(sums, 0, 8*sizeof(float));
|
|
612
|
-
|
|
613
|
-
float sumf = 0;
|
|
614
|
-
for (int i = 0; i < nb; ++i) {
|
|
615
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
616
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
617
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
618
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
619
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
620
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
621
|
-
a += 32;
|
|
622
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
623
|
-
a += 32; q4 += 32;
|
|
624
|
-
}
|
|
625
|
-
memcpy(utmp, x[i].scales, 12);
|
|
626
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
627
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
628
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
629
|
-
utmp[2] = uaux;
|
|
630
|
-
utmp[0] &= kmask1;
|
|
631
|
-
|
|
632
|
-
int sumi = 0;
|
|
633
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
634
|
-
a = aux8;
|
|
635
|
-
int is = 0;
|
|
636
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
637
|
-
int32_t scale = scales[is++];
|
|
638
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
639
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
640
|
-
q8 += 8; a += 8;
|
|
641
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
642
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
643
|
-
q8 += 8; a += 8;
|
|
644
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
645
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
646
|
-
q8 += 8; a += 8;
|
|
647
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
648
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
649
|
-
q8 += 8; a += 8;
|
|
650
|
-
}
|
|
651
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
652
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
653
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
654
|
-
sumf -= dmin * sumi;
|
|
655
|
-
}
|
|
656
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
657
|
-
*s = sumf;
|
|
525
|
+
UNUSED(x);
|
|
526
|
+
UNUSED(y);
|
|
527
|
+
UNUSED(nb);
|
|
528
|
+
UNUSED(kmask1);
|
|
529
|
+
UNUSED(kmask2);
|
|
530
|
+
UNUSED(kmask3);
|
|
531
|
+
UNUSED(utmp);
|
|
532
|
+
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
658
533
|
#endif
|
|
659
534
|
}
|
|
660
535
|
|
|
@@ -767,66 +642,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
767
642
|
*s = sumf;
|
|
768
643
|
|
|
769
644
|
#else
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
memset(sums, 0, 8*sizeof(float));
|
|
779
|
-
|
|
780
|
-
float sumf = 0;
|
|
781
|
-
for (int i = 0; i < nb; ++i) {
|
|
782
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
783
|
-
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
|
784
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
785
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
786
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
787
|
-
uint8_t m = 1;
|
|
788
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
789
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
790
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
791
|
-
a += 32; m <<= 1;
|
|
792
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
793
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
794
|
-
a += 32; m <<= 1;
|
|
795
|
-
q4 += 32;
|
|
796
|
-
}
|
|
797
|
-
memcpy(utmp, x[i].scales, 12);
|
|
798
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
799
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
800
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
801
|
-
utmp[2] = uaux;
|
|
802
|
-
utmp[0] &= kmask1;
|
|
803
|
-
|
|
804
|
-
int sumi = 0;
|
|
805
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
806
|
-
a = aux8;
|
|
807
|
-
int is = 0;
|
|
808
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
809
|
-
int32_t scale = scales[is++];
|
|
810
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
811
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
812
|
-
q8 += 8; a += 8;
|
|
813
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
814
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
815
|
-
q8 += 8; a += 8;
|
|
816
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
817
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
818
|
-
q8 += 8; a += 8;
|
|
819
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
820
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
821
|
-
q8 += 8; a += 8;
|
|
822
|
-
}
|
|
823
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
824
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
825
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
826
|
-
sumf -= dmin * sumi;
|
|
827
|
-
}
|
|
828
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
829
|
-
*s = sumf;
|
|
645
|
+
UNUSED(x);
|
|
646
|
+
UNUSED(y);
|
|
647
|
+
UNUSED(nb);
|
|
648
|
+
UNUSED(kmask1);
|
|
649
|
+
UNUSED(kmask2);
|
|
650
|
+
UNUSED(kmask3);
|
|
651
|
+
UNUSED(utmp);
|
|
652
|
+
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
830
653
|
#endif
|
|
831
654
|
}
|
|
832
655
|
|
|
@@ -969,47 +792,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
969
792
|
*s = sum;
|
|
970
793
|
|
|
971
794
|
#else
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
int32_t aux32[8];
|
|
977
|
-
memset(sums, 0, 8*sizeof(float));
|
|
978
|
-
|
|
979
|
-
float sumf = 0;
|
|
980
|
-
for (int i = 0; i < nb; ++i) {
|
|
981
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
982
|
-
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
983
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
984
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
985
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
986
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
987
|
-
for (int l = 0; l < 32; ++l) {
|
|
988
|
-
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
989
|
-
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
|
990
|
-
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
|
991
|
-
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
|
992
|
-
}
|
|
993
|
-
a += 128;
|
|
994
|
-
q4 += 64;
|
|
995
|
-
qh += 32;
|
|
996
|
-
}
|
|
997
|
-
a = aux8;
|
|
998
|
-
int is = 0;
|
|
999
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
1000
|
-
int scale = x[i].scales[is++];
|
|
1001
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1002
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1003
|
-
q8 += 8; a += 8;
|
|
1004
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
1005
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
1006
|
-
q8 += 8; a += 8;
|
|
1007
|
-
}
|
|
1008
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
1009
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
1010
|
-
}
|
|
1011
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
1012
|
-
*s = sumf;
|
|
795
|
+
UNUSED(x);
|
|
796
|
+
UNUSED(y);
|
|
797
|
+
UNUSED(nb);
|
|
798
|
+
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1013
799
|
#endif
|
|
1014
800
|
}
|
|
1015
801
|
|
|
@@ -1186,17 +972,15 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1186
972
|
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
|
|
1187
973
|
}
|
|
1188
974
|
|
|
1189
|
-
#endif
|
|
1190
|
-
for (; ib < nb; ++ib) {
|
|
1191
|
-
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
|
|
1192
|
-
int sumi1 = 0, sumi2 = 0;
|
|
1193
|
-
for (int j = 0; j < QK4_NL/2; ++j) {
|
|
1194
|
-
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
|
1195
|
-
sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
|
|
1196
|
-
}
|
|
1197
|
-
sumf += d * (sumi1 + sumi2);
|
|
1198
|
-
}
|
|
1199
975
|
*s = sumf;
|
|
976
|
+
#else
|
|
977
|
+
UNUSED(x);
|
|
978
|
+
UNUSED(y);
|
|
979
|
+
UNUSED(nb);
|
|
980
|
+
UNUSED(ib);
|
|
981
|
+
UNUSED(sumf);
|
|
982
|
+
ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
983
|
+
#endif
|
|
1200
984
|
}
|
|
1201
985
|
|
|
1202
986
|
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
|
@@ -1264,37 +1048,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
1264
1048
|
*s = sumf;
|
|
1265
1049
|
|
|
1266
1050
|
#else
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
const uint8_t * qs = x[ibl].qs;
|
|
1272
|
-
const int8_t * q8 = y[ibl].qs;
|
|
1273
|
-
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
|
1274
|
-
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
|
1275
|
-
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
|
1276
|
-
h >>= 4;
|
|
1277
|
-
const float d1 = d4d8*(ls1 - 32);
|
|
1278
|
-
const float d2 = d4d8*(ls2 - 32);
|
|
1279
|
-
int sumi1 = 0, sumi2 = 0;
|
|
1280
|
-
for (int j = 0; j < 16; ++j) {
|
|
1281
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
1282
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
1283
|
-
}
|
|
1284
|
-
sumf += d1 * (sumi1 + sumi2);
|
|
1285
|
-
qs += 16;
|
|
1286
|
-
q8 += 32;
|
|
1287
|
-
sumi1 = sumi2 = 0;
|
|
1288
|
-
for (int j = 0; j < 16; ++j) {
|
|
1289
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
1290
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
1291
|
-
}
|
|
1292
|
-
sumf += d2 * (sumi1 + sumi2);
|
|
1293
|
-
qs += 16;
|
|
1294
|
-
q8 += 32;
|
|
1295
|
-
}
|
|
1296
|
-
}
|
|
1297
|
-
*s = sumf;
|
|
1051
|
+
UNUSED(x);
|
|
1052
|
+
UNUSED(y);
|
|
1053
|
+
UNUSED(nb);
|
|
1054
|
+
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1298
1055
|
#endif
|
|
1299
1056
|
}
|
|
1300
1057
|
|