@fugood/llama.node 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -172,24 +172,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = acc[0] + acc[1] + acc[2] + acc[3];
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
-            const int v1 = (x[ib].qs[j] >> 4) - 8;
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -239,24 +230,15 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F);
-            const int v1 = (x[ib].qs[j] >> 4);
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -298,18 +280,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = acc[0] + acc[1] + acc[2] + acc[3];
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[ib].qs[j]*y[ib].qs[j];
-        }
-
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -442,70 +421,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sum;
 
 #else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -600,61 +522,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -767,66 +642,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -969,47 +792,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sum;
 
 #else
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1186,17 +972,15 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
         sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
-        int sumi1 = 0, sumi2 = 0;
-        for (int j = 0; j < QK4_NL/2; ++j) {
-            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
-        }
-        sumf += d * (sumi1 + sumi2);
-    }
     *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1264,37 +1048,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = sumf;
 
 #else
-    float sumf = 0;
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
-        uint16_t h = x[ibl].scales_h;
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t * q8 = y[ibl].qs;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
-            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
-            h >>= 4;
-            const float d1 = d4d8*(ls1 - 32);
-            const float d2 = d4d8*(ls2 - 32);
-            int sumi1 = 0, sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d1 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-            sumi1 = sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d2 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-        }
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
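Note on the pattern above: every hunk applies the same refactor. The hand-written scalar fallback (either a remainder loop after the SIMD path or a full scalar body under #else) is removed, and when no SIMD implementation is compiled the function delegates to a shared ggml_vec_dot_*_generic implementation, with UNUSED() silencing warnings about locals the fallback no longer reads. Below is a minimal, self-contained sketch of that dispatch shape; the guard macro, the simplified block type, and the helper names are illustrative stand-ins rather than the package's actual code, and only the q8_0 scalar loop mirrors the removed lines.

// Illustrative sketch only: __SOME_SIMD_ARCH__, block_q8_0 layout, and the
// function names here are assumptions, not taken from the diff verbatim.
#include <stdio.h>

#define QK8_0 32

typedef struct {
    float d;                // block scale (fp16 in ggml; plain float here for brevity)
    signed char qs[QK8_0];  // quantized values
} block_q8_0;

// Shared scalar fallback; the loop body mirrors the removed q8_0 x q8_0 code.
static void vec_dot_q8_0_q8_0_generic(int n, float * s,
                                      const block_q8_0 * x, const block_q8_0 * y) {
    const int nb = n / QK8_0;
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; j++) {
            sumi += x[ib].qs[j] * y[ib].qs[j];
        }
        sumf += sumi * (x[ib].d * y[ib].d);
    }
    *s = sumf;
}

static void vec_dot_q8_0_q8_0(int n, float * s,
                              const block_q8_0 * x, const block_q8_0 * y) {
#if defined(__SOME_SIMD_ARCH__)   // placeholder guard; the real code checks NEON/RVV/etc.
    // ... architecture-specific SIMD path would live here ...
#else
    // Fallback shape introduced by the diff: delegate instead of open-coding a loop.
    vec_dot_q8_0_q8_0_generic(n, s, x, y);
#endif
}

int main(void) {
    block_q8_0 x = { 0.5f, {0} }, y = { 2.0f, {0} };
    for (int j = 0; j < QK8_0; ++j) { x.qs[j] = 1; y.qs[j] = 3; }
    float s = 0.0f;
    vec_dot_q8_0_q8_0(QK8_0, &s, &x, &y);
    printf("dot = %f\n", s);   // 32 * (1*3) * (0.5*2.0) = 96
    return 0;
}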