@fugood/llama.node 1.1.1 → 1.1.3

This diff shows the changes between publicly released versions of the package as they appear in its public registry, and is provided for informational purposes only.
@@ -435,30 +435,15 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
            wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -545,30 +530,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
            wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >>  4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -628,18 +598,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
            wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[ib].qs[j]*y[ib].qs[j];
-        }
-
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -755,45 +722,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -902,68 +834,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 
 }
@@ -1089,61 +965,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1279,66 +1108,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1435,47 +1212,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a  += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 