@fugood/llama.node 1.1.1 → 1.1.2

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
@@ -201,24 +201,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
-            const int v1 = (x[ib].qs[j] >> 4) - 8;
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
-    }
-
     *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
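
Every hunk in this diff applies the same refactor: in 1.1.1 the scalar loop after `#endif` served both as the remainder loop for the vectorized path and as the whole implementation on builds without POWER9 VSX; in 1.1.2 the vector path covers all blocks and the non-VSX case delegates to the shared ggml_vec_dot_*_generic kernels, with UNUSED(...) casts (essentially a void cast in ggml) silencing unused-variable warnings for locals only the vector branch reads. For reference, a minimal self-contained sketch of the scalar q4_0 × q8_0 dot product that this first removed fallback implemented — the struct layouts here are simplified stand-ins for ggml's block_q4_0/block_q8_0, with the fp16 block scales widened to plain float:

    #include <stdint.h>

    /* Sketch of the removed scalar fallback: each q4_0 block packs 32
     * signed 4-bit weights (stored with a +8 bias) and one scale. */
    #define QK4_0 32

    typedef struct {
        float   d;            /* block scale (ggml stores this as fp16) */
        uint8_t qs[QK4_0/2];  /* two 4-bit quants per byte */
    } blk_q4_0;

    typedef struct {
        float  d;             /* block scale */
        int8_t qs[QK4_0];     /* 8-bit quants */
    } blk_q8_0;

    float vec_dot_q4_0_q8_0_scalar(int nb, const blk_q4_0 * x, const blk_q8_0 * y) {
        float sumf = 0.0f;
        for (int ib = 0; ib < nb; ++ib) {
            int sumi = 0;
            for (int j = 0; j < QK4_0/2; ++j) {
                const int v0 = (x[ib].qs[j] & 0x0F) - 8;  /* low nibble  */
                const int v1 = (x[ib].qs[j] >> 4) - 8;    /* high nibble */
                sumi += v0 * y[ib].qs[j] + v1 * y[ib].qs[j + QK4_0/2];
            }
            sumf += sumi * x[ib].d * y[ib].d;  /* scale the integer dot */
        }
        return sumf;
    }
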
@@ -278,24 +268,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F);
-            const int v1 = (x[ib].qs[j] >> 4);
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -360,30 +340,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
     *s = sumf;
+#else
+    UNUSED(ib);
+    UNUSED(sumf);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -451,30 +415,15 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    UNUSED(x);
+    UNUSED(y);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -535,18 +484,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[ib].qs[j]*y[ib].qs[j];
-        }
-
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
-    }
-
     *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -695,45 +641,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
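The K-quant hunks that follow remove much larger scalar bodies, but each computes the same super-block decomposition that the corresponding *_generic kernel still performs: an integer dot product per sub-block scaled by a packed per-sub-block scale, minus a correction term built from the activations' precomputed block sums and the packed per-sub-block mins. A hedged sketch of that decomposition for q2_K (16 sub-blocks of 16 weights; names are illustrative, and the per-sub-block dots would come from the 2-bit quants exactly as in the removed loop above):

    #include <stdint.h>

    /* One q2_K super-block: scales[j] packs a 4-bit scale (low nibble)
     * and a 4-bit min (high nibble) for each of 16 sub-blocks. */
    float q2k_superblock_dot(float d, float dmin,
                             const int32_t sub_dot[16],  /* q2.q8 dot per sub-block */
                             const uint8_t scales[16],
                             const int16_t bsums[16]) {  /* q8 sums per sub-block */
        int isum = 0, summs = 0;
        for (int j = 0; j < 16; ++j) {
            isum  += (scales[j] & 0xF) * sub_dot[j];  /* scaled sub-dots */
            summs += (scales[j] >> 4) * bsums[j];     /* min correction  */
        }
        return d * isum - dmin * summs;
    }
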
@@ -907,70 +818,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1130,61 +984,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1342,66 +1149,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1556,47 +1311,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1737,34 +1455,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.125f * vec_extract(vsumf0, 0);
 
 #else
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
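The iq2/iq3 hunks in this stretch all delegate variants of one decode: eight weight magnitudes come from a row of a fixed codebook, and a packed sign index flips individual lanes. A self-contained sketch of that inner product, with the tables passed as parameters (in ggml they are the fixed iq2xxs_grid/iq2xs_grid/iq2s_grid, ksigns_iq2xs and kmask_iq2xs tables):

    #include <stdint.h>

    /* Dot product of one 8-weight group against q8 activations: grid
     * holds the magnitudes, signs selects which lanes are negated. */
    int iq2_group_dot(const uint8_t grid[8],   /* codebook row            */
                      uint8_t signs,           /* packed sign pattern     */
                      const uint8_t kmask[8],  /* per-lane sign-bit masks */
                      const int8_t q8[8]) {
        int sumi = 0;
        for (int j = 0; j < 8; ++j) {
            sumi += grid[j] * q8[j] * ((signs & kmask[j]) ? -1 : 1);
        }
        return sumi;  /* caller applies the (2*ls + 1) scale and 0.125f*d */
    }
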
@@ -1869,42 +1563,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = 0.125f * vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const uint8_t * GGML_RESTRICT sc = x[i].scales;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2030,47 +1692,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = 0.125f * vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = qs + QK_K/8;
-
-        int bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
-            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
-            int sumi1 = 0, sumi2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += ls1 * sumi1 + ls2 * sumi2;
-            qs += 4;
-            signs += 4;
-        }
-
-        sumf += d * bsum;
-    }
-
-    *s = 0.125f * sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
-
 }
 
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2172,36 +1798,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.25f * vec_extract(vsumf0, 0);
 
 #else
-
-    uint32_t aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
-            const uint32_t ls = 2*(aux32 >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
-                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
-                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            q3 += 8;
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.25f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2327,48 +1927,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT qs = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const uint8_t * GGML_RESTRICT signs = x[i].signs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
-            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls2;
-        }
-        sumf += d * bsum;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2481,36 +2043,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = vec_extract(vsumf0, 0);
 
 #else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        int sumi = 0, sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
-            const int delta = qh[ib] & 0x8000 ? -1 : 1;
-            int lsum = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
-                for (int j = 0; j < 8; ++j) {
-                    lsum += q8[j] * grid[j];
-                }
-                q8 += 8;
-            }
-            sumi += ls * lsum;
-            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
-            qs += 4;
-        }
-
-        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
-    }
-
-    *s = sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2581,17 +2117,15 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
 
     sumf = vec_extract(vsumf0, 0);
 
-#endif
-    for (; ib < nb; ++ib) {
-        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
-        int sumi1 = 0, sumi2 = 0;
-        for (int j = 0; j < QK4_NL/2; ++j) {
-            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
-        }
-        sumf += d * (sumi1 + sumi2);
-    }
     *s = sumf;
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
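
The iq4_nl hunk above (and the iq4_xs one below) differ from the plain 4-bit formats in that each nibble indexes a small non-linear codebook rather than being used as a value directly. A sketch of the per-block lookup dot product, with the codebook passed as a parameter (in ggml it is the fixed kvalues_iq4nl table):

    #include <stdint.h>

    #define QK4_NL 32

    /* Per-block iq4_nl dot: each 4-bit index selects a codebook entry. */
    int iq4_nl_block_dot(const uint8_t qs[QK4_NL/2],  /* packed 4-bit indices */
                         const int8_t q8[QK4_NL],     /* q8_0 activations     */
                         const int8_t kvalues[16]) {  /* non-linear codebook  */
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += q8[j] * kvalues[qs[j] & 0xF];
            sumi2 += q8[j + QK4_NL/2] * kvalues[qs[j] >> 4];
        }
        return sumi1 + sumi2;  /* caller multiplies by d(x)*d(y) */
    }
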
@@ -2696,37 +2230,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = vec_extract(vsumf0, 0);
 
 #else
-    float sumf = 0;
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
-        uint16_t h = x[ibl].scales_h;
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t * q8 = y[ibl].qs;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
-            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
-            h >>= 4;
-            const float d1 = d4d8*(ls1 - 32);
-            const float d2 = d4d8*(ls2 - 32);
-            int sumi1 = 0, sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d1 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-            sumi1 = sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d2 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-        }
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }