@fugood/llama.node 1.1.1 → 1.1.2

This diff shows the published contents of the two package versions as released to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
@@ -1236,44 +1236,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = sumf;

 #else
-    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
-
-    float sumf = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        int sum = 0;
-
-        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
-            for (size_t l = 0; l < 5; ++l) {
-                for (size_t m = 0; m < 32; ++m) {
-                    uint8_t q = x[i].qs[j + m] * pow3[l];
-                    uint16_t xi = ((uint16_t) q * 3) >> 8;
-                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
-                }
-            }
-        }
-        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
-            for (size_t l = 0; l < 5; ++l) {
-                for (size_t m = 0; m < 16; ++m) {
-                    uint8_t q = x[i].qs[j + m] * pow3[l];
-                    uint16_t xi = ((uint16_t) q * 3) >> 8;
-                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
-                }
-            }
-        }
-
-        for (size_t l = 0; l < 4; ++l) {
-            for (size_t j = 0; j < sizeof(x->qh); ++j) {
-                uint8_t q = x[i].qh[j] * pow3[l];
-                uint16_t xi = ((uint16_t) q * 3) >> 8;
-                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
-            }
-        }
-
-        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
-    }
-
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

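For context: the scalar TQ1_0 fallback deleted above decodes five ternary digits per byte with a multiply-and-shift trick. A minimal standalone sketch of the packing and unpacking, with made-up digits; the unpacking lines are taken from the removed loop, while the ceiling-based packing is my reconstruction of the quantizer's encoding:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const uint8_t pow3[5] = {1, 3, 9, 27, 81};

    // Pack five trits (0..2): form the base-3 value v in 0..242, then
    // scale it into fixed point as ceil(v * 256 / 243) so every digit
    // sits at a predictable bit position.
    const uint8_t trits[5] = {2, 0, 1, 2, 1};
    uint16_t v = 0;
    for (int l = 0; l < 5; ++l) v = v*3 + trits[l];
    const uint8_t q = (uint8_t)((v * 256 + 242) / 243);

    // Unpack: q * pow3[l] rotates trit l into the top of the byte
    // (the uint8_t wrap-around is intentional); multiplying by 3 and
    // keeping the high byte then reads it out, exactly as the removed
    // loop's ((uint16_t) q * 3) >> 8 did.
    for (int l = 0; l < 5; ++l) {
        const uint8_t ql = (uint8_t)(q * pow3[l]);
        const uint16_t xi = ((uint16_t) ql * 3) >> 8;
        printf("trit %d: %u (expected %u)\n", l, xi, trits[l]);
    }
    return 0;
}
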
@@ -1381,25 +1347,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = sumf;

 #else
-    float sumf = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        int32_t sumi = 0;
-
-        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
-            for (size_t l = 0; l < 4; ++l) {
-                for (size_t k = 0; k < 32; ++k) {
-                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
-                }
-            }
-        }
-
-        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-
-        sumf += (float) sumi * d;
-    }
-
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

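Every hunk in this release follows the same shape: the inline scalar fallback under #else is dropped in favor of a shared *_generic implementation, with UNUSED (ggml's void-cast macro) silencing unused-variable warnings for locals that only the SIMD path reads. The TQ2_0 path itself is the simplest of the lot: four ternary values per byte as plain 2-bit fields, recovered by masking and subtracting 1. A tiny sketch with an example byte:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t q = 0x24; // 2-bit fields, low to high: 0, 1, 2, 0 (example only)
    for (int l = 0; l < 4; ++l) {
        const int w = ((q >> (l*2)) & 3) - 1; // maps the stored 0,1,2 to -1,0,+1
        printf("field %d -> weight %d\n", l, w);
    }
    return 0;
}
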
@@ -1729,45 +1680,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sum;

 #else
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

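For reference, the deleted q2_K loop is the direct form of the usual K-quant factorization. With the low nibble of each scale byte scaling the quants and the high nibble scaling the block minimum, a weight dequantizes as

    w = x.d * (sc & 0xF) * q2  -  x.dmin * (sc >> 4)

so the dot product against a q8_K block splits into

    dot = (y.d * x.d) * isum  -  (y.d * x.dmin) * summs

where summs needs only the per-16 bsums that q8_K precomputes, never the individual q8 values. The generic helper evaluates the same expression.
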
@@ -2057,68 +1973,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sum;

 #else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif

 }
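
The q3_K fallback leans on an inverted high-bit trick: the third bit of each 3-bit weight lives in hmask, stored so that a set bit means "add nothing" and a clear bit means "subtract 4", i.e. w = low2 + 4*hbit - 4, a signed value in -4..3. A sketch with made-up bytes; note the real kernel walks the mask bit m across 32-value rows rather than packing four high bits per byte as done here:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t q3 = 0x6C; // 2-bit fields, low to high: 0, 3, 2, 1 (example only)
    const uint8_t hm = 0x05; // corresponding high bits: 1, 0, 1, 0 (example only)
    for (int l = 0; l < 4; ++l) {
        const int low2 = (q3 >> (2*l)) & 3;
        const int hbit = (hm >> l) & 1;
        // same effect as the removed loop's a[l] -= (hm[l] & m ? 0 : 4)
        printf("w%d = %d\n", l, low2 + 4*hbit - 4);
    }
    return 0;
}
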
@@ -2431,61 +2291,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

@@ -2578,66 +2391,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;

 #else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins = (const uint8_t*)&utmp[2];
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(utmp);
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

@@ -3093,47 +2854,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     }
     *s = sum;
 #else
-
-    int8_t aux8[QK_K];
-    int16_t aux16[8];
-    float sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

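q6_K's deleted loop rebuilds each 6-bit weight from a 4-bit low part (ql) and a 2-bit high part (qh), then recentres by 32. The core line, isolated with made-up inputs:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t ql = 0x7; // low 4 bits of the code (example only)
    const uint8_t qh = 0x2; // 2-bit high part, already shifted down (example only)
    // 6-bit code 0..63, recentred to -32..31 exactly as in the removed loop
    const int8_t w = (int8_t)((ql & 0xF) | ((qh & 3) << 4)) - 32;
    printf("w = %d\n", w); // (0x07 | 0x20) - 32 = 7
    return 0;
}
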
@@ -3229,34 +2953,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.25f * sumf;

 #else
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

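This hunk and the four that follow (iq2_xs, iq2_s, iq3_xxs, iq3_s) all apply signs the same way in their scalar paths: a per-8-value sign byte is tested bit by bit against kmask_iq2xs, which is just the powers of two. A sketch with a made-up sign byte and grid row; the real values come from ksigns_iq2xs and the iq2/iq3 grid tables:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const uint8_t kmask[8] = {1, 2, 4, 8, 16, 32, 64, 128};
    const uint8_t signs   = 0xA5;                     // example sign byte
    const int8_t  grid[8] = {1, 3, 1, 1, 3, 1, 3, 3}; // example grid row
    int sum = 0;
    for (int j = 0; j < 8; ++j) {
        sum += grid[j] * (signs & kmask[j] ? -1 : 1); // negate where the bit is set
    }
    printf("signed sum = %d\n", sum);
    return 0;
}
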
@@ -3327,42 +3027,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = 0.125f * sumf;

 #else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
-        const uint8_t * GGML_RESTRICT sc = x[i].scales;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

@@ -3455,45 +3123,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = 0.125f * sumf;

 #else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint8_t * signs = qs + QK_K/8;
-
-        int bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
-            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
-            int sumi1 = 0, sumi2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
-                for (int j = 0; j < 8; ++j) {
-                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += ls1 * sumi1 + ls2 * sumi2;
-            qs += 4;
-            signs += 4;
-        }
-
-        sumf += d * bsum;
-    }
-
-    *s = 0.125f * sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif

 }
@@ -3553,36 +3186,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     *s = 0.5f * sumf;

 #else
-
-    uint32_t aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
-            const uint32_t ls = 2*(aux32 >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
-                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
-                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            q3 += 8;
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.25f * sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

@@ -3689,48 +3296,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = sumf;

 #else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint8_t * GGML_RESTRICT qs = x[i].qs;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const uint8_t * GGML_RESTRICT signs = x[i].signs;
-        const int8_t * GGML_RESTRICT q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
-            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
-                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
-                for (int j = 0; j < 4; ++j) {
-                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
-                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            qs += 8;
-            signs += 4;
-            bsum += sumi * ls2;
-        }
-        sumf += d * bsum;
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

@@ -3793,36 +3362,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = sumf;

 #else
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint16_t * qh = x[i].qh;
-
-        int sumi = 0, sumi1 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
-            const int delta = qh[ib] & 0x8000 ? -1 : 1;
-            int lsum = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
-                for (int j = 0; j < 8; ++j) {
-                    lsum += q8[j] * grid[j];
-                }
-                q8 += 8;
-            }
-            sumi += ls * lsum;
-            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
-            qs += 4;
-        }
-
-        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
-    }
-
-    *s = sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

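The iq1_s fallback exploits the fact that the per-group offset is a constant +/-IQ1S_DELTA, so its share of the dot product collapses onto sums that q8_K has already computed:

    sum_l (g_l + delta*IQ1S_DELTA) * q8_l
        = sum_l g_l * q8_l  +  delta * IQ1S_DELTA * sum_l q8_l

and sum_l q8_l over a 32-value group is exactly bsums[2*ib+0] + bsums[2*ib+1], which is why the delta term in the removed loop never re-reads the q8 values.
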
@@ -3912,52 +3455,11 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     *s = sumf;

 #else
-
-    int sum1[2], sum2[2], delta[4];
-
-    float sumf = 0;
-    for (int i = 0; i < nb; i++) {
-
-        const int8_t * q8 = y[i].qs;
-        const uint8_t * qs = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-        const uint16_t * sc = (const uint16_t *)x[i].scales;
-
-        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-
-        int sumi1 = 0, sumi2 = 0;
-        for (int ib = 0; ib < QK_K/32; ++ib) {
-            delta[0] = qh[0] & 0x08 ? -1 : 1;
-            delta[1] = qh[0] & 0x80 ? -1 : 1;
-            delta[2] = qh[1] & 0x08 ? -1 : 1;
-            delta[3] = qh[1] & 0x80 ? -1 : 1;
-            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
-            for (int l = 0; l < 4; ++l) {
-                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
-                int lsum1 = 0, lsum2 = 0;
-                for (int j = 0; j < 8; ++j) {
-                    lsum1 += q8[j] * grid[j];
-                    lsum2 += q8[j];
-                }
-                q8 += 8;
-                sum1[l/2] += lsum1;
-                sum2[l/2] += lsum2*delta[l];
-            }
-
-            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
-            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
-
-            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
-            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
-            qs += 4;
-            qh += 2;
-        }
-
-        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
-    }
-
-    *s = sumf;
-
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    UNUSED(scale);
+    ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }

@@ -4078,37 +3580,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     *s = sumf;

 #else
-    float sumf = 0;
-    for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
-        uint16_t h = x[ibl].scales_h;
-        const uint8_t * qs = x[ibl].qs;
-        const int8_t * q8 = y[ibl].qs;
-        for (int ib = 0; ib < QK_K/32; ib += 2) {
-            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
-            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
-            h >>= 4;
-            const float d1 = d4d8*(ls1 - 32);
-            const float d2 = d4d8*(ls2 - 32);
-            int sumi1 = 0, sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d1 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-            sumi1 = sumi2 = 0;
-            for (int j = 0; j < 16; ++j) {
-                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
-                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
-            }
-            sumf += d2 * (sumi1 + sumi2);
-            qs += 16;
-            q8 += 32;
-        }
-    }
-    *s = sumf;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
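
Finally, iq4_xs is a nonlinear 4-bit scheme: each nibble indexes a fixed codebook instead of being scaled directly, which is the kvalues_iq4nl lookup in the removed loop. A sketch of the nibble decode; the table values are reproduced here from ggml's sources and worth double-checking against the shipped copy:

#include <stdint.h>
#include <stdio.h>

static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10,
       1,   13,  25,  38,  53,  69,  89, 113,
};

int main(void) {
    const uint8_t packed = 0x3C; // low nibble 0xC, high nibble 0x3 (example only)
    printf("low  -> %d\n", kvalues_iq4nl[packed & 0xf]); // 53
    printf("high -> %d\n", kvalues_iq4nl[packed >> 4]);  // -65
    return 0;
}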