@fugood/llama.node 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
|
@@ -1236,44 +1236,10 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1236
1236
|
*s = sumf;
|
|
1237
1237
|
|
|
1238
1238
|
#else
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
for (int i = 0; i < nb; ++i) {
|
|
1244
|
-
int sum = 0;
|
|
1245
|
-
|
|
1246
|
-
for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
|
|
1247
|
-
for (size_t l = 0; l < 5; ++l) {
|
|
1248
|
-
for (size_t m = 0; m < 32; ++m) {
|
|
1249
|
-
uint8_t q = x[i].qs[j + m] * pow3[l];
|
|
1250
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1251
|
-
sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
|
|
1252
|
-
}
|
|
1253
|
-
}
|
|
1254
|
-
}
|
|
1255
|
-
for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
|
|
1256
|
-
for (size_t l = 0; l < 5; ++l) {
|
|
1257
|
-
for (size_t m = 0; m < 16; ++m) {
|
|
1258
|
-
uint8_t q = x[i].qs[j + m] * pow3[l];
|
|
1259
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1260
|
-
sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
|
|
1261
|
-
}
|
|
1262
|
-
}
|
|
1263
|
-
}
|
|
1264
|
-
|
|
1265
|
-
for (size_t l = 0; l < 4; ++l) {
|
|
1266
|
-
for (size_t j = 0; j < sizeof(x->qh); ++j) {
|
|
1267
|
-
uint8_t q = x[i].qh[j] * pow3[l];
|
|
1268
|
-
uint16_t xi = ((uint16_t) q * 3) >> 8;
|
|
1269
|
-
sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
|
|
1270
|
-
}
|
|
1271
|
-
}
|
|
1272
|
-
|
|
1273
|
-
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
|
1274
|
-
}
|
|
1275
|
-
|
|
1276
|
-
*s = sumf;
|
|
1239
|
+
UNUSED(x);
|
|
1240
|
+
UNUSED(y);
|
|
1241
|
+
UNUSED(nb);
|
|
1242
|
+
ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1277
1243
|
#endif
|
|
1278
1244
|
}
|
|
1279
1245
|
|
|
@@ -1381,25 +1347,10 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
1381
1347
|
*s = sumf;
|
|
1382
1348
|
|
|
1383
1349
|
#else
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
for (size_t j = 0; j < sizeof(x->qs); j += 32) {
|
|
1390
|
-
for (size_t l = 0; l < 4; ++l) {
|
|
1391
|
-
for (size_t k = 0; k < 32; ++k) {
|
|
1392
|
-
sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
|
|
1393
|
-
}
|
|
1394
|
-
}
|
|
1395
|
-
}
|
|
1396
|
-
|
|
1397
|
-
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1398
|
-
|
|
1399
|
-
sumf += (float) sumi * d;
|
|
1400
|
-
}
|
|
1401
|
-
|
|
1402
|
-
*s = sumf;
|
|
1350
|
+
UNUSED(x);
|
|
1351
|
+
UNUSED(y);
|
|
1352
|
+
UNUSED(nb);
|
|
1353
|
+
ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1403
1354
|
#endif
|
|
1404
1355
|
}
|
|
1405
1356
|
|
|
@@ -1729,45 +1680,10 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
1729
1680
|
*s = sum;
|
|
1730
1681
|
|
|
1731
1682
|
#else
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
const uint8_t * q2 = x[i].qs;
|
|
1738
|
-
const int8_t * q8 = y[i].qs;
|
|
1739
|
-
const uint8_t * sc = x[i].scales;
|
|
1740
|
-
|
|
1741
|
-
int summs = 0;
|
|
1742
|
-
for (int j = 0; j < 16; ++j) {
|
|
1743
|
-
summs += y[i].bsums[j] * (sc[j] >> 4);
|
|
1744
|
-
}
|
|
1745
|
-
|
|
1746
|
-
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
|
1747
|
-
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
|
1748
|
-
|
|
1749
|
-
int isum = 0;
|
|
1750
|
-
int is = 0;
|
|
1751
|
-
int d;
|
|
1752
|
-
for (int k = 0; k < QK_K/128; ++k) {
|
|
1753
|
-
int shift = 0;
|
|
1754
|
-
for (int j = 0; j < 4; ++j) {
|
|
1755
|
-
d = sc[is++] & 0xF;
|
|
1756
|
-
int isuml = 0;
|
|
1757
|
-
for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
1758
|
-
isum += d * isuml;
|
|
1759
|
-
d = sc[is++] & 0xF;
|
|
1760
|
-
isuml = 0;
|
|
1761
|
-
for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
|
|
1762
|
-
isum += d * isuml;
|
|
1763
|
-
shift += 2;
|
|
1764
|
-
q8 += 32;
|
|
1765
|
-
}
|
|
1766
|
-
q2 += 32;
|
|
1767
|
-
}
|
|
1768
|
-
sumf += dall * isum - dmin * summs;
|
|
1769
|
-
}
|
|
1770
|
-
*s = sumf;
|
|
1683
|
+
UNUSED(x);
|
|
1684
|
+
UNUSED(y);
|
|
1685
|
+
UNUSED(nb);
|
|
1686
|
+
ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
1771
1687
|
#endif
|
|
1772
1688
|
}
|
|
1773
1689
|
|
|
@@ -2057,68 +1973,12 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2057
1973
|
*s = sum;
|
|
2058
1974
|
|
|
2059
1975
|
#else
|
|
2060
|
-
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
|
|
2065
|
-
|
|
2066
|
-
// write vectorized versions for AVX, ARM_NEON, etc.
|
|
2067
|
-
|
|
2068
|
-
int8_t aux8[QK_K];
|
|
2069
|
-
int16_t aux16[8];
|
|
2070
|
-
float sums [8];
|
|
2071
|
-
int32_t aux32[8];
|
|
2072
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2073
|
-
|
|
2074
|
-
uint32_t auxs[4];
|
|
2075
|
-
const int8_t * scales = (const int8_t*)auxs;
|
|
2076
|
-
|
|
2077
|
-
float sumf = 0;
|
|
2078
|
-
for (int i = 0; i < nb; ++i) {
|
|
2079
|
-
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
2080
|
-
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
|
2081
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2082
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2083
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
2084
|
-
uint8_t m = 1;
|
|
2085
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
2086
|
-
for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
|
|
2087
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
2088
|
-
a += 32; m <<= 1;
|
|
2089
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
|
|
2090
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
2091
|
-
a += 32; m <<= 1;
|
|
2092
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
|
|
2093
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
2094
|
-
a += 32; m <<= 1;
|
|
2095
|
-
for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
|
|
2096
|
-
for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
|
|
2097
|
-
a += 32; m <<= 1;
|
|
2098
|
-
q3 += 32;
|
|
2099
|
-
}
|
|
2100
|
-
a = aux8;
|
|
2101
|
-
|
|
2102
|
-
memcpy(auxs, x[i].scales, 12);
|
|
2103
|
-
uint32_t tmp = auxs[2];
|
|
2104
|
-
auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
|
|
2105
|
-
auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
|
|
2106
|
-
auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
|
|
2107
|
-
auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
|
|
2108
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
2109
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2110
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
2111
|
-
q8 += 8; a += 8;
|
|
2112
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2113
|
-
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
|
2114
|
-
q8 += 8; a += 8;
|
|
2115
|
-
}
|
|
2116
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2117
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2118
|
-
}
|
|
2119
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2120
|
-
*s = sumf;
|
|
2121
|
-
|
|
1976
|
+
UNUSED(kmask1);
|
|
1977
|
+
UNUSED(kmask2);
|
|
1978
|
+
UNUSED(x);
|
|
1979
|
+
UNUSED(y);
|
|
1980
|
+
UNUSED(nb);
|
|
1981
|
+
ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2122
1982
|
#endif
|
|
2123
1983
|
|
|
2124
1984
|
}
|
|
@@ -2431,61 +2291,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2431
2291
|
*s = sumf;
|
|
2432
2292
|
|
|
2433
2293
|
#else
|
|
2434
|
-
|
|
2435
|
-
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2443
|
-
|
|
2444
|
-
float sumf = 0;
|
|
2445
|
-
for (int i = 0; i < nb; ++i) {
|
|
2446
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
2447
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2448
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2449
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
2450
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
2451
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
2452
|
-
a += 32;
|
|
2453
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
2454
|
-
a += 32; q4 += 32;
|
|
2455
|
-
}
|
|
2456
|
-
memcpy(utmp, x[i].scales, 12);
|
|
2457
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
2458
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
2459
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
2460
|
-
utmp[2] = uaux;
|
|
2461
|
-
utmp[0] &= kmask1;
|
|
2462
|
-
|
|
2463
|
-
int sumi = 0;
|
|
2464
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
2465
|
-
a = aux8;
|
|
2466
|
-
int is = 0;
|
|
2467
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
2468
|
-
int32_t scale = scales[is++];
|
|
2469
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2470
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2471
|
-
q8 += 8; a += 8;
|
|
2472
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2473
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2474
|
-
q8 += 8; a += 8;
|
|
2475
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2476
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2477
|
-
q8 += 8; a += 8;
|
|
2478
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2479
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2480
|
-
q8 += 8; a += 8;
|
|
2481
|
-
}
|
|
2482
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2483
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2484
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2485
|
-
sumf -= dmin * sumi;
|
|
2486
|
-
}
|
|
2487
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2488
|
-
*s = sumf;
|
|
2294
|
+
UNUSED(x);
|
|
2295
|
+
UNUSED(y);
|
|
2296
|
+
UNUSED(nb);
|
|
2297
|
+
UNUSED(kmask1);
|
|
2298
|
+
UNUSED(kmask2);
|
|
2299
|
+
UNUSED(kmask3);
|
|
2300
|
+
UNUSED(utmp);
|
|
2301
|
+
ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2489
2302
|
#endif
|
|
2490
2303
|
}
|
|
2491
2304
|
|
|
@@ -2578,66 +2391,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
2578
2391
|
*s = sumf;
|
|
2579
2392
|
|
|
2580
2393
|
#else
|
|
2581
|
-
|
|
2582
|
-
|
|
2583
|
-
|
|
2584
|
-
|
|
2585
|
-
|
|
2586
|
-
|
|
2587
|
-
|
|
2588
|
-
|
|
2589
|
-
memset(sums, 0, 8*sizeof(float));
|
|
2590
|
-
|
|
2591
|
-
float sumf = 0;
|
|
2592
|
-
for (int i = 0; i < nb; ++i) {
|
|
2593
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
|
2594
|
-
const uint8_t * GGML_RESTRICT hm = x[i].qh;
|
|
2595
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
2596
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
2597
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
2598
|
-
uint8_t m = 1;
|
|
2599
|
-
for (int j = 0; j < QK_K/64; ++j) {
|
|
2600
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
|
|
2601
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
2602
|
-
a += 32; m <<= 1;
|
|
2603
|
-
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
|
|
2604
|
-
for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
|
|
2605
|
-
a += 32; m <<= 1;
|
|
2606
|
-
q4 += 32;
|
|
2607
|
-
}
|
|
2608
|
-
memcpy(utmp, x[i].scales, 12);
|
|
2609
|
-
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
|
2610
|
-
const uint32_t uaux = utmp[1] & kmask1;
|
|
2611
|
-
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
|
2612
|
-
utmp[2] = uaux;
|
|
2613
|
-
utmp[0] &= kmask1;
|
|
2614
|
-
|
|
2615
|
-
int sumi = 0;
|
|
2616
|
-
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
|
|
2617
|
-
a = aux8;
|
|
2618
|
-
int is = 0;
|
|
2619
|
-
for (int j = 0; j < QK_K/32; ++j) {
|
|
2620
|
-
int32_t scale = scales[is++];
|
|
2621
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2622
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2623
|
-
q8 += 8; a += 8;
|
|
2624
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2625
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2626
|
-
q8 += 8; a += 8;
|
|
2627
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2628
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2629
|
-
q8 += 8; a += 8;
|
|
2630
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
2631
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
2632
|
-
q8 += 8; a += 8;
|
|
2633
|
-
}
|
|
2634
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
2635
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
2636
|
-
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
|
2637
|
-
sumf -= dmin * sumi;
|
|
2638
|
-
}
|
|
2639
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
2640
|
-
*s = sumf;
|
|
2394
|
+
UNUSED(x);
|
|
2395
|
+
UNUSED(y);
|
|
2396
|
+
UNUSED(nb);
|
|
2397
|
+
UNUSED(kmask1);
|
|
2398
|
+
UNUSED(kmask2);
|
|
2399
|
+
UNUSED(kmask3);
|
|
2400
|
+
UNUSED(utmp);
|
|
2401
|
+
ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
2641
2402
|
#endif
|
|
2642
2403
|
}
|
|
2643
2404
|
|
|
@@ -3093,47 +2854,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
|
3093
2854
|
}
|
|
3094
2855
|
*s = sum;
|
|
3095
2856
|
#else
|
|
3096
|
-
|
|
3097
|
-
|
|
3098
|
-
|
|
3099
|
-
|
|
3100
|
-
int32_t aux32[8];
|
|
3101
|
-
memset(sums, 0, 8*sizeof(float));
|
|
3102
|
-
|
|
3103
|
-
float sumf = 0;
|
|
3104
|
-
for (int i = 0; i < nb; ++i) {
|
|
3105
|
-
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
|
3106
|
-
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3107
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
3108
|
-
memset(aux32, 0, 8*sizeof(int32_t));
|
|
3109
|
-
int8_t * GGML_RESTRICT a = aux8;
|
|
3110
|
-
for (int j = 0; j < QK_K; j += 128) {
|
|
3111
|
-
for (int l = 0; l < 32; ++l) {
|
|
3112
|
-
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
|
|
3113
|
-
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
|
|
3114
|
-
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
|
|
3115
|
-
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
|
|
3116
|
-
}
|
|
3117
|
-
a += 128;
|
|
3118
|
-
q4 += 64;
|
|
3119
|
-
qh += 32;
|
|
3120
|
-
}
|
|
3121
|
-
a = aux8;
|
|
3122
|
-
int is = 0;
|
|
3123
|
-
for (int j = 0; j < QK_K/16; ++j) {
|
|
3124
|
-
int scale = x[i].scales[is++];
|
|
3125
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
3126
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
3127
|
-
q8 += 8; a += 8;
|
|
3128
|
-
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
|
|
3129
|
-
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
|
3130
|
-
q8 += 8; a += 8;
|
|
3131
|
-
}
|
|
3132
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3133
|
-
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
|
3134
|
-
}
|
|
3135
|
-
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
|
3136
|
-
*s = sumf;
|
|
2857
|
+
UNUSED(x);
|
|
2858
|
+
UNUSED(y);
|
|
2859
|
+
UNUSED(nb);
|
|
2860
|
+
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3137
2861
|
#endif
|
|
3138
2862
|
}
|
|
3139
2863
|
|
|
@@ -3229,34 +2953,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3229
2953
|
*s = 0.25f * sumf;
|
|
3230
2954
|
|
|
3231
2955
|
#else
|
|
3232
|
-
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
float sumf = 0.f;
|
|
3237
|
-
for (int i = 0; i < nb; ++i) {
|
|
3238
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3239
|
-
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
3240
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
3241
|
-
int32_t bsum = 0;
|
|
3242
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3243
|
-
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
|
3244
|
-
q2 += 4;
|
|
3245
|
-
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
|
3246
|
-
int32_t sumi = 0;
|
|
3247
|
-
for (int l = 0; l < 4; ++l) {
|
|
3248
|
-
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
|
3249
|
-
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
|
3250
|
-
for (int j = 0; j < 8; ++j) {
|
|
3251
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
3252
|
-
}
|
|
3253
|
-
q8 += 8;
|
|
3254
|
-
}
|
|
3255
|
-
bsum += sumi * ls;
|
|
3256
|
-
}
|
|
3257
|
-
sumf += d * bsum;
|
|
3258
|
-
}
|
|
3259
|
-
*s = 0.125f * sumf;
|
|
2956
|
+
UNUSED(x);
|
|
2957
|
+
UNUSED(y);
|
|
2958
|
+
UNUSED(nb);
|
|
2959
|
+
ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3260
2960
|
#endif
|
|
3261
2961
|
}
|
|
3262
2962
|
|
|
@@ -3327,42 +3027,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
3327
3027
|
*s = 0.125f * sumf;
|
|
3328
3028
|
|
|
3329
3029
|
#else
|
|
3330
|
-
|
|
3331
|
-
|
|
3332
|
-
|
|
3333
|
-
|
|
3334
|
-
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
|
3335
|
-
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
|
3336
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
3337
|
-
int32_t bsum = 0;
|
|
3338
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3339
|
-
const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
|
|
3340
|
-
const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
|
|
3341
|
-
int32_t sumi = 0;
|
|
3342
|
-
for (int l = 0; l < 2; ++l) {
|
|
3343
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
3344
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
3345
|
-
for (int j = 0; j < 8; ++j) {
|
|
3346
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
3347
|
-
}
|
|
3348
|
-
q8 += 8;
|
|
3349
|
-
}
|
|
3350
|
-
bsum += sumi * ls1;
|
|
3351
|
-
sumi = 0;
|
|
3352
|
-
for (int l = 2; l < 4; ++l) {
|
|
3353
|
-
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
|
3354
|
-
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
|
3355
|
-
for (int j = 0; j < 8; ++j) {
|
|
3356
|
-
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
|
3357
|
-
}
|
|
3358
|
-
q8 += 8;
|
|
3359
|
-
}
|
|
3360
|
-
bsum += sumi * ls2;
|
|
3361
|
-
q2 += 4;
|
|
3362
|
-
}
|
|
3363
|
-
sumf += d * bsum;
|
|
3364
|
-
}
|
|
3365
|
-
*s = 0.125f * sumf;
|
|
3030
|
+
UNUSED(x);
|
|
3031
|
+
UNUSED(y);
|
|
3032
|
+
UNUSED(nb);
|
|
3033
|
+
ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3366
3034
|
#endif
|
|
3367
3035
|
}
|
|
3368
3036
|
|
|
@@ -3455,45 +3123,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3455
3123
|
*s = 0.125f * sumf;
|
|
3456
3124
|
|
|
3457
3125
|
#else
|
|
3458
|
-
|
|
3459
|
-
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3463
|
-
const int8_t * q8 = y[i].qs;
|
|
3464
|
-
const uint8_t * qs = x[i].qs;
|
|
3465
|
-
const uint8_t * qh = x[i].qh;
|
|
3466
|
-
const uint8_t * signs = qs + QK_K/8;
|
|
3467
|
-
|
|
3468
|
-
int bsum = 0;
|
|
3469
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3470
|
-
int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
|
|
3471
|
-
int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
|
|
3472
|
-
int sumi1 = 0, sumi2 = 0;
|
|
3473
|
-
for (int l = 0; l < 2; ++l) {
|
|
3474
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
3475
|
-
for (int j = 0; j < 8; ++j) {
|
|
3476
|
-
sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
3477
|
-
}
|
|
3478
|
-
q8 += 8;
|
|
3479
|
-
}
|
|
3480
|
-
for (int l = 2; l < 4; ++l) {
|
|
3481
|
-
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
|
|
3482
|
-
for (int j = 0; j < 8; ++j) {
|
|
3483
|
-
sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
|
|
3484
|
-
}
|
|
3485
|
-
q8 += 8;
|
|
3486
|
-
}
|
|
3487
|
-
bsum += ls1 * sumi1 + ls2 * sumi2;
|
|
3488
|
-
qs += 4;
|
|
3489
|
-
signs += 4;
|
|
3490
|
-
}
|
|
3491
|
-
|
|
3492
|
-
sumf += d * bsum;
|
|
3493
|
-
}
|
|
3494
|
-
|
|
3495
|
-
*s = 0.125f * sumf;
|
|
3496
|
-
|
|
3126
|
+
UNUSED(x);
|
|
3127
|
+
UNUSED(y);
|
|
3128
|
+
UNUSED(nb);
|
|
3129
|
+
ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3497
3130
|
#endif
|
|
3498
3131
|
|
|
3499
3132
|
}
|
|
@@ -3553,36 +3186,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
|
|
3553
3186
|
*s = 0.5f * sumf;
|
|
3554
3187
|
|
|
3555
3188
|
#else
|
|
3556
|
-
|
|
3557
|
-
|
|
3558
|
-
|
|
3559
|
-
|
|
3560
|
-
for (int i = 0; i < nb; ++i) {
|
|
3561
|
-
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
|
3562
|
-
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
|
3563
|
-
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
|
3564
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
3565
|
-
int32_t bsum = 0;
|
|
3566
|
-
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
|
3567
|
-
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
|
3568
|
-
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
|
3569
|
-
int32_t sumi = 0;
|
|
3570
|
-
for (int l = 0; l < 4; ++l) {
|
|
3571
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
|
3572
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
|
3573
|
-
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
|
3574
|
-
for (int j = 0; j < 4; ++j) {
|
|
3575
|
-
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3576
|
-
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3577
|
-
}
|
|
3578
|
-
q8 += 8;
|
|
3579
|
-
}
|
|
3580
|
-
q3 += 8;
|
|
3581
|
-
bsum += sumi * ls;
|
|
3582
|
-
}
|
|
3583
|
-
sumf += d * bsum;
|
|
3584
|
-
}
|
|
3585
|
-
*s = 0.25f * sumf;
|
|
3189
|
+
UNUSED(x);
|
|
3190
|
+
UNUSED(y);
|
|
3191
|
+
UNUSED(nb);
|
|
3192
|
+
ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3586
3193
|
#endif
|
|
3587
3194
|
}
|
|
3588
3195
|
|
|
@@ -3689,48 +3296,10 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3689
3296
|
*s = sumf;
|
|
3690
3297
|
|
|
3691
3298
|
#else
|
|
3692
|
-
|
|
3693
|
-
|
|
3694
|
-
|
|
3695
|
-
|
|
3696
|
-
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
|
3697
|
-
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
3698
|
-
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
|
3699
|
-
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
|
3700
|
-
int32_t bsum = 0;
|
|
3701
|
-
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
|
3702
|
-
const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
|
|
3703
|
-
const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
|
|
3704
|
-
int32_t sumi = 0;
|
|
3705
|
-
for (int l = 0; l < 4; ++l) {
|
|
3706
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
|
|
3707
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
|
|
3708
|
-
for (int j = 0; j < 4; ++j) {
|
|
3709
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3710
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3711
|
-
}
|
|
3712
|
-
q8 += 8;
|
|
3713
|
-
}
|
|
3714
|
-
qs += 8;
|
|
3715
|
-
signs += 4;
|
|
3716
|
-
bsum += sumi * ls1;
|
|
3717
|
-
sumi = 0;
|
|
3718
|
-
for (int l = 0; l < 4; ++l) {
|
|
3719
|
-
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
|
|
3720
|
-
const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
|
|
3721
|
-
for (int j = 0; j < 4; ++j) {
|
|
3722
|
-
sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
|
|
3723
|
-
sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
|
|
3724
|
-
}
|
|
3725
|
-
q8 += 8;
|
|
3726
|
-
}
|
|
3727
|
-
qs += 8;
|
|
3728
|
-
signs += 4;
|
|
3729
|
-
bsum += sumi * ls2;
|
|
3730
|
-
}
|
|
3731
|
-
sumf += d * bsum;
|
|
3732
|
-
}
|
|
3733
|
-
*s = sumf;
|
|
3299
|
+
UNUSED(x);
|
|
3300
|
+
UNUSED(y);
|
|
3301
|
+
UNUSED(nb);
|
|
3302
|
+
ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3734
3303
|
#endif
|
|
3735
3304
|
}
|
|
3736
3305
|
|
|
@@ -3793,36 +3362,10 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3793
3362
|
*s = sumf;
|
|
3794
3363
|
|
|
3795
3364
|
#else
|
|
3796
|
-
|
|
3797
|
-
|
|
3798
|
-
|
|
3799
|
-
|
|
3800
|
-
const int8_t * q8 = y[i].qs;
|
|
3801
|
-
const uint8_t * qs = x[i].qs;
|
|
3802
|
-
const uint16_t * qh = x[i].qh;
|
|
3803
|
-
|
|
3804
|
-
int sumi = 0, sumi1 = 0;
|
|
3805
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
3806
|
-
const int ls = 2*((qh[ib] >> 12) & 7) + 1;
|
|
3807
|
-
const int delta = qh[ib] & 0x8000 ? -1 : 1;
|
|
3808
|
-
int lsum = 0;
|
|
3809
|
-
for (int l = 0; l < 4; ++l) {
|
|
3810
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
|
|
3811
|
-
for (int j = 0; j < 8; ++j) {
|
|
3812
|
-
lsum += q8[j] * grid[j];
|
|
3813
|
-
}
|
|
3814
|
-
q8 += 8;
|
|
3815
|
-
}
|
|
3816
|
-
sumi += ls * lsum;
|
|
3817
|
-
sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
|
|
3818
|
-
qs += 4;
|
|
3819
|
-
}
|
|
3820
|
-
|
|
3821
|
-
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
|
3822
|
-
}
|
|
3823
|
-
|
|
3824
|
-
*s = sumf;
|
|
3825
|
-
|
|
3365
|
+
UNUSED(x);
|
|
3366
|
+
UNUSED(y);
|
|
3367
|
+
UNUSED(nb);
|
|
3368
|
+
ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3826
3369
|
#endif
|
|
3827
3370
|
}
|
|
3828
3371
|
|
|
@@ -3912,52 +3455,11 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
|
|
3912
3455
|
*s = sumf;
|
|
3913
3456
|
|
|
3914
3457
|
#else
|
|
3915
|
-
|
|
3916
|
-
|
|
3917
|
-
|
|
3918
|
-
|
|
3919
|
-
|
|
3920
|
-
|
|
3921
|
-
const int8_t * q8 = y[i].qs;
|
|
3922
|
-
const uint8_t * qs = x[i].qs;
|
|
3923
|
-
const uint8_t * qh = x[i].qh;
|
|
3924
|
-
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
|
3925
|
-
|
|
3926
|
-
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
|
3927
|
-
|
|
3928
|
-
int sumi1 = 0, sumi2 = 0;
|
|
3929
|
-
for (int ib = 0; ib < QK_K/32; ++ib) {
|
|
3930
|
-
delta[0] = qh[0] & 0x08 ? -1 : 1;
|
|
3931
|
-
delta[1] = qh[0] & 0x80 ? -1 : 1;
|
|
3932
|
-
delta[2] = qh[1] & 0x08 ? -1 : 1;
|
|
3933
|
-
delta[3] = qh[1] & 0x80 ? -1 : 1;
|
|
3934
|
-
sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
|
|
3935
|
-
for (int l = 0; l < 4; ++l) {
|
|
3936
|
-
const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
|
|
3937
|
-
int lsum1 = 0, lsum2 = 0;
|
|
3938
|
-
for (int j = 0; j < 8; ++j) {
|
|
3939
|
-
lsum1 += q8[j] * grid[j];
|
|
3940
|
-
lsum2 += q8[j];
|
|
3941
|
-
}
|
|
3942
|
-
q8 += 8;
|
|
3943
|
-
sum1[l/2] += lsum1;
|
|
3944
|
-
sum2[l/2] += lsum2*delta[l];
|
|
3945
|
-
}
|
|
3946
|
-
|
|
3947
|
-
const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
|
|
3948
|
-
const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;
|
|
3949
|
-
|
|
3950
|
-
sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
|
|
3951
|
-
sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
|
|
3952
|
-
qs += 4;
|
|
3953
|
-
qh += 2;
|
|
3954
|
-
}
|
|
3955
|
-
|
|
3956
|
-
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
|
3957
|
-
}
|
|
3958
|
-
|
|
3959
|
-
*s = sumf;
|
|
3960
|
-
|
|
3458
|
+
UNUSED(x);
|
|
3459
|
+
UNUSED(y);
|
|
3460
|
+
UNUSED(nb);
|
|
3461
|
+
UNUSED(scale);
|
|
3462
|
+
ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
3961
3463
|
#endif
|
|
3962
3464
|
}
|
|
3963
3465
|
|
|
@@ -4078,37 +3580,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
|
|
4078
3580
|
*s = sumf;
|
|
4079
3581
|
|
|
4080
3582
|
#else
|
|
4081
|
-
|
|
4082
|
-
|
|
4083
|
-
|
|
4084
|
-
|
|
4085
|
-
const uint8_t * qs = x[ibl].qs;
|
|
4086
|
-
const int8_t * q8 = y[ibl].qs;
|
|
4087
|
-
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
|
4088
|
-
const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
|
|
4089
|
-
const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
|
|
4090
|
-
h >>= 4;
|
|
4091
|
-
const float d1 = d4d8*(ls1 - 32);
|
|
4092
|
-
const float d2 = d4d8*(ls2 - 32);
|
|
4093
|
-
int sumi1 = 0, sumi2 = 0;
|
|
4094
|
-
for (int j = 0; j < 16; ++j) {
|
|
4095
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
4096
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
4097
|
-
}
|
|
4098
|
-
sumf += d1 * (sumi1 + sumi2);
|
|
4099
|
-
qs += 16;
|
|
4100
|
-
q8 += 32;
|
|
4101
|
-
sumi1 = sumi2 = 0;
|
|
4102
|
-
for (int j = 0; j < 16; ++j) {
|
|
4103
|
-
sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
|
|
4104
|
-
sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
|
|
4105
|
-
}
|
|
4106
|
-
sumf += d2 * (sumi1 + sumi2);
|
|
4107
|
-
qs += 16;
|
|
4108
|
-
q8 += 32;
|
|
4109
|
-
}
|
|
4110
|
-
}
|
|
4111
|
-
*s = sumf;
|
|
3583
|
+
UNUSED(x);
|
|
3584
|
+
UNUSED(y);
|
|
3585
|
+
UNUSED(nb);
|
|
3586
|
+
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
|
|
4112
3587
|
#endif
|
|
4113
3588
|
}
|
|
4114
3589
|
|