@fugood/llama.node 1.1.1 → 1.1.3

This diff shows the changes between publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
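Every hunk below applies the same refactor: the hand-written RISC-V Vector (RVV) body of each kernel is wrapped in #if defined(__riscv_v), the duplicated inline scalar fallback loop is deleted, and non-RVV builds now delegate to a shared *_generic implementation (e.g. ggml_vec_dot_q4_0_q8_0_generic). A minimal sketch of that dispatch pattern, with illustrative names (vec_dot_i8 and its helper are not the actual ggml symbols):

#include <stdint.h>

/* Shared scalar reference path, analogous to the *_generic kernels the
 * diff delegates to. */
static void vec_dot_i8_generic(int n, float * s, const int8_t * x, const int8_t * y) {
    int sum = 0;
    for (int i = 0; i < n; ++i) {
        sum += x[i] * y[i];
    }
    *s = (float) sum;
}

void vec_dot_i8(int n, float * s, const int8_t * x, const int8_t * y) {
#if defined(__riscv_v)
    /* The hand-vectorized RVV loop compiles only when the toolchain
     * predefines __riscv_v; a scalar stand-in keeps this sketch buildable. */
    int sum = 0;
    for (int i = 0; i < n; ++i) {
        sum += x[i] * y[i];
    }
    *s = (float) sum;
#else
    /* Every other target reuses the single shared scalar implementation
     * instead of carrying its own copy of the loop. */
    vec_dot_i8_generic(n, s, x, y);
#endif
}

The net effect is one scalar reference loop per kernel, maintained in one place, rather than a copy embedded in each architecture's source file.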
@@ -116,6 +116,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 //===================================== Dot products =================================
 
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -132,7 +133,6 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     int ib = 0;
     float sumf = 0;
 
-#if defined(__riscv_v)
     size_t vl = qk / 2;
 
     for (; ib < nb; ++ib) {
@@ -164,27 +164,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
-            const int v1 = (x[ib].qs[j] >> 4) - 8;
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
-    }
-
     *s = sumf;
+#else
+    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -201,7 +188,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     int ib = 0;
     float sumf = 0;
 
-#if defined(__riscv_v)
    size_t vl = qk / 2;
 
    for (; ib < nb; ++ib) {
@@ -229,27 +215,14 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[ib].qs[j] & 0x0F);
-            const int v1 = (x[ib].qs[j] >> 4);
-
-            sumi0 += (v0 * y[ib].qs[j]);
-            sumi1 += (v1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -267,7 +240,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const block_q5_0 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-#if defined(__riscv_v)
     size_t vl;
     size_t vlenb = __riscv_vlenb();
 
@@ -297,33 +269,14 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
-            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
-    }
-
     *s = sumf;
+#else
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined(__riscv_v)
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -341,7 +294,6 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const block_q5_1 * GGML_RESTRICT x = vx;
     const block_q8_1 * GGML_RESTRICT y = vy;
 
-#if defined(__riscv_v)
     size_t vl;
     size_t vlenb = __riscv_vlenb();
 
@@ -370,30 +322,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        uint32_t qh;
-        memcpy(&qh, x[ib].qh, sizeof(qh));
-
-        int sumi0 = 0;
-        int sumi1 = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
-
-            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
-
-            sumi0 += (x0 * y[ib].qs[j]);
-            sumi1 += (x1 * y[ib].qs[j + qk/2]);
-        }
-
-        int sumi = sumi0 + sumi1;
-        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
-    }
-
     *s = sumf;
+#else
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -431,18 +363,17 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
     }
 
-#endif
-    for (; ib < nb; ++ib) {
-        int sumi = 0;
+    *s = sumf;
+#else
 
-        for (int j = 0; j < qk; j++) {
-            sumi += x[ib].qs[j]*y[ib].qs[j];
-        }
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
 
-        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
-    }
-
-    *s = sumf;
+    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
 }
 
 void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
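The UNUSED(...) lines in the new #else branches exist because locals such as nb, x, y, ib and sumf are declared before the #if split but are only read by the RVV path; referencing them keeps -Wunused-variable quiet in non-RVV builds. ggml spells this through a macro; a minimal equivalent (the exact upstream definition is an assumption here) is:

#define UNUSED(x) (void)(x)   /* cast to void: marks x as deliberately unused */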
@@ -738,44 +669,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
 #else
 
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
 
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1147,68 +1045,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 
 #else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
 
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
 
+    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 
 }
@@ -1534,60 +1378,15 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
 #else
 
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(nb);
+    UNUSED(utmp);
 
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -1698,65 +1497,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
 #else
 
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
-        const uint8_t * GGML_RESTRICT hm = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(kmask1);
+    UNUSED(kmask2);
+    UNUSED(kmask3);
+    UNUSED(nb);
+    UNUSED(utmp);
 
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -2024,46 +1773,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
 #else
 
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
 
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
-        const uint8_t * GGML_RESTRICT qh = x[i].qh;
-        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * GGML_RESTRICT a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a  += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
+    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 #endif
 }
 
@@ -112,31 +112,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }
 
 #endif
-    {
-        float sumf[8];
-        int sumi;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                    }
-                }
-            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-        }
-    }
+    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -361,37 +337,6 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
         return;
     }
 
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-    float sumf[4][8];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
+#endif
+    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
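For reference, the simplest of the removed scalar fallbacks is the Q8_0 x Q8_0 loop deleted above; the delegated ggml_vec_dot_q8_0_q8_0_generic is expected to compute the same result. A self-contained sketch reconstructed from the deleted code, with the block's fp16 scale simplified to a float (upstream converts via GGML_CPU_FP16_TO_FP32):

#include <stdint.h>

#define QK8_0 32

typedef struct {
    float  d;            /* per-block scale (fp16 in ggml, float here) */
    int8_t qs[QK8_0];    /* 32 signed 8-bit quants */
} block_q8_0_sketch;

static float vec_dot_q8_0_scalar(int nb, const block_q8_0_sketch * x, const block_q8_0_sketch * y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];   /* integer dot within one block */
        }
        sumf += sumi * (x[ib].d * y[ib].d);      /* apply both blocks' scales */
    }
    return sumf;
}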