@fugood/llama.node 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
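
Every hunk below makes the same change to the bundled ggml AArch64 repack kernels: the hand-written scalar fallback that each function compiled when no NEON/SVE path applied is removed in favor of a call to a shared *_generic implementation, so the reference scalar code lives in one place instead of being duplicated per kernel.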
@@ -86,35 +86,9 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
         }
     }
 #else
-    // scalar
-    const int blck_size_interleave = 4;
-    float srcv[4][QK8_0];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
-                amax = MAX(amax, fabsf(srcv[row_iter][j]));
-            }
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < QK8_0 * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-
-            float x0 = srcv[src_id][src_offset] * id[src_id];
-            y[i].qs[j] = roundf(x0);
-        }
-    }
+    UNUSED(nb);
+    UNUSED(y);
+    ggml_quantize_mat_q8_0_4x4_generic(x, vy, k);
 #endif
 }
 
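The scalar path is not deleted outright: upstream moved it behind a shared generic entry point, and this hunk merely rewires the caller. A minimal sketch of what ggml_quantize_mat_q8_0_4x4_generic presumably contains, reconstructed from the removed lines and assuming ggml's internal types and macros (block_q8_0x4, QK8_0, MAX, GGML_CPU_FP32_TO_FP16) are in scope; the signature and the recomputation of nb and y inside the callee are inferred from the call site and the UNUSED(nb)/UNUSED(y) markers above:

    // Sketch only: assumes the deleted scalar body moved verbatim into a
    // shared generic translation unit; the signature is inferred from the
    // call site above.
    void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
        const int nb = k / QK8_0;                     // q8_0 blocks per row (assumed recomputed here)
        block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

        const int blck_size_interleave = 4;
        float srcv[4][QK8_0];
        float id[4];

        for (int i = 0; i < nb; i++) {
            // per-row scale: the block's absolute max maps to 127
            for (int row_iter = 0; row_iter < 4; row_iter++) {
                float amax = 0.0f;
                for (int j = 0; j < QK8_0; j++) {
                    srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
                    amax = MAX(amax, fabsf(srcv[row_iter][j]));
                }
                const float d = amax / ((1 << 7) - 1);
                id[row_iter] = d ? 1.0f / d : 0.0f;   // inverse scale, zero-safe
                y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
            }
            // interleave the four rows in groups of blck_size_interleave bytes
            for (int j = 0; j < QK8_0 * 4; j++) {
                int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
                int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
                src_offset += (j % blck_size_interleave);
                y[i].qs[j] = roundf(srcv[src_id][src_offset] * id[src_id]);
            }
        }
    }
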
@@ -205,35 +179,9 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
     }
 
 #else
-    // scalar
-    const int blck_size_interleave = 8;
-    float srcv[4][QK8_0];
-    float id[4];
-
-    for (int i = 0; i < nb; i++) {
-        for (int row_iter = 0; row_iter < 4; row_iter++) {
-            float amax = 0.0f; // absolute max
-
-            for (int j = 0; j < QK8_0; j++) {
-                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
-                amax = MAX(amax, fabsf(srcv[row_iter][j]));
-            }
-
-            const float d = amax / ((1 << 7) - 1);
-            id[row_iter] = d ? 1.0f / d : 0.0f;
-
-            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
-        }
-
-        for (int j = 0; j < QK8_0 * 4; j++) {
-            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
-            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
-            src_offset += (j % blck_size_interleave);
-
-            float x0 = srcv[src_id][src_offset] * id[src_id];
-            y[i].qs[j] = roundf(x0);
-        }
-    }
+    UNUSED(nb);
+    UNUSED(y);
+    ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
 #endif
 }
 
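The 4x8 quantizer above is the same algorithm with blck_size_interleave = 8, which only changes how the four rows are woven together. Working the removed index arithmetic: src_id = (j % 32) / 8 and src_offset = (j / 32) * 8 + (j % 8), so destination bytes j = 0..7 come from row 0 (offsets 0..7), j = 8..15 from row 1, j = 24..31 from row 3, and j = 32..39 wrap back to row 0 at offsets 8..15.
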
@@ -295,29 +243,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
-    float sumf[4];
-    int sumi;
-
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-    }
+    ggml_gemv_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
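
Each GEMV fallback collapses the same way. For reference, a sketch of the presumed ggml_gemv_q4_0_4x4_q8_0_generic, reconstructed from the deleted loop; the signature matches the call site, while the local constants (qk, nb, ncols_interleaved, blocklen) are assumptions mirroring what the original function computed. The key decode trick: each weight byte packs two 4-bit quants, (int8_t)(q << 4) sign-extends the low nibble (scaled by 16), q & 0xF0 isolates the high nibble likewise, and the >> 4 after the multiply-accumulate removes that factor of 16.

    // Sketch only: reconstructed from the deleted scalar loop; assumes
    // ggml's internal types (block_q8_0, block_q4_0x4) are in scope.
    void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
                                         const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy,
                                         int nr, int nc) {
        const int qk = QK8_0;
        const int nb = n / qk;            // blocks along the reduction dimension
        const int ncols_interleaved = 4;  // columns packed per block_q4_0x4
        const int blocklen = 4;           // bytes taken from one column at a time

        float sumf[4];
        int sumi;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);

            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0f;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            // low and high nibble of the packed byte, each scaled by 16
                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
        UNUSED(bs); UNUSED(nr); // single-row path; assumed unused as in the original
    }
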
@@ -383,29 +309,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
-    float sumf[4];
-    int sumi;
-
-    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-    for (int x = 0; x < nc / ncols_interleaved; x++) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-
-        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-        for (int l = 0; l < nb; l++) {
-            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                for (int j = 0; j < ncols_interleaved; j++) {
-                    sumi = 0;
-                    for (int i = 0; i < blocklen; ++i) {
-                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                    }
-                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                }
-            }
-        }
-        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-    }
+    ggml_gemv_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -497,31 +401,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif // #if defined(__ARM_FEATURE_SVE)
 
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-    {
-        float sumf[8];
-        int sumi;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                    }
-                }
-            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-        }
-    }
+    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -591,31 +471,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
     }
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4];
-        int sumi;
-
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
-
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
-                    }
-                }
-            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
-        }
-    }
+    ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
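
The iq4_nl kernels differ from their q4_0 counterparts only in how a packed byte is decoded: each nibble indexes the non-linear codebook kvalues_iq4nl directly, so there is no sign-extension trick and no >> 4 rescale on the accumulator. An illustrative helper (not present in the codebase) capturing that decode step:

    #include <stdint.h>

    // Illustrative only: decode one packed iq4_nl byte into two codebook
    // values, as in the deleted scalar loop; 'kvalues' stands in for the
    // library's kvalues_iq4nl table.
    static inline void iq4_nl_decode_byte(uint8_t b, const int8_t kvalues[16], int * v0, int * v1) {
        *v0 = kvalues[b & 0x0F]; // low nibble selects a non-uniform level
        *v1 = kvalues[b >> 4];   // high nibble likewise
    }
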
@@ -1096,40 +952,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     );
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4][4];
-        int sumi;
-
-        for (int y = 0; y < nr / 4; y++) {
-            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-            for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-                }
-                for (int l = 0; l < nb; l++) {
-                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                        for (int m = 0; m < 4; m++) {
-                            for (int j = 0; j < ncols_interleaved; j++) {
-                                sumi = 0;
-                                for (int i = 0; i < blocklen; ++i) {
-                                    const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                    const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                                }
-                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                            }
-                        }
-                    }
-                }
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++)
-                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-                }
-            }
-        }
-    }
+    ggml_gemm_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
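
The GEMM fallbacks (this hunk and the three below) extend the GEMV pattern to 4-row activation tiles: vy holds block_q8_0x4 tiles, the accumulator grows to sumf[4][ncols_interleaved], tile row m applies its own activation scale a_ptr[l].d[m], and, per the deleted store loop, the result for tile row m and interleaved column j lands at s[(y * 4 + m) * bs + x * ncols_interleaved + j], with bs acting as the row stride of s.
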
@@ -1550,38 +1373,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     );
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    float sumf[4][4];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
+    ggml_gemm_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -2019,38 +1811,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 #endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
 
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-    float sumf[4][8];
-    int sumi;
-
-    for (int y = 0; y < nr / 4; y++) {
-        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-            }
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int m = 0; m < 4; m++) {
-                        for (int j = 0; j < ncols_interleaved; j++) {
-                            sumi = 0;
-                            for (int i = 0; i < blocklen; ++i) {
-                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
-                            }
-                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                        }
-                    }
-                }
-            }
-            for (int m = 0; m < 4; m++) {
-                for (int j = 0; j < ncols_interleaved; j++)
-                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-            }
-        }
-    }
+    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }
 
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -2126,38 +1887,5 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
     }
     return;
 #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
-    {
-        float sumf[4][4];
-        int sumi;
-
-        for (int y = 0; y < nr / 4; y++) {
-            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
-            for (int x = 0; x < nc / ncols_interleaved; x++) {
-                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
-                }
-                for (int l = 0; l < nb; l++) {
-                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                        for (int m = 0; m < 4; m++) {
-                            for (int j = 0; j < ncols_interleaved; j++) {
-                                sumi = 0;
-                                for (int i = 0; i < blocklen; ++i) {
-                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
-                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
-                                }
-                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
-                            }
-                        }
-                    }
-                }
-                for (int m = 0; m < 4; m++) {
-                    for (int j = 0; j < ncols_interleaved; j++)
-                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
-                }
-            }
-        }
-    }
+    ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
 }