llama_cpp 0.12.4 → 0.12.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@ extern "C" {
19
19
  // fall back to the _Static_assert C11 keyword.
20
20
  // if C99 - static_assert is noop
21
21
  // ref: https://stackoverflow.com/a/53923785/4039976
22
+ #ifndef __cplusplus
22
23
  #ifndef static_assert
23
24
  #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
24
25
  #define static_assert(cond, msg) _Static_assert(cond, msg)
@@ -26,6 +27,7 @@ extern "C" {
26
27
  #define static_assert(cond, msg) struct global_scope_noop_trick
27
28
  #endif
28
29
  #endif
30
+ #endif
29
31
 
30
32
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
31
33
  #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
@@ -2381,19 +2381,20 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2381
2381
 
2382
2382
  uint8_t L[QK_K];
2383
2383
  uint8_t Laux[32];
2384
+ uint8_t Ls[QK_K/32];
2385
+ uint8_t Lm[QK_K/32];
2384
2386
  float weights[32];
2385
- float mins[QK_K/32];
2386
- float scales[QK_K/32];
2387
+ float sw[QK_K/32];
2388
+ float mins[QK_K/32];
2389
+ float scales[QK_K/32];
2387
2390
 
2388
2391
  for (int i = 0; i < nb; i++) {
2389
2392
 
2390
2393
  float sum_x2 = 0;
2391
2394
  for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2392
- float sigma2 = sum_x2/QK_K;
2395
+ float sigma2 = 2*sum_x2/QK_K;
2393
2396
  float av_x = sqrtf(sigma2);
2394
2397
 
2395
- float max_scale = 0; // as we are deducting the min, scales are always positive
2396
- float max_min = 0;
2397
2398
  for (int j = 0; j < QK_K/32; ++j) {
2398
2399
  if (quant_weights) {
2399
2400
  const float * qw = quant_weights + QK_K*i + 32*j;
@@ -2401,25 +2402,17 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2401
2402
  } else {
2402
2403
  for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2403
2404
  }
2405
+ float sumw = 0;
2406
+ for (int l = 0; l < 32; ++l) sumw += weights[l];
2407
+ sw[j] = sumw;
2404
2408
  scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2405
- //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
2406
- float scale = scales[j];
2407
- if (scale > max_scale) {
2408
- max_scale = scale;
2409
- }
2410
- float min = mins[j];
2411
- if (min > max_min) {
2412
- max_min = min;
2413
- }
2414
2409
  }
2415
2410
 
2416
- float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2417
- float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2411
+ float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
2412
+ float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
2418
2413
  for (int j = 0; j < QK_K/32; ++j) {
2419
- uint8_t ls = nearest_int(inv_scale*scales[j]);
2420
- uint8_t lm = nearest_int(inv_min*mins[j]);
2421
- ls = MIN(63, ls);
2422
- lm = MIN(63, lm);
2414
+ uint8_t ls = Ls[j];
2415
+ uint8_t lm = Lm[j];
2423
2416
  if (j < 4) {
2424
2417
  y[i].scales[j] = ls;
2425
2418
  y[i].scales[j+4] = lm;
@@ -2429,8 +2422,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2429
2422
  y[i].scales[j-0] |= ((lm >> 4) << 6);
2430
2423
  }
2431
2424
  }
2432
- y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2433
- y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2425
+ y[i].d = GGML_FP32_TO_FP16(d_block);
2426
+ y[i].dmin = GGML_FP32_TO_FP16(m_block);
2434
2427
 
2435
2428
  uint8_t sc, m;
2436
2429
  for (int j = 0; j < QK_K/32; ++j) {
@@ -2688,20 +2681,21 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2688
2681
  const int nb = n_per_row / QK_K;
2689
2682
 
2690
2683
  uint8_t L[QK_K];
2691
- float mins[QK_K/32];
2692
- float scales[QK_K/32];
2693
- float weights[32];
2694
2684
  uint8_t Laux[32];
2685
+ uint8_t Ls[QK_K/32];
2686
+ uint8_t Lm[QK_K/32];
2687
+ float mins[QK_K/32];
2688
+ float scales[QK_K/32];
2689
+ float sw[QK_K/32];
2690
+ float weights[32];
2695
2691
 
2696
2692
  for (int i = 0; i < nb; i++) {
2697
2693
 
2698
2694
  float sum_x2 = 0;
2699
2695
  for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2700
- float sigma2 = sum_x2/QK_K;
2696
+ float sigma2 = 2*sum_x2/QK_K;
2701
2697
  float av_x = sqrtf(sigma2);
2702
2698
 
2703
- float max_scale = 0; // as we are deducting the min, scales are always positive
2704
- float max_min = 0;
2705
2699
  for (int j = 0; j < QK_K/32; ++j) {
2706
2700
  if (quant_weights) {
2707
2701
  const float * qw = quant_weights + QK_K*i + 32*j;
@@ -2709,22 +2703,19 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2709
2703
  } else {
2710
2704
  for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
2711
2705
  }
2706
+ float sumw = 0;
2707
+ for (int l = 0; l < 32; ++l) sumw += weights[l];
2708
+ sw[j] = sumw;
2709
+
2712
2710
  scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2713
- float scale = scales[j];
2714
- if (scale > max_scale) {
2715
- max_scale = scale;
2716
- }
2717
- float min = mins[j];
2718
- if (min > max_min) {
2719
- max_min = min;
2720
- }
2721
2711
  }
2722
2712
 
2723
- float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2724
- float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2713
+ float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
2714
+ float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
2715
+
2725
2716
  for (int j = 0; j < QK_K/32; ++j) {
2726
- uint8_t ls = nearest_int(inv_scale*scales[j]);
2727
- uint8_t lm = nearest_int(inv_min*mins[j]);
2717
+ uint8_t ls = Ls[j];
2718
+ uint8_t lm = Lm[j];
2728
2719
  ls = MIN(63, ls);
2729
2720
  lm = MIN(63, lm);
2730
2721
  if (j < 4) {
@@ -2736,8 +2727,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2736
2727
  y[i].scales[j-0] |= ((lm >> 4) << 6);
2737
2728
  }
2738
2729
  }
2739
- y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2740
- y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2730
+ y[i].d = GGML_FP32_TO_FP16(d_block);
2731
+ y[i].dmin = GGML_FP32_TO_FP16(m_block);
2741
2732
 
2742
2733
  uint8_t sc, m;
2743
2734
  for (int j = 0; j < QK_K/32; ++j) {
@@ -9048,8 +9039,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9048
9039
  int8_t L[32];
9049
9040
  int8_t Laux[32];
9050
9041
  float waux[32];
9051
- bool is_on_grid[4];
9052
- bool is_on_grid_aux[4];
9053
9042
  uint8_t block_signs[4];
9054
9043
  uint32_t q2[2*(QK_K/32)];
9055
9044
 
@@ -9099,10 +9088,11 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9099
9088
  memset(L, 0, 32);
9100
9089
  continue;
9101
9090
  }
9091
+ float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
9092
+ float eff_max = scale*kMaxQ;
9102
9093
  float best = 0;
9103
- float scale = max/(2*kMaxQ-1);
9104
- for (int is = -9; is <= 9; ++is) {
9105
- float id = (2*kMaxQ-1+is*0.1f)/max;
9094
+ for (int is = -6; is <= 6; ++is) {
9095
+ float id = (2*kMaxQ-1+is*0.1f)/eff_max;
9106
9096
  float this_scale = 1/id;
9107
9097
  for (int k = 0; k < 4; ++k) {
9108
9098
  for (int i = 0; i < 8; ++i) {
@@ -9112,9 +9102,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9112
9102
  uint16_t u = 0;
9113
9103
  for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
9114
9104
  int grid_index = kmap_q2xs[u];
9115
- is_on_grid_aux[k] = true;
9116
9105
  if (grid_index < 0) {
9117
- is_on_grid_aux[k] = false;
9118
9106
  const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
9119
9107
  grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
9120
9108
  }
@@ -9128,16 +9116,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9128
9116
  }
9129
9117
  if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
9130
9118
  scale = sumqx/sumq2; best = scale*sumqx;
9131
- for (int i = 0; i < 32; ++i) L[i] = Laux[i];
9132
- for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
9119
+ memcpy(L, Laux, 32);
9133
9120
  }
9134
9121
  }
9135
- int n_not_ongrid = 0;
9136
- for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
9137
- if (n_not_ongrid > 0 && scale > 0) {
9122
+ if (scale > 0) {
9138
9123
  float id = 1/scale;
9139
9124
  for (int k = 0; k < 4; ++k) {
9140
- if (is_on_grid[k]) continue;
9141
9125
  uint16_t u = 0;
9142
9126
  for (int i = 0; i < 8; ++i) {
9143
9127
  int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
@@ -9193,49 +9177,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
9193
9177
  float d = max_scale/31;
9194
9178
  y[ibl].d = GGML_FP32_TO_FP16(d);
9195
9179
  float id = 1/d;
9196
- float sumqx = 0, sumq2 = 0;
9197
9180
  for (int ib = 0; ib < QK_K/32; ++ib) {
9198
9181
  int l = nearest_int(0.5f*(id*scales[ib]-1));
9199
9182
  l = MAX(0, MIN(15, l));
9200
9183
  q2[2*ib+1] |= ((uint32_t)l << 28);
9201
- const float * xb = xbl + 32*ib;
9202
- const float * qw = quant_weights + QK_K*ibl + 32*ib;
9203
- for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
9204
- const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
9205
- const float db = d * (1 + 2*l);
9206
- uint32_t u = 0;
9207
- for (int k = 0; k < 4; ++k) {
9208
- const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
9209
- const float * xk = xb + 8*k;
9210
- const float * wk = weight + 8*k;
9211
- const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
9212
- float best_mse = 0; int best_index = aux8[k];
9213
- for (int j = 0; j < 8; ++j) {
9214
- float diff = db * grid[j] * signs[j] - xk[j];
9215
- best_mse += wk[j] * diff * diff;
9216
- }
9217
- for (int idx = 0; idx < 256; ++idx) {
9218
- grid = (const uint8_t *)(kgrid_q2xs + idx);
9219
- float mse = 0;
9220
- for (int j = 0; j < 8; ++j) {
9221
- float diff = db * grid[j] * signs[j] - xk[j];
9222
- mse += wk[j] * diff * diff;
9223
- }
9224
- if (mse < best_mse) {
9225
- best_mse = mse; best_index = idx;
9226
- }
9227
- }
9228
- u |= (best_index << 8*k);
9229
- grid = (const uint8_t *)(kgrid_q2xs + best_index);
9230
- //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
9231
- for (int j = 0; j < 8; ++j) {
9232
- float q = db * grid[j] * signs[j];
9233
- sumqx += wk[j] * q * xk[j];
9234
- sumq2 += wk[j] * q * q;
9235
- }
9236
- }
9237
- q2[2*ib] = u;
9238
- if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
9239
9184
  }
9240
9185
  memcpy(y[ibl].qs, q2, QK_K/4);
9241
9186
  }
@@ -191,70 +191,74 @@ typedef struct {
191
191
  } block_iq3_xxs;
192
192
  static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
193
193
 
194
+ #ifdef __cplusplus
195
+ extern "C" {
196
+ #endif
197
+
194
198
  // Quantization
195
- void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
196
- void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
197
- void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
198
- void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
199
- void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
200
- void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
201
-
202
- void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
203
- void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
204
- void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
205
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
206
- void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
207
- void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
208
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k);
209
-
210
- void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
211
- void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
212
- void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
213
- void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
214
- void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
215
- void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
216
-
217
- void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
218
- void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
219
- void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
220
- void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
221
- void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
222
- void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
223
- void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k);
199
+ void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
200
+ void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
201
+ void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
202
+ void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
203
+ void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
204
+ void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
205
+
206
+ void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
207
+ void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
208
+ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
209
+ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
210
+ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
211
+ void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
212
+ void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
213
+
214
+ void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
215
+ void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
216
+ void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
217
+ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
218
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
219
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
220
+
221
+ void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
222
+ void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
223
+ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
224
+ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
225
+ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
226
+ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
227
+ void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
224
228
 
225
229
  // Dequantization
226
- void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
227
- void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
228
- void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
229
- void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
230
- void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
231
- //void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
232
-
233
- void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
234
- void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
235
- void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
236
- void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
237
- void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
238
- void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
239
- void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
240
- void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
241
- void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k);
230
+ void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
231
+ void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
232
+ void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
233
+ void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
234
+ void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
235
+ //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
236
+
237
+ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
238
+ void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
239
+ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
240
+ void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
241
+ void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
242
+ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
243
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
244
+ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
245
+ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
242
246
 
243
247
  // Dot product
244
- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
245
- void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
246
- void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
247
- void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
248
- void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
249
-
250
- void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
251
- void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
252
- void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
253
- void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
254
- void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
255
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
256
- void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
257
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
248
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
249
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
250
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
251
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
252
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
253
+
254
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
255
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
256
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
257
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
258
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
259
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
260
+ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
261
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
258
262
 
259
263
  //
260
264
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
@@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
276
280
  void iq2xs_free_impl(int grid_size);
277
281
  void iq3xs_init_impl(int grid_size);
278
282
  void iq3xs_free_impl(int grid_size);
283
+
284
+ #ifdef __cplusplus
285
+ }
286
+ #endif
287
+