llama_cpp 0.12.4 → 0.12.5
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +138 -53
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +688 -270
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +39 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +68 -59
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +131 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1516 -10656
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +1697 -1241
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +14 -9
- data/vendor/tmp/llama.cpp/ggml.c +13 -10
- data/vendor/tmp/llama.cpp/llama.cpp +266 -43
- data/vendor/tmp/llama.cpp/llama.h +1 -1
- metadata +2 -2
@@ -19,6 +19,7 @@ extern "C" {
|
|
19
19
|
// fall back to the _Static_assert C11 keyword.
|
20
20
|
// if C99 - static_assert is noop
|
21
21
|
// ref: https://stackoverflow.com/a/53923785/4039976
|
22
|
+
#ifndef __cplusplus
|
22
23
|
#ifndef static_assert
|
23
24
|
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
24
25
|
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
@@ -26,6 +27,7 @@ extern "C" {
|
|
26
27
|
#define static_assert(cond, msg) struct global_scope_noop_trick
|
27
28
|
#endif
|
28
29
|
#endif
|
30
|
+
#endif
|
29
31
|
|
30
32
|
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
31
33
|
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
@@ -2381,19 +2381,20 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
2381
2381
|
|
2382
2382
|
uint8_t L[QK_K];
|
2383
2383
|
uint8_t Laux[32];
|
2384
|
+
uint8_t Ls[QK_K/32];
|
2385
|
+
uint8_t Lm[QK_K/32];
|
2384
2386
|
float weights[32];
|
2385
|
-
float mins[QK_K/32];
|
2386
|
-
float scales[QK_K/32];
|
2387
|
+
float sw[QK_K/32];
|
2388
|
+
float mins[QK_K/32];
|
2389
|
+
float scales[QK_K/32];
|
2387
2390
|
|
2388
2391
|
for (int i = 0; i < nb; i++) {
|
2389
2392
|
|
2390
2393
|
float sum_x2 = 0;
|
2391
2394
|
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
|
2392
|
-
float sigma2 = sum_x2/QK_K;
|
2395
|
+
float sigma2 = 2*sum_x2/QK_K;
|
2393
2396
|
float av_x = sqrtf(sigma2);
|
2394
2397
|
|
2395
|
-
float max_scale = 0; // as we are deducting the min, scales are always positive
|
2396
|
-
float max_min = 0;
|
2397
2398
|
for (int j = 0; j < QK_K/32; ++j) {
|
2398
2399
|
if (quant_weights) {
|
2399
2400
|
const float * qw = quant_weights + QK_K*i + 32*j;
|
@@ -2401,25 +2402,17 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
2401
2402
|
} else {
|
2402
2403
|
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
2403
2404
|
}
|
2405
|
+
float sumw = 0;
|
2406
|
+
for (int l = 0; l < 32; ++l) sumw += weights[l];
|
2407
|
+
sw[j] = sumw;
|
2404
2408
|
scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
2405
|
-
//scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
|
2406
|
-
float scale = scales[j];
|
2407
|
-
if (scale > max_scale) {
|
2408
|
-
max_scale = scale;
|
2409
|
-
}
|
2410
|
-
float min = mins[j];
|
2411
|
-
if (min > max_min) {
|
2412
|
-
max_min = min;
|
2413
|
-
}
|
2414
2409
|
}
|
2415
2410
|
|
2416
|
-
float
|
2417
|
-
float
|
2411
|
+
float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
|
2412
|
+
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
|
2418
2413
|
for (int j = 0; j < QK_K/32; ++j) {
|
2419
|
-
uint8_t ls =
|
2420
|
-
uint8_t lm =
|
2421
|
-
ls = MIN(63, ls);
|
2422
|
-
lm = MIN(63, lm);
|
2414
|
+
uint8_t ls = Ls[j];
|
2415
|
+
uint8_t lm = Lm[j];
|
2423
2416
|
if (j < 4) {
|
2424
2417
|
y[i].scales[j] = ls;
|
2425
2418
|
y[i].scales[j+4] = lm;
|
@@ -2429,8 +2422,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
2429
2422
|
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
2430
2423
|
}
|
2431
2424
|
}
|
2432
|
-
y[i].d = GGML_FP32_TO_FP16(
|
2433
|
-
y[i].dmin = GGML_FP32_TO_FP16(
|
2425
|
+
y[i].d = GGML_FP32_TO_FP16(d_block);
|
2426
|
+
y[i].dmin = GGML_FP32_TO_FP16(m_block);
|
2434
2427
|
|
2435
2428
|
uint8_t sc, m;
|
2436
2429
|
for (int j = 0; j < QK_K/32; ++j) {
|
@@ -2688,20 +2681,21 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
2688
2681
|
const int nb = n_per_row / QK_K;
|
2689
2682
|
|
2690
2683
|
uint8_t L[QK_K];
|
2691
|
-
float mins[QK_K/32];
|
2692
|
-
float scales[QK_K/32];
|
2693
|
-
float weights[32];
|
2694
2684
|
uint8_t Laux[32];
|
2685
|
+
uint8_t Ls[QK_K/32];
|
2686
|
+
uint8_t Lm[QK_K/32];
|
2687
|
+
float mins[QK_K/32];
|
2688
|
+
float scales[QK_K/32];
|
2689
|
+
float sw[QK_K/32];
|
2690
|
+
float weights[32];
|
2695
2691
|
|
2696
2692
|
for (int i = 0; i < nb; i++) {
|
2697
2693
|
|
2698
2694
|
float sum_x2 = 0;
|
2699
2695
|
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
|
2700
|
-
float sigma2 = sum_x2/QK_K;
|
2696
|
+
float sigma2 = 2*sum_x2/QK_K;
|
2701
2697
|
float av_x = sqrtf(sigma2);
|
2702
2698
|
|
2703
|
-
float max_scale = 0; // as we are deducting the min, scales are always positive
|
2704
|
-
float max_min = 0;
|
2705
2699
|
for (int j = 0; j < QK_K/32; ++j) {
|
2706
2700
|
if (quant_weights) {
|
2707
2701
|
const float * qw = quant_weights + QK_K*i + 32*j;
|
@@ -2709,22 +2703,19 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
2709
2703
|
} else {
|
2710
2704
|
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
2711
2705
|
}
|
2706
|
+
float sumw = 0;
|
2707
|
+
for (int l = 0; l < 32; ++l) sumw += weights[l];
|
2708
|
+
sw[j] = sumw;
|
2709
|
+
|
2712
2710
|
scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
2713
|
-
float scale = scales[j];
|
2714
|
-
if (scale > max_scale) {
|
2715
|
-
max_scale = scale;
|
2716
|
-
}
|
2717
|
-
float min = mins[j];
|
2718
|
-
if (min > max_min) {
|
2719
|
-
max_min = min;
|
2720
|
-
}
|
2721
2711
|
}
|
2722
2712
|
|
2723
|
-
float
|
2724
|
-
float
|
2713
|
+
float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
|
2714
|
+
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
|
2715
|
+
|
2725
2716
|
for (int j = 0; j < QK_K/32; ++j) {
|
2726
|
-
uint8_t ls =
|
2727
|
-
uint8_t lm =
|
2717
|
+
uint8_t ls = Ls[j];
|
2718
|
+
uint8_t lm = Lm[j];
|
2728
2719
|
ls = MIN(63, ls);
|
2729
2720
|
lm = MIN(63, lm);
|
2730
2721
|
if (j < 4) {
|
@@ -2736,8 +2727,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
2736
2727
|
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
2737
2728
|
}
|
2738
2729
|
}
|
2739
|
-
y[i].d = GGML_FP32_TO_FP16(
|
2740
|
-
y[i].dmin = GGML_FP32_TO_FP16(
|
2730
|
+
y[i].d = GGML_FP32_TO_FP16(d_block);
|
2731
|
+
y[i].dmin = GGML_FP32_TO_FP16(m_block);
|
2741
2732
|
|
2742
2733
|
uint8_t sc, m;
|
2743
2734
|
for (int j = 0; j < QK_K/32; ++j) {
|
@@ -9048,8 +9039,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
9048
9039
|
int8_t L[32];
|
9049
9040
|
int8_t Laux[32];
|
9050
9041
|
float waux[32];
|
9051
|
-
bool is_on_grid[4];
|
9052
|
-
bool is_on_grid_aux[4];
|
9053
9042
|
uint8_t block_signs[4];
|
9054
9043
|
uint32_t q2[2*(QK_K/32)];
|
9055
9044
|
|
@@ -9099,10 +9088,11 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
9099
9088
|
memset(L, 0, 32);
|
9100
9089
|
continue;
|
9101
9090
|
}
|
9091
|
+
float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight);
|
9092
|
+
float eff_max = scale*kMaxQ;
|
9102
9093
|
float best = 0;
|
9103
|
-
|
9104
|
-
|
9105
|
-
float id = (2*kMaxQ-1+is*0.1f)/max;
|
9094
|
+
for (int is = -6; is <= 6; ++is) {
|
9095
|
+
float id = (2*kMaxQ-1+is*0.1f)/eff_max;
|
9106
9096
|
float this_scale = 1/id;
|
9107
9097
|
for (int k = 0; k < 4; ++k) {
|
9108
9098
|
for (int i = 0; i < 8; ++i) {
|
@@ -9112,9 +9102,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
9112
9102
|
uint16_t u = 0;
|
9113
9103
|
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
9114
9104
|
int grid_index = kmap_q2xs[u];
|
9115
|
-
is_on_grid_aux[k] = true;
|
9116
9105
|
if (grid_index < 0) {
|
9117
|
-
is_on_grid_aux[k] = false;
|
9118
9106
|
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
9119
9107
|
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
9120
9108
|
}
|
@@ -9128,16 +9116,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
9128
9116
|
}
|
9129
9117
|
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
9130
9118
|
scale = sumqx/sumq2; best = scale*sumqx;
|
9131
|
-
|
9132
|
-
for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
9119
|
+
memcpy(L, Laux, 32);
|
9133
9120
|
}
|
9134
9121
|
}
|
9135
|
-
|
9136
|
-
for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
9137
|
-
if (n_not_ongrid > 0 && scale > 0) {
|
9122
|
+
if (scale > 0) {
|
9138
9123
|
float id = 1/scale;
|
9139
9124
|
for (int k = 0; k < 4; ++k) {
|
9140
|
-
if (is_on_grid[k]) continue;
|
9141
9125
|
uint16_t u = 0;
|
9142
9126
|
for (int i = 0; i < 8; ++i) {
|
9143
9127
|
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
@@ -9193,49 +9177,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
9193
9177
|
float d = max_scale/31;
|
9194
9178
|
y[ibl].d = GGML_FP32_TO_FP16(d);
|
9195
9179
|
float id = 1/d;
|
9196
|
-
float sumqx = 0, sumq2 = 0;
|
9197
9180
|
for (int ib = 0; ib < QK_K/32; ++ib) {
|
9198
9181
|
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
9199
9182
|
l = MAX(0, MIN(15, l));
|
9200
9183
|
q2[2*ib+1] |= ((uint32_t)l << 28);
|
9201
|
-
const float * xb = xbl + 32*ib;
|
9202
|
-
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
9203
|
-
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
9204
|
-
const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
|
9205
|
-
const float db = d * (1 + 2*l);
|
9206
|
-
uint32_t u = 0;
|
9207
|
-
for (int k = 0; k < 4; ++k) {
|
9208
|
-
const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
|
9209
|
-
const float * xk = xb + 8*k;
|
9210
|
-
const float * wk = weight + 8*k;
|
9211
|
-
const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
9212
|
-
float best_mse = 0; int best_index = aux8[k];
|
9213
|
-
for (int j = 0; j < 8; ++j) {
|
9214
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
9215
|
-
best_mse += wk[j] * diff * diff;
|
9216
|
-
}
|
9217
|
-
for (int idx = 0; idx < 256; ++idx) {
|
9218
|
-
grid = (const uint8_t *)(kgrid_q2xs + idx);
|
9219
|
-
float mse = 0;
|
9220
|
-
for (int j = 0; j < 8; ++j) {
|
9221
|
-
float diff = db * grid[j] * signs[j] - xk[j];
|
9222
|
-
mse += wk[j] * diff * diff;
|
9223
|
-
}
|
9224
|
-
if (mse < best_mse) {
|
9225
|
-
best_mse = mse; best_index = idx;
|
9226
|
-
}
|
9227
|
-
}
|
9228
|
-
u |= (best_index << 8*k);
|
9229
|
-
grid = (const uint8_t *)(kgrid_q2xs + best_index);
|
9230
|
-
//grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
9231
|
-
for (int j = 0; j < 8; ++j) {
|
9232
|
-
float q = db * grid[j] * signs[j];
|
9233
|
-
sumqx += wk[j] * q * xk[j];
|
9234
|
-
sumq2 += wk[j] * q * q;
|
9235
|
-
}
|
9236
|
-
}
|
9237
|
-
q2[2*ib] = u;
|
9238
|
-
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
9239
9184
|
}
|
9240
9185
|
memcpy(y[ibl].qs, q2, QK_K/4);
|
9241
9186
|
}
|
@@ -191,70 +191,74 @@ typedef struct {
|
|
191
191
|
} block_iq3_xxs;
|
192
192
|
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
193
193
|
|
194
|
+
#ifdef __cplusplus
|
195
|
+
extern "C" {
|
196
|
+
#endif
|
197
|
+
|
194
198
|
// Quantization
|
195
|
-
void quantize_row_q4_0_reference(const float *
|
196
|
-
void quantize_row_q4_1_reference(const float *
|
197
|
-
void quantize_row_q5_0_reference(const float *
|
198
|
-
void quantize_row_q5_1_reference(const float *
|
199
|
-
void quantize_row_q8_0_reference(const float *
|
200
|
-
void quantize_row_q8_1_reference(const float *
|
201
|
-
|
202
|
-
void quantize_row_q2_K_reference(const float *
|
203
|
-
void quantize_row_q3_K_reference(const float *
|
204
|
-
void quantize_row_q4_K_reference(const float *
|
205
|
-
void quantize_row_q5_K_reference(const float *
|
206
|
-
void quantize_row_q6_K_reference(const float *
|
207
|
-
void quantize_row_q8_K_reference(const float *
|
208
|
-
void quantize_row_iq3_xxs_reference(const float *
|
209
|
-
|
210
|
-
void quantize_row_q4_0(const float *
|
211
|
-
void quantize_row_q4_1(const float *
|
212
|
-
void quantize_row_q5_0(const float *
|
213
|
-
void quantize_row_q5_1(const float *
|
214
|
-
void quantize_row_q8_0(const float *
|
215
|
-
void quantize_row_q8_1(const float *
|
216
|
-
|
217
|
-
void quantize_row_q2_K(const float *
|
218
|
-
void quantize_row_q3_K(const float *
|
219
|
-
void quantize_row_q4_K(const float *
|
220
|
-
void quantize_row_q5_K(const float *
|
221
|
-
void quantize_row_q6_K(const float *
|
222
|
-
void quantize_row_q8_K(const float *
|
223
|
-
void quantize_row_iq3_xxs(const float *
|
199
|
+
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
|
200
|
+
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
|
201
|
+
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
|
202
|
+
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
|
203
|
+
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
|
204
|
+
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
|
205
|
+
|
206
|
+
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
|
207
|
+
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
|
208
|
+
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
|
209
|
+
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
210
|
+
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
211
|
+
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
212
|
+
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
213
|
+
|
214
|
+
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
215
|
+
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
216
|
+
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
217
|
+
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
218
|
+
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
219
|
+
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
220
|
+
|
221
|
+
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
222
|
+
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
223
|
+
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
224
|
+
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
225
|
+
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
226
|
+
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
227
|
+
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
224
228
|
|
225
229
|
// Dequantization
|
226
|
-
void dequantize_row_q4_0(const block_q4_0 *
|
227
|
-
void dequantize_row_q4_1(const block_q4_1 *
|
228
|
-
void dequantize_row_q5_0(const block_q5_0 *
|
229
|
-
void dequantize_row_q5_1(const block_q5_1 *
|
230
|
-
void dequantize_row_q8_0(const block_q8_0 *
|
231
|
-
//void dequantize_row_q8_1(const block_q8_1 *
|
232
|
-
|
233
|
-
void dequantize_row_q2_K(const block_q2_K *
|
234
|
-
void dequantize_row_q3_K(const block_q3_K *
|
235
|
-
void dequantize_row_q4_K(const block_q4_K *
|
236
|
-
void dequantize_row_q5_K(const block_q5_K *
|
237
|
-
void dequantize_row_q6_K(const block_q6_K *
|
238
|
-
void dequantize_row_q8_K(const block_q8_K *
|
239
|
-
void dequantize_row_iq2_xxs(const block_iq2_xxs *
|
240
|
-
void dequantize_row_iq2_xs (const block_iq2_xs *
|
241
|
-
void dequantize_row_iq3_xxs(const block_iq3_xxs *
|
230
|
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
231
|
+
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
232
|
+
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
233
|
+
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
234
|
+
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
235
|
+
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
236
|
+
|
237
|
+
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
238
|
+
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
239
|
+
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
240
|
+
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
241
|
+
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
242
|
+
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
243
|
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
244
|
+
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
245
|
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
242
246
|
|
243
247
|
// Dot product
|
244
|
-
void ggml_vec_dot_q4_0_q8_0(int n, float *
|
245
|
-
void ggml_vec_dot_q4_1_q8_1(int n, float *
|
246
|
-
void ggml_vec_dot_q5_0_q8_0(int n, float *
|
247
|
-
void ggml_vec_dot_q5_1_q8_1(int n, float *
|
248
|
-
void ggml_vec_dot_q8_0_q8_0(int n, float *
|
249
|
-
|
250
|
-
void ggml_vec_dot_q2_K_q8_K(int n, float *
|
251
|
-
void ggml_vec_dot_q3_K_q8_K(int n, float *
|
252
|
-
void ggml_vec_dot_q4_K_q8_K(int n, float *
|
253
|
-
void ggml_vec_dot_q5_K_q8_K(int n, float *
|
254
|
-
void ggml_vec_dot_q6_K_q8_K(int n, float *
|
255
|
-
void ggml_vec_dot_iq2_xxs_q8_K(int n, float *
|
256
|
-
void ggml_vec_dot_iq2_xs_q8_K (int n, float *
|
257
|
-
void ggml_vec_dot_iq3_xxs_q8_K(int n, float *
|
248
|
+
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
249
|
+
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
250
|
+
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
251
|
+
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
252
|
+
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
253
|
+
|
254
|
+
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
255
|
+
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
256
|
+
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
257
|
+
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
258
|
+
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
259
|
+
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
260
|
+
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
261
|
+
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
258
262
|
|
259
263
|
//
|
260
264
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
@@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
|
|
276
280
|
void iq2xs_free_impl(int grid_size);
|
277
281
|
void iq3xs_init_impl(int grid_size);
|
278
282
|
void iq3xs_free_impl(int grid_size);
|
283
|
+
|
284
|
+
#ifdef __cplusplus
|
285
|
+
}
|
286
|
+
#endif
|
287
|
+
|