@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
#define UNUSED GGML_UNUSED
|
|
29
29
|
|
|
30
30
|
// reference implementation for deterministic creation of model files
|
|
31
|
-
void quantize_row_q4_0_ref(const float *
|
|
31
|
+
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) {
|
|
32
32
|
static const int qk = QK4_0;
|
|
33
33
|
|
|
34
34
|
assert(k % qk == 0);
|
|
@@ -65,7 +65,7 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in
|
|
|
65
65
|
}
|
|
66
66
|
}
|
|
67
67
|
|
|
68
|
-
void quantize_row_q4_1_ref(const float *
|
|
68
|
+
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) {
|
|
69
69
|
const int qk = QK4_1;
|
|
70
70
|
|
|
71
71
|
assert(k % qk == 0);
|
|
@@ -102,7 +102,7 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in
|
|
|
102
102
|
}
|
|
103
103
|
}
|
|
104
104
|
|
|
105
|
-
void quantize_row_q5_0_ref(const float *
|
|
105
|
+
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) {
|
|
106
106
|
static const int qk = QK5_0;
|
|
107
107
|
|
|
108
108
|
assert(k % qk == 0);
|
|
@@ -146,7 +146,7 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in
|
|
|
146
146
|
}
|
|
147
147
|
}
|
|
148
148
|
|
|
149
|
-
void quantize_row_q5_1_ref(const float *
|
|
149
|
+
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) {
|
|
150
150
|
const int qk = QK5_1;
|
|
151
151
|
|
|
152
152
|
assert(k % qk == 0);
|
|
@@ -191,7 +191,7 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in
|
|
|
191
191
|
}
|
|
192
192
|
|
|
193
193
|
// reference implementation for deterministic creation of model files
|
|
194
|
-
void quantize_row_q8_0_ref(const float *
|
|
194
|
+
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) {
|
|
195
195
|
assert(k % QK8_0 == 0);
|
|
196
196
|
const int nb = k / QK8_0;
|
|
197
197
|
|
|
@@ -217,7 +217,7 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in
|
|
|
217
217
|
}
|
|
218
218
|
|
|
219
219
|
// reference implementation for deterministic creation of model files
|
|
220
|
-
void quantize_row_q8_1_ref(const float *
|
|
220
|
+
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) {
|
|
221
221
|
assert(QK8_1 == 32);
|
|
222
222
|
assert(k % QK8_1 == 0);
|
|
223
223
|
const int nb = k / QK8_1;
|
|
@@ -252,7 +252,7 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in
|
|
|
252
252
|
}
|
|
253
253
|
}
|
|
254
254
|
|
|
255
|
-
void dequantize_row_q4_0(const block_q4_0 *
|
|
255
|
+
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
256
256
|
static const int qk = QK4_0;
|
|
257
257
|
|
|
258
258
|
assert(k % qk == 0);
|
|
@@ -272,7 +272,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6
|
|
|
272
272
|
}
|
|
273
273
|
}
|
|
274
274
|
|
|
275
|
-
void dequantize_row_q4_1(const block_q4_1 *
|
|
275
|
+
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
276
276
|
static const int qk = QK4_1;
|
|
277
277
|
|
|
278
278
|
assert(k % qk == 0);
|
|
@@ -293,7 +293,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int6
|
|
|
293
293
|
}
|
|
294
294
|
}
|
|
295
295
|
|
|
296
|
-
void dequantize_row_q5_0(const block_q5_0 *
|
|
296
|
+
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
297
297
|
static const int qk = QK5_0;
|
|
298
298
|
|
|
299
299
|
assert(k % qk == 0);
|
|
@@ -319,7 +319,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int6
|
|
|
319
319
|
}
|
|
320
320
|
}
|
|
321
321
|
|
|
322
|
-
void dequantize_row_q5_1(const block_q5_1 *
|
|
322
|
+
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
323
323
|
static const int qk = QK5_1;
|
|
324
324
|
|
|
325
325
|
assert(k % qk == 0);
|
|
@@ -346,7 +346,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6
|
|
|
346
346
|
}
|
|
347
347
|
}
|
|
348
348
|
|
|
349
|
-
void dequantize_row_q8_0(const block_q8_0 *
|
|
349
|
+
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
350
350
|
static const int qk = QK8_0;
|
|
351
351
|
|
|
352
352
|
assert(k % qk == 0);
|
|
@@ -376,8 +376,8 @@ static inline int nearest_int(float fval) {
|
|
|
376
376
|
return (i & 0x007fffff) - 0x00400000;
|
|
377
377
|
}
|
|
378
378
|
|
|
379
|
-
static float make_qx_quants(int n, int nmax, const float *
|
|
380
|
-
const float *
|
|
379
|
+
static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
|
|
380
|
+
const float * GGML_RESTRICT qw) {
|
|
381
381
|
float max = 0;
|
|
382
382
|
float amax = 0;
|
|
383
383
|
for (int i = 0; i < n; ++i) {
|
|
@@ -445,7 +445,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
|
445
445
|
return scale;
|
|
446
446
|
}
|
|
447
447
|
|
|
448
|
-
static float make_q3_quants(int n, int nmax, const float *
|
|
448
|
+
static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) {
|
|
449
449
|
float max = 0;
|
|
450
450
|
float amax = 0;
|
|
451
451
|
for (int i = 0; i < n; ++i) {
|
|
@@ -504,7 +504,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
|
504
504
|
return 1/iscale;
|
|
505
505
|
}
|
|
506
506
|
|
|
507
|
-
static float make_qkx1_quants(int n, int nmax, const float *
|
|
507
|
+
static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min,
|
|
508
508
|
int ntry, float alpha) {
|
|
509
509
|
float min = x[0];
|
|
510
510
|
float max = x[0];
|
|
@@ -547,8 +547,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
|
|
547
547
|
return scale;
|
|
548
548
|
}
|
|
549
549
|
|
|
550
|
-
static float make_qkx2_quants(int n, int nmax, const float *
|
|
551
|
-
uint8_t *
|
|
550
|
+
static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
|
|
551
|
+
uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
|
|
552
552
|
float rmin, float rdelta, int nstep, bool use_mad) {
|
|
553
553
|
float min = x[0];
|
|
554
554
|
float max = x[0];
|
|
@@ -628,7 +628,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
|
|
|
628
628
|
return scale;
|
|
629
629
|
}
|
|
630
630
|
|
|
631
|
-
static inline void get_scale_min_k4(int j, const uint8_t *
|
|
631
|
+
static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
|
|
632
632
|
if (j < 4) {
|
|
633
633
|
*d = q[j] & 63; *m = q[j + 4] & 63;
|
|
634
634
|
} else {
|
|
@@ -639,7 +639,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
|
|
639
639
|
|
|
640
640
|
//========================- 2-bit (de)-quantization
|
|
641
641
|
|
|
642
|
-
void quantize_row_q2_K_ref(const float *
|
|
642
|
+
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) {
|
|
643
643
|
assert(k % QK_K == 0);
|
|
644
644
|
const int nb = k / QK_K;
|
|
645
645
|
|
|
@@ -709,7 +709,7 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in
|
|
|
709
709
|
}
|
|
710
710
|
}
|
|
711
711
|
|
|
712
|
-
void dequantize_row_q2_K(const block_q2_K *
|
|
712
|
+
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
713
713
|
assert(k % QK_K == 0);
|
|
714
714
|
const int nb = k / QK_K;
|
|
715
715
|
|
|
@@ -741,8 +741,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
|
|
|
741
741
|
}
|
|
742
742
|
}
|
|
743
743
|
|
|
744
|
-
static float make_qkx3_quants(int n, int nmax, const float *
|
|
745
|
-
uint8_t *
|
|
744
|
+
static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights,
|
|
745
|
+
uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux,
|
|
746
746
|
float rmin, float rdelta, int nstep, bool use_mad) {
|
|
747
747
|
float min = x[0];
|
|
748
748
|
float max = x[0];
|
|
@@ -824,7 +824,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
|
|
824
824
|
return scale;
|
|
825
825
|
}
|
|
826
826
|
|
|
827
|
-
static float make_qp_quants(int n, int nmax, const float *
|
|
827
|
+
static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) {
|
|
828
828
|
float max = 0;
|
|
829
829
|
for (int i = 0; i < n; ++i) {
|
|
830
830
|
max = MAX(max, x[i]);
|
|
@@ -897,7 +897,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t *
|
|
|
897
897
|
return sumlx/suml2;
|
|
898
898
|
}
|
|
899
899
|
|
|
900
|
-
static void quantize_row_q2_K_impl(const float *
|
|
900
|
+
static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) {
|
|
901
901
|
GGML_ASSERT(quant_weights);
|
|
902
902
|
assert(k % QK_K == 0);
|
|
903
903
|
const int nb = k / QK_K;
|
|
@@ -917,7 +917,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
|
917
917
|
for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
|
|
918
918
|
float sigma2 = sumx2/QK_K;
|
|
919
919
|
for (int j = 0; j < QK_K/16; ++j) {
|
|
920
|
-
const float *
|
|
920
|
+
const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j;
|
|
921
921
|
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
|
|
922
922
|
for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
|
|
923
923
|
scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
|
@@ -959,7 +959,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
|
959
959
|
}
|
|
960
960
|
}
|
|
961
961
|
|
|
962
|
-
size_t quantize_q2_K(const float *
|
|
962
|
+
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
963
963
|
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
|
964
964
|
if (!quant_weights) {
|
|
965
965
|
quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
@@ -977,7 +977,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
977
977
|
|
|
978
978
|
//========================= 3-bit (de)-quantization
|
|
979
979
|
|
|
980
|
-
void quantize_row_q3_K_ref(const float *
|
|
980
|
+
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) {
|
|
981
981
|
assert(k % QK_K == 0);
|
|
982
982
|
const int nb = k / QK_K;
|
|
983
983
|
|
|
@@ -1053,7 +1053,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in
|
|
|
1053
1053
|
}
|
|
1054
1054
|
}
|
|
1055
1055
|
|
|
1056
|
-
void dequantize_row_q3_K(const block_q3_K *
|
|
1056
|
+
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
1057
1057
|
assert(k % QK_K == 0);
|
|
1058
1058
|
const int nb = k / QK_K;
|
|
1059
1059
|
|
|
@@ -1067,8 +1067,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
|
|
1067
1067
|
|
|
1068
1068
|
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
|
1069
1069
|
|
|
1070
|
-
const uint8_t *
|
|
1071
|
-
const uint8_t *
|
|
1070
|
+
const uint8_t * GGML_RESTRICT q = x[i].qs;
|
|
1071
|
+
const uint8_t * GGML_RESTRICT hm = x[i].hmask;
|
|
1072
1072
|
uint8_t m = 1;
|
|
1073
1073
|
|
|
1074
1074
|
memcpy(aux, x[i].scales, 12);
|
|
@@ -1103,7 +1103,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
|
|
1103
1103
|
}
|
|
1104
1104
|
}
|
|
1105
1105
|
|
|
1106
|
-
static void quantize_row_q3_K_impl(const float *
|
|
1106
|
+
static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) {
|
|
1107
1107
|
assert(n_per_row % QK_K == 0);
|
|
1108
1108
|
const int nb = n_per_row / QK_K;
|
|
1109
1109
|
|
|
@@ -1187,7 +1187,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
|
1187
1187
|
}
|
|
1188
1188
|
}
|
|
1189
1189
|
|
|
1190
|
-
size_t quantize_q3_K(const float *
|
|
1190
|
+
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
1191
1191
|
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
|
1192
1192
|
if (!quant_weights) {
|
|
1193
1193
|
quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
@@ -1205,7 +1205,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
1205
1205
|
|
|
1206
1206
|
// ====================== 4-bit (de)-quantization
|
|
1207
1207
|
|
|
1208
|
-
void quantize_row_q4_K_ref(const float *
|
|
1208
|
+
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) {
|
|
1209
1209
|
assert(k % QK_K == 0);
|
|
1210
1210
|
const int nb = k / QK_K;
|
|
1211
1211
|
|
|
@@ -1277,7 +1277,7 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in
|
|
|
1277
1277
|
}
|
|
1278
1278
|
}
|
|
1279
1279
|
|
|
1280
|
-
void dequantize_row_q4_K(const block_q4_K *
|
|
1280
|
+
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
1281
1281
|
assert(k % QK_K == 0);
|
|
1282
1282
|
const int nb = k / QK_K;
|
|
1283
1283
|
|
|
@@ -1301,7 +1301,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
|
|
|
1301
1301
|
}
|
|
1302
1302
|
}
|
|
1303
1303
|
|
|
1304
|
-
static void quantize_row_q4_K_impl(const float *
|
|
1304
|
+
static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
1305
1305
|
assert(n_per_row % QK_K == 0);
|
|
1306
1306
|
const int64_t nb = n_per_row / QK_K;
|
|
1307
1307
|
|
|
@@ -1374,7 +1374,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
|
1374
1374
|
}
|
|
1375
1375
|
}
|
|
1376
1376
|
|
|
1377
|
-
size_t quantize_q4_K(const float *
|
|
1377
|
+
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
1378
1378
|
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
|
1379
1379
|
if (!quant_weights) {
|
|
1380
1380
|
quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
@@ -1392,7 +1392,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
1392
1392
|
|
|
1393
1393
|
// ====================== 5-bit (de)-quantization
|
|
1394
1394
|
|
|
1395
|
-
void quantize_row_q5_K_ref(const float *
|
|
1395
|
+
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) {
|
|
1396
1396
|
assert(k % QK_K == 0);
|
|
1397
1397
|
const int64_t nb = k / QK_K;
|
|
1398
1398
|
|
|
@@ -1454,8 +1454,8 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
|
|
|
1454
1454
|
}
|
|
1455
1455
|
}
|
|
1456
1456
|
|
|
1457
|
-
uint8_t *
|
|
1458
|
-
uint8_t *
|
|
1457
|
+
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
|
1458
|
+
uint8_t * GGML_RESTRICT ql = y[i].qs;
|
|
1459
1459
|
memset(qh, 0, QK_K/8);
|
|
1460
1460
|
|
|
1461
1461
|
uint8_t m1 = 1, m2 = 2;
|
|
@@ -1479,7 +1479,7 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in
|
|
|
1479
1479
|
}
|
|
1480
1480
|
}
|
|
1481
1481
|
|
|
1482
|
-
void dequantize_row_q5_K(const block_q5_K *
|
|
1482
|
+
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
1483
1483
|
assert(k % QK_K == 0);
|
|
1484
1484
|
const int64_t nb = k / QK_K;
|
|
1485
1485
|
|
|
@@ -1506,7 +1506,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
|
|
|
1506
1506
|
}
|
|
1507
1507
|
}
|
|
1508
1508
|
|
|
1509
|
-
static void quantize_row_q5_K_impl(const float *
|
|
1509
|
+
static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
1510
1510
|
assert(n_per_row % QK_K == 0);
|
|
1511
1511
|
const int64_t nb = n_per_row / QK_K;
|
|
1512
1512
|
|
|
@@ -1573,8 +1573,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
|
1573
1573
|
}
|
|
1574
1574
|
}
|
|
1575
1575
|
|
|
1576
|
-
uint8_t *
|
|
1577
|
-
uint8_t *
|
|
1576
|
+
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
|
1577
|
+
uint8_t * GGML_RESTRICT ql = y[i].qs;
|
|
1578
1578
|
memset(qh, 0, QK_K/8);
|
|
1579
1579
|
|
|
1580
1580
|
uint8_t m1 = 1, m2 = 2;
|
|
@@ -1599,7 +1599,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
|
1599
1599
|
}
|
|
1600
1600
|
}
|
|
1601
1601
|
|
|
1602
|
-
size_t quantize_q5_K(const float *
|
|
1602
|
+
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
1603
1603
|
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
|
1604
1604
|
if (!quant_weights) {
|
|
1605
1605
|
quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
@@ -1617,7 +1617,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
1617
1617
|
|
|
1618
1618
|
// ====================== 6-bit (de)-quantization
|
|
1619
1619
|
|
|
1620
|
-
void quantize_row_q6_K_ref(const float *
|
|
1620
|
+
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) {
|
|
1621
1621
|
assert(k % QK_K == 0);
|
|
1622
1622
|
const int64_t nb = k / QK_K;
|
|
1623
1623
|
|
|
@@ -1667,8 +1667,8 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
|
|
|
1667
1667
|
}
|
|
1668
1668
|
}
|
|
1669
1669
|
|
|
1670
|
-
uint8_t *
|
|
1671
|
-
uint8_t *
|
|
1670
|
+
uint8_t * GGML_RESTRICT ql = y[i].ql;
|
|
1671
|
+
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
|
1672
1672
|
for (int j = 0; j < QK_K; j += 128) {
|
|
1673
1673
|
for (int l = 0; l < 32; ++l) {
|
|
1674
1674
|
const uint8_t q1 = L[j + l + 0] & 0xF;
|
|
@@ -1687,16 +1687,16 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in
|
|
|
1687
1687
|
}
|
|
1688
1688
|
}
|
|
1689
1689
|
|
|
1690
|
-
void dequantize_row_q6_K(const block_q6_K *
|
|
1690
|
+
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
1691
1691
|
assert(k % QK_K == 0);
|
|
1692
1692
|
const int64_t nb = k / QK_K;
|
|
1693
1693
|
|
|
1694
1694
|
for (int i = 0; i < nb; i++) {
|
|
1695
1695
|
const float d = GGML_FP16_TO_FP32(x[i].d);
|
|
1696
1696
|
|
|
1697
|
-
const uint8_t *
|
|
1698
|
-
const uint8_t *
|
|
1699
|
-
const int8_t *
|
|
1697
|
+
const uint8_t * GGML_RESTRICT ql = x[i].ql;
|
|
1698
|
+
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
|
1699
|
+
const int8_t * GGML_RESTRICT sc = x[i].scales;
|
|
1700
1700
|
|
|
1701
1701
|
for (int n = 0; n < QK_K; n += 128) {
|
|
1702
1702
|
for (int l = 0; l < 32; ++l) {
|
|
@@ -1718,7 +1718,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
|
|
|
1718
1718
|
}
|
|
1719
1719
|
}
|
|
1720
1720
|
|
|
1721
|
-
static void quantize_row_q6_K_impl(const float *
|
|
1721
|
+
static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
1722
1722
|
assert(n_per_row % QK_K == 0);
|
|
1723
1723
|
const int64_t nb = n_per_row / QK_K;
|
|
1724
1724
|
|
|
@@ -1781,8 +1781,8 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
|
1781
1781
|
}
|
|
1782
1782
|
}
|
|
1783
1783
|
|
|
1784
|
-
uint8_t *
|
|
1785
|
-
uint8_t *
|
|
1784
|
+
uint8_t * GGML_RESTRICT ql = y[i].ql;
|
|
1785
|
+
uint8_t * GGML_RESTRICT qh = y[i].qh;
|
|
1786
1786
|
for (int j = 0; j < QK_K; j += 128) {
|
|
1787
1787
|
for (int l = 0; l < 32; ++l) {
|
|
1788
1788
|
const uint8_t q1 = L[j + l + 0] & 0xF;
|
|
@@ -1802,7 +1802,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
|
1802
1802
|
}
|
|
1803
1803
|
}
|
|
1804
1804
|
|
|
1805
|
-
size_t quantize_q6_K(const float *
|
|
1805
|
+
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
1806
1806
|
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
|
1807
1807
|
if (!quant_weights) {
|
|
1808
1808
|
quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
@@ -1818,7 +1818,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
1818
1818
|
return nrow * row_size;
|
|
1819
1819
|
}
|
|
1820
1820
|
|
|
1821
|
-
static void quantize_row_q4_0_impl(const float *
|
|
1821
|
+
static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
1822
1822
|
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
|
1823
1823
|
|
|
1824
1824
|
if (!quant_weights) {
|
|
@@ -1846,7 +1846,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
|
1846
1846
|
}
|
|
1847
1847
|
}
|
|
1848
1848
|
|
|
1849
|
-
size_t quantize_q4_0(const float *
|
|
1849
|
+
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
1850
1850
|
if (!quant_weights) {
|
|
1851
1851
|
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
1852
1852
|
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
|
@@ -1861,7 +1861,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
1861
1861
|
return nrow * row_size;
|
|
1862
1862
|
}
|
|
1863
1863
|
|
|
1864
|
-
static void quantize_row_q4_1_impl(const float *
|
|
1864
|
+
static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
1865
1865
|
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
|
1866
1866
|
|
|
1867
1867
|
if (!quant_weights) {
|
|
@@ -1891,7 +1891,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
|
1891
1891
|
}
|
|
1892
1892
|
}
|
|
1893
1893
|
|
|
1894
|
-
size_t quantize_q4_1(const float *
|
|
1894
|
+
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
1895
1895
|
if (!quant_weights) {
|
|
1896
1896
|
quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
1897
1897
|
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
|
@@ -1906,7 +1906,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
1906
1906
|
return nrow * row_size;
|
|
1907
1907
|
}
|
|
1908
1908
|
|
|
1909
|
-
static void quantize_row_q5_0_impl(const float *
|
|
1909
|
+
static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
1910
1910
|
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
|
1911
1911
|
|
|
1912
1912
|
if (!quant_weights) {
|
|
@@ -1945,7 +1945,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
|
1945
1945
|
}
|
|
1946
1946
|
}
|
|
1947
1947
|
|
|
1948
|
-
size_t quantize_q5_0(const float *
|
|
1948
|
+
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
1949
1949
|
if (!quant_weights) {
|
|
1950
1950
|
quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
1951
1951
|
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
|
@@ -1960,7 +1960,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
1960
1960
|
return nrow * row_size;
|
|
1961
1961
|
}
|
|
1962
1962
|
|
|
1963
|
-
static void quantize_row_q5_1_impl(const float *
|
|
1963
|
+
static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
|
|
1964
1964
|
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
|
1965
1965
|
|
|
1966
1966
|
if (!quant_weights) {
|
|
@@ -1998,7 +1998,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
|
1998
1998
|
}
|
|
1999
1999
|
}
|
|
2000
2000
|
|
|
2001
|
-
size_t quantize_q5_1(const float *
|
|
2001
|
+
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
2002
2002
|
if (!quant_weights) {
|
|
2003
2003
|
quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
2004
2004
|
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
|
@@ -2013,7 +2013,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
2013
2013
|
return nrow * row_size;
|
|
2014
2014
|
}
|
|
2015
2015
|
|
|
2016
|
-
size_t quantize_q8_0(const float *
|
|
2016
|
+
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
2017
2017
|
(void)quant_weights; // not used
|
|
2018
2018
|
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
|
2019
2019
|
quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
@@ -2022,7 +2022,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
|
|
|
2022
2022
|
|
|
2023
2023
|
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)
|
|
2024
2024
|
|
|
2025
|
-
void quantize_row_tq1_0_ref(const float *
|
|
2025
|
+
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) {
|
|
2026
2026
|
assert(k % QK_K == 0);
|
|
2027
2027
|
const int64_t nb = k / QK_K;
|
|
2028
2028
|
|
|
@@ -2088,7 +2088,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
|
|
|
2088
2088
|
}
|
|
2089
2089
|
}
|
|
2090
2090
|
|
|
2091
|
-
void quantize_row_tq2_0_ref(const float *
|
|
2091
|
+
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) {
|
|
2092
2092
|
assert(k % QK_K == 0);
|
|
2093
2093
|
const int64_t nb = k / QK_K;
|
|
2094
2094
|
|
|
@@ -2120,21 +2120,21 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
|
|
|
2120
2120
|
}
|
|
2121
2121
|
}
|
|
2122
2122
|
|
|
2123
|
-
size_t quantize_tq1_0(const float *
|
|
2123
|
+
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
2124
2124
|
(void)quant_weights; // not used
|
|
2125
2125
|
const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row);
|
|
2126
2126
|
quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
2127
2127
|
return nrow * row_size;
|
|
2128
2128
|
}
|
|
2129
2129
|
|
|
2130
|
-
size_t quantize_tq2_0(const float *
|
|
2130
|
+
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
2131
2131
|
(void)quant_weights; // not used
|
|
2132
2132
|
const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row);
|
|
2133
2133
|
quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
|
2134
2134
|
return nrow * row_size;
|
|
2135
2135
|
}
|
|
2136
2136
|
|
|
2137
|
-
void dequantize_row_tq1_0(const block_tq1_0 *
|
|
2137
|
+
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2138
2138
|
assert(k % QK_K == 0);
|
|
2139
2139
|
const int64_t nb = k / QK_K;
|
|
2140
2140
|
|
|
@@ -2173,7 +2173,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in
|
|
|
2173
2173
|
}
|
|
2174
2174
|
}
|
|
2175
2175
|
|
|
2176
|
-
void dequantize_row_tq2_0(const block_tq2_0 *
|
|
2176
|
+
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2177
2177
|
assert(k % QK_K == 0);
|
|
2178
2178
|
const int64_t nb = k / QK_K;
|
|
2179
2179
|
|
|
@@ -2194,7 +2194,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in
|
|
|
2194
2194
|
|
|
2195
2195
|
// ====================== "True" 2-bit (de)-quantization
|
|
2196
2196
|
|
|
2197
|
-
void dequantize_row_iq2_xxs(const block_iq2_xxs *
|
|
2197
|
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2198
2198
|
assert(k % QK_K == 0);
|
|
2199
2199
|
const int64_t nb = k / QK_K;
|
|
2200
2200
|
|
|
@@ -2222,7 +2222,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
|
|
|
2222
2222
|
|
|
2223
2223
|
// ====================== 2.3125 bpw (de)-quantization
|
|
2224
2224
|
|
|
2225
|
-
void dequantize_row_iq2_xs(const block_iq2_xs *
|
|
2225
|
+
void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2226
2226
|
assert(k % QK_K == 0);
|
|
2227
2227
|
const int64_t nb = k / QK_K;
|
|
2228
2228
|
|
|
@@ -2249,7 +2249,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
|
2249
2249
|
|
|
2250
2250
|
// ====================== 2.5625 bpw (de)-quantization
|
|
2251
2251
|
|
|
2252
|
-
void dequantize_row_iq2_s(const block_iq2_s *
|
|
2252
|
+
void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2253
2253
|
assert(k % QK_K == 0);
|
|
2254
2254
|
const int64_t nb = k / QK_K;
|
|
2255
2255
|
|
|
@@ -2281,7 +2281,7 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in
|
|
|
2281
2281
|
|
|
2282
2282
|
// ====================== 3.0625 bpw (de)-quantization
|
|
2283
2283
|
|
|
2284
|
-
void dequantize_row_iq3_xxs(const block_iq3_xxs *
|
|
2284
|
+
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2285
2285
|
assert(k % QK_K == 0);
|
|
2286
2286
|
const int64_t nb = k / QK_K;
|
|
2287
2287
|
|
|
@@ -2313,7 +2313,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
|
|
|
2313
2313
|
|
|
2314
2314
|
// ====================== 3.3125 bpw (de)-quantization
|
|
2315
2315
|
|
|
2316
|
-
void dequantize_row_iq3_s(const block_iq3_s *
|
|
2316
|
+
void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2317
2317
|
assert(k % QK_K == 0);
|
|
2318
2318
|
const int64_t nb = k / QK_K;
|
|
2319
2319
|
|
|
@@ -2356,7 +2356,7 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in
|
|
|
2356
2356
|
|
|
2357
2357
|
// ====================== 1.5625 bpw (de)-quantization
|
|
2358
2358
|
|
|
2359
|
-
void dequantize_row_iq1_s(const block_iq1_s *
|
|
2359
|
+
void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2360
2360
|
assert(k % QK_K == 0);
|
|
2361
2361
|
const int64_t nb = k / QK_K;
|
|
2362
2362
|
|
|
@@ -2381,7 +2381,7 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
|
|
|
2381
2381
|
}
|
|
2382
2382
|
}
|
|
2383
2383
|
|
|
2384
|
-
void dequantize_row_iq1_m(const block_iq1_m *
|
|
2384
|
+
void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2385
2385
|
assert(k % QK_K == 0);
|
|
2386
2386
|
const int64_t nb = k / QK_K;
|
|
2387
2387
|
|
|
@@ -2433,7 +2433,7 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
|
|
|
2433
2433
|
|
|
2434
2434
|
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
|
2435
2435
|
|
|
2436
|
-
void dequantize_row_iq4_nl(const block_iq4_nl *
|
|
2436
|
+
void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2437
2437
|
assert(k % QK4_NL == 0);
|
|
2438
2438
|
const int64_t nb = k / QK4_NL;
|
|
2439
2439
|
|
|
@@ -2451,7 +2451,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y,
|
|
|
2451
2451
|
}
|
|
2452
2452
|
}
|
|
2453
2453
|
|
|
2454
|
-
void dequantize_row_iq4_xs(const block_iq4_xs *
|
|
2454
|
+
void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2455
2455
|
assert(k % QK_K == 0);
|
|
2456
2456
|
const int64_t nb = k / QK_K;
|
|
2457
2457
|
|
|
@@ -2476,7 +2476,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
|
|
2476
2476
|
|
|
2477
2477
|
//===================================== Q8_K ==============================================
|
|
2478
2478
|
|
|
2479
|
-
void quantize_row_q8_K_ref(const float *
|
|
2479
|
+
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) {
|
|
2480
2480
|
assert(k % QK_K == 0);
|
|
2481
2481
|
const int64_t nb = k / QK_K;
|
|
2482
2482
|
|
|
@@ -2515,7 +2515,7 @@ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, in
|
|
|
2515
2515
|
}
|
|
2516
2516
|
}
|
|
2517
2517
|
|
|
2518
|
-
void dequantize_row_q8_K(const block_q8_K *
|
|
2518
|
+
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
|
2519
2519
|
assert(k % QK_K == 0);
|
|
2520
2520
|
const int64_t nb = k / QK_K;
|
|
2521
2521
|
|
|
@@ -2927,8 +2927,8 @@ void iq2xs_free_impl(enum ggml_type type) {
|
|
|
2927
2927
|
}
|
|
2928
2928
|
}
|
|
2929
2929
|
|
|
2930
|
-
static int iq2_find_best_neighbour(const uint16_t *
|
|
2931
|
-
const float *
|
|
2930
|
+
static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
|
2931
|
+
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
|
|
2932
2932
|
int num_neighbors = neighbours[0];
|
|
2933
2933
|
GGML_ASSERT(num_neighbors > 0);
|
|
2934
2934
|
float best_d2 = FLT_MAX;
|
|
@@ -2951,7 +2951,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
|
2951
2951
|
return grid_index;
|
|
2952
2952
|
}
|
|
2953
2953
|
|
|
2954
|
-
static void quantize_row_iq2_xxs_impl(const float *
|
|
2954
|
+
static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
|
2955
2955
|
|
|
2956
2956
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
|
|
2957
2957
|
|
|
@@ -3124,7 +3124,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|
|
3124
3124
|
}
|
|
3125
3125
|
}
|
|
3126
3126
|
|
|
3127
|
-
static void quantize_row_iq2_xs_impl(const float *
|
|
3127
|
+
static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
|
3128
3128
|
|
|
3129
3129
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
|
|
3130
3130
|
|
|
@@ -3304,7 +3304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
|
3304
3304
|
}
|
|
3305
3305
|
}
|
|
3306
3306
|
|
|
3307
|
-
size_t quantize_iq2_xxs(const float *
|
|
3307
|
+
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
3308
3308
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
3309
3309
|
int64_t nblock = n_per_row/QK_K;
|
|
3310
3310
|
char * qrow = (char *)dst;
|
|
@@ -3316,7 +3316,7 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t
|
|
|
3316
3316
|
return nrow * nblock * sizeof(block_iq2_xxs);
|
|
3317
3317
|
}
|
|
3318
3318
|
|
|
3319
|
-
size_t quantize_iq2_xs(const float *
|
|
3319
|
+
size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
3320
3320
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
3321
3321
|
int64_t nblock = n_per_row/QK_K;
|
|
3322
3322
|
char * qrow = (char *)dst;
|
|
@@ -3521,8 +3521,8 @@ void iq3xs_free_impl(int grid_size) {
|
|
|
3521
3521
|
}
|
|
3522
3522
|
}
|
|
3523
3523
|
|
|
3524
|
-
static int iq3_find_best_neighbour(const uint16_t *
|
|
3525
|
-
const float *
|
|
3524
|
+
static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid,
|
|
3525
|
+
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) {
|
|
3526
3526
|
int num_neighbors = neighbours[0];
|
|
3527
3527
|
GGML_ASSERT(num_neighbors > 0);
|
|
3528
3528
|
float best_d2 = FLT_MAX;
|
|
@@ -3545,8 +3545,8 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
|
3545
3545
|
return grid_index;
|
|
3546
3546
|
}
|
|
3547
3547
|
|
|
3548
|
-
static void quantize_row_iq3_xxs_impl(int grid_size, const float *
|
|
3549
|
-
const float *
|
|
3548
|
+
static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n,
|
|
3549
|
+
const float * GGML_RESTRICT quant_weights) {
|
|
3550
3550
|
|
|
3551
3551
|
const int gindex = iq3_data_index(grid_size);
|
|
3552
3552
|
|
|
@@ -3758,7 +3758,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
|
3758
3758
|
}
|
|
3759
3759
|
}
|
|
3760
3760
|
|
|
3761
|
-
size_t quantize_iq3_xxs(const float *
|
|
3761
|
+
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
3762
3762
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
3763
3763
|
int64_t nblock = n_per_row/QK_K;
|
|
3764
3764
|
char * qrow = (char *)dst;
|
|
@@ -3770,13 +3770,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
|
|
|
3770
3770
|
return nrow * nblock * sizeof(block_iq3_xxs);
|
|
3771
3771
|
}
|
|
3772
3772
|
|
|
3773
|
-
void quantize_row_iq3_xxs_ref(const float *
|
|
3773
|
+
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) {
|
|
3774
3774
|
assert(k % QK_K == 0);
|
|
3775
3775
|
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
|
3776
3776
|
}
|
|
3777
3777
|
|
|
3778
|
-
static void quantize_row_iq3_s_impl(int block_size, const float *
|
|
3779
|
-
const float *
|
|
3778
|
+
static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n,
|
|
3779
|
+
const float * GGML_RESTRICT quant_weights,
|
|
3780
3780
|
float * scales,
|
|
3781
3781
|
float * weight,
|
|
3782
3782
|
float * xval,
|
|
@@ -3958,7 +3958,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
|
3958
3958
|
}
|
|
3959
3959
|
|
|
3960
3960
|
#define IQ3S_BLOCK_SIZE 32
|
|
3961
|
-
size_t quantize_iq3_s(const float *
|
|
3961
|
+
size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
3962
3962
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
3963
3963
|
int64_t nblock = n_per_row/QK_K;
|
|
3964
3964
|
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
|
@@ -3980,7 +3980,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
|
|
|
3980
3980
|
return nrow * nblock * sizeof(block_iq3_s);
|
|
3981
3981
|
}
|
|
3982
3982
|
|
|
3983
|
-
void quantize_row_iq3_s_ref(const float *
|
|
3983
|
+
void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) {
|
|
3984
3984
|
assert(k % QK_K == 0);
|
|
3985
3985
|
quantize_iq3_s(x, y, 1, k, NULL);
|
|
3986
3986
|
}
|
|
@@ -3988,8 +3988,8 @@ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y,
|
|
|
3988
3988
|
|
|
3989
3989
|
// =================================== 1.5 bpw ===================================================
|
|
3990
3990
|
|
|
3991
|
-
static int iq1_find_best_neighbour(const uint16_t *
|
|
3992
|
-
const float *
|
|
3991
|
+
static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
|
3992
|
+
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) {
|
|
3993
3993
|
int num_neighbors = neighbours[0];
|
|
3994
3994
|
GGML_ASSERT(num_neighbors > 0);
|
|
3995
3995
|
float best_score = -FLT_MAX;
|
|
@@ -4048,8 +4048,8 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
|
4048
4048
|
return grid_index;
|
|
4049
4049
|
}
|
|
4050
4050
|
|
|
4051
|
-
static int iq1_find_best_neighbour2(const uint16_t *
|
|
4052
|
-
const float *
|
|
4051
|
+
static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid,
|
|
4052
|
+
const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) {
|
|
4053
4053
|
int num_neighbors = neighbours[0];
|
|
4054
4054
|
GGML_ASSERT(num_neighbors > 0);
|
|
4055
4055
|
float best_score = FLT_MAX;
|
|
@@ -4113,7 +4113,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
|
|
|
4113
4113
|
|
|
4114
4114
|
#define IQ1S_BLOCK_SIZE 32
|
|
4115
4115
|
#define IQ1M_BLOCK_SIZE 16
|
|
4116
|
-
static void quantize_row_iq1_s_impl(const float *
|
|
4116
|
+
static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
|
|
4117
4117
|
float * scales,
|
|
4118
4118
|
float * weight,
|
|
4119
4119
|
float * sumx,
|
|
@@ -4271,7 +4271,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
|
4271
4271
|
}
|
|
4272
4272
|
}
|
|
4273
4273
|
|
|
4274
|
-
size_t quantize_iq1_s(const float *
|
|
4274
|
+
size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
4275
4275
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
4276
4276
|
float scales[QK_K/IQ1S_BLOCK_SIZE];
|
|
4277
4277
|
float weight[IQ1S_BLOCK_SIZE];
|
|
@@ -4291,7 +4291,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n
|
|
|
4291
4291
|
return nrow * nblock * sizeof(block_iq1_s);
|
|
4292
4292
|
}
|
|
4293
4293
|
|
|
4294
|
-
static void quantize_row_iq1_m_impl(const float *
|
|
4294
|
+
static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights,
|
|
4295
4295
|
float * scales,
|
|
4296
4296
|
float * weight,
|
|
4297
4297
|
float * pairs,
|
|
@@ -4539,7 +4539,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
|
|
4539
4539
|
}
|
|
4540
4540
|
}
|
|
4541
4541
|
|
|
4542
|
-
size_t quantize_iq1_m(const float *
|
|
4542
|
+
size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
4543
4543
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
4544
4544
|
float scales[QK_K/IQ1M_BLOCK_SIZE];
|
|
4545
4545
|
float weight[IQ1M_BLOCK_SIZE];
|
|
@@ -4570,7 +4570,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
|
4570
4570
|
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
|
4571
4571
|
}
|
|
4572
4572
|
|
|
4573
|
-
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float *
|
|
4573
|
+
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
|
|
4574
4574
|
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
|
4575
4575
|
float * scales, float * weight, uint8_t * L,
|
|
4576
4576
|
const int8_t * values,
|
|
@@ -4681,7 +4681,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
|
4681
4681
|
}
|
|
4682
4682
|
}
|
|
4683
4683
|
|
|
4684
|
-
size_t quantize_iq4_nl(const float *
|
|
4684
|
+
size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
4685
4685
|
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
|
4686
4686
|
int64_t nblock = n_per_row/QK4_NL;
|
|
4687
4687
|
char * qrow = (char *)dst;
|
|
@@ -4703,8 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t
|
|
|
4703
4703
|
return nrow * nblock * sizeof(block_iq4_nl);
|
|
4704
4704
|
}
|
|
4705
4705
|
|
|
4706
|
-
//void quantize_row_iq4_nl_ref(const float *
|
|
4707
|
-
void quantize_row_iq4_nl_ref(const float *
|
|
4706
|
+
//void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
|
4707
|
+
void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) {
|
|
4708
4708
|
GGML_ASSERT(k%QK4_NL == 0);
|
|
4709
4709
|
int64_t nblock = k/QK4_NL;
|
|
4710
4710
|
uint8_t L[QK4_NL];
|
|
@@ -4719,7 +4719,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
|
|
|
4719
4719
|
}
|
|
4720
4720
|
}
|
|
4721
4721
|
|
|
4722
|
-
size_t quantize_iq4_xs(const float *
|
|
4722
|
+
size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
4723
4723
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
4724
4724
|
int64_t nblock = n_per_row/QK_K;
|
|
4725
4725
|
char * qrow = (char *)dst;
|
|
@@ -4739,14 +4739,14 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
|
|
|
4739
4739
|
return nrow * nblock * sizeof(block_iq4_xs);
|
|
4740
4740
|
}
|
|
4741
4741
|
|
|
4742
|
-
void quantize_row_iq4_xs_ref(const float *
|
|
4742
|
+
void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) {
|
|
4743
4743
|
assert(k % QK_K == 0);
|
|
4744
4744
|
quantize_iq4_xs(x, y, 1, k, NULL);
|
|
4745
4745
|
}
|
|
4746
4746
|
|
|
4747
4747
|
// =============================== 2.5625 bpw
|
|
4748
4748
|
|
|
4749
|
-
static void quantize_row_iq2_s_impl(const float *
|
|
4749
|
+
static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) {
|
|
4750
4750
|
|
|
4751
4751
|
const int gindex = iq2_data_index(GGML_TYPE_IQ2_S);
|
|
4752
4752
|
|
|
@@ -4914,7 +4914,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
|
4914
4914
|
}
|
|
4915
4915
|
}
|
|
4916
4916
|
|
|
4917
|
-
size_t quantize_iq2_s(const float *
|
|
4917
|
+
size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
|
4918
4918
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
|
4919
4919
|
int64_t nblock = n_per_row/QK_K;
|
|
4920
4920
|
char * qrow = (char *)dst;
|
|
@@ -4926,7 +4926,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
|
|
|
4926
4926
|
return nrow * nblock * sizeof(block_iq2_s);
|
|
4927
4927
|
}
|
|
4928
4928
|
|
|
4929
|
-
void quantize_row_iq2_s_ref(const float *
|
|
4929
|
+
void quantize_row_iq2_s_ref(const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k) {
|
|
4930
4930
|
assert(k % QK_K == 0);
|
|
4931
4931
|
quantize_iq2_s(x, y, 1, k, NULL);
|
|
4932
4932
|
}
|