@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
// Unit tests for quantization specific functions - quantize, dequantize and dot product
|
|
2
2
|
|
|
3
3
|
#include "ggml.h"
|
|
4
|
+
#include "ggml-cpu.h"
|
|
4
5
|
|
|
5
6
|
#undef NDEBUG
|
|
6
7
|
#include <assert.h>
|
|
@@ -15,11 +16,13 @@
|
|
|
15
16
|
|
|
16
17
|
constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
|
|
17
18
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
|
|
19
|
+
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_TERNARY = 0.01f;
|
|
18
20
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
|
|
19
21
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
|
20
22
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
|
|
21
23
|
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
|
22
24
|
constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
|
|
25
|
+
constexpr float MAX_DOT_PRODUCT_ERROR_TERNARY = 0.15f;
|
|
23
26
|
|
|
24
27
|
static const char* RESULT_STR[] = {"ok", "FAILED"};
|
|
25
28
|
|
|
@@ -42,26 +45,27 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
|
|
|
42
45
|
}
|
|
43
46
|
|
|
44
47
|
// Total quantization error on test data
|
|
45
|
-
static float total_quantization_error(
|
|
48
|
+
static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
|
|
46
49
|
std::vector<uint8_t> tmp_q(2*test_size);
|
|
47
50
|
std::vector<float> tmp_out(test_size);
|
|
48
51
|
|
|
49
|
-
|
|
50
|
-
qfns
|
|
52
|
+
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
|
|
53
|
+
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
|
51
54
|
return array_rmse(test_data, tmp_out.data(), test_size);
|
|
52
55
|
}
|
|
53
56
|
|
|
54
57
|
// Total quantization error on test data
|
|
55
|
-
static float reference_quantization_error(
|
|
58
|
+
static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
|
|
56
59
|
std::vector<uint8_t> tmp_q(2*test_size);
|
|
57
60
|
std::vector<float> tmp_out(test_size);
|
|
58
61
|
std::vector<float> tmp_out_ref(test_size);
|
|
59
62
|
|
|
60
|
-
|
|
61
|
-
|
|
63
|
+
// FIXME: why is done twice?
|
|
64
|
+
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
|
|
65
|
+
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
|
62
66
|
|
|
63
|
-
qfns
|
|
64
|
-
qfns
|
|
67
|
+
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
|
|
68
|
+
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
|
65
69
|
|
|
66
70
|
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
|
67
71
|
}
|
|
@@ -76,18 +80,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
|
|
76
80
|
|
|
77
81
|
// Total dot product error
|
|
78
82
|
static float dot_product_error(
|
|
79
|
-
|
|
83
|
+
const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
|
|
80
84
|
) {
|
|
81
85
|
std::vector<uint8_t> tmp_q1(2*test_size);
|
|
82
86
|
std::vector<uint8_t> tmp_q2(2*test_size);
|
|
83
87
|
|
|
84
|
-
auto vdot =
|
|
88
|
+
const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
|
|
85
89
|
|
|
86
|
-
|
|
87
|
-
vdot
|
|
90
|
+
qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
|
|
91
|
+
vdot->from_float(test_data2, tmp_q2.data(), test_size);
|
|
88
92
|
|
|
89
93
|
float result = INFINITY;
|
|
90
|
-
|
|
94
|
+
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
|
91
95
|
|
|
92
96
|
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
|
93
97
|
|
|
@@ -129,10 +133,11 @@ int main(int argc, char * argv[]) {
|
|
|
129
133
|
|
|
130
134
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
|
131
135
|
ggml_type type = (ggml_type) i;
|
|
132
|
-
|
|
136
|
+
const auto * qfns = ggml_get_type_traits(type);
|
|
137
|
+
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
|
133
138
|
|
|
134
139
|
// deprecated - skip
|
|
135
|
-
if (qfns
|
|
140
|
+
if (qfns->blck_size == 0) {
|
|
136
141
|
continue;
|
|
137
142
|
}
|
|
138
143
|
|
|
@@ -141,9 +146,11 @@ int main(int argc, char * argv[]) {
|
|
|
141
146
|
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
|
142
147
|
ggml_quantize_init(ei);
|
|
143
148
|
|
|
144
|
-
if (
|
|
145
|
-
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
|
149
|
+
if (qfns_cpu->from_float && qfns->to_float) {
|
|
150
|
+
const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
|
|
146
151
|
const float max_quantization_error =
|
|
152
|
+
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
|
153
|
+
type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
|
147
154
|
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
|
148
155
|
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
|
149
156
|
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
|
|
@@ -155,17 +162,19 @@ int main(int argc, char * argv[]) {
|
|
|
155
162
|
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
|
|
156
163
|
}
|
|
157
164
|
|
|
158
|
-
const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
|
|
165
|
+
const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
|
|
159
166
|
failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
|
|
160
167
|
num_failed += failed;
|
|
161
168
|
if (failed || verbose) {
|
|
162
169
|
printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
|
|
163
170
|
}
|
|
164
171
|
|
|
165
|
-
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
|
172
|
+
const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
|
|
166
173
|
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
|
167
174
|
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
|
168
175
|
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
|
176
|
+
: type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
|
|
177
|
+
? MAX_DOT_PRODUCT_ERROR_TERNARY
|
|
169
178
|
: MAX_DOT_PRODUCT_ERROR;
|
|
170
179
|
failed = !(vec_dot_error < max_allowed_error);
|
|
171
180
|
num_failed += failed;
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
// Benchmark quantization specific functions on synthetic data
|
|
2
2
|
|
|
3
3
|
#include "ggml.h"
|
|
4
|
+
#include "ggml-cpu.h"
|
|
4
5
|
|
|
5
6
|
#undef NDEBUG
|
|
6
7
|
#include <algorithm>
|
|
7
8
|
#include <assert.h>
|
|
8
9
|
#include <functional>
|
|
9
|
-
#include <inttypes.h>
|
|
10
10
|
#include <math.h>
|
|
11
11
|
#include <memory>
|
|
12
12
|
#include <stdio.h>
|
|
@@ -122,9 +122,10 @@ static void usage(char * argv[]) {
|
|
|
122
122
|
printf(" --type TYPE set test type as");
|
|
123
123
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
|
124
124
|
ggml_type type = (ggml_type) i;
|
|
125
|
-
|
|
125
|
+
const auto * qfns = ggml_get_type_traits(type);
|
|
126
|
+
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
|
126
127
|
if (ggml_type_name(type) != NULL) {
|
|
127
|
-
if (
|
|
128
|
+
if (qfns_cpu->from_float && qfns->to_float) {
|
|
128
129
|
printf(" %s", ggml_type_name(type));
|
|
129
130
|
}
|
|
130
131
|
}
|
|
@@ -270,12 +271,13 @@ int main(int argc, char * argv[]) {
|
|
|
270
271
|
|
|
271
272
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
|
272
273
|
ggml_type type = (ggml_type) i;
|
|
273
|
-
|
|
274
|
+
const auto * qfns = ggml_get_type_traits(type);
|
|
275
|
+
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
|
274
276
|
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
|
275
277
|
continue;
|
|
276
278
|
}
|
|
277
279
|
|
|
278
|
-
if (
|
|
280
|
+
if (qfns_cpu->from_float && qfns->to_float) {
|
|
279
281
|
printf("%s\n", ggml_type_name(type));
|
|
280
282
|
|
|
281
283
|
ggml_quantize_init(type);
|
|
@@ -285,7 +287,7 @@ int main(int argc, char * argv[]) {
|
|
|
285
287
|
for (size_t size : params.test_sizes) {
|
|
286
288
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
287
289
|
auto quantize_fn = [&](void) -> float {
|
|
288
|
-
qfns
|
|
290
|
+
qfns->from_float_ref(test_data1, test_q1, size);
|
|
289
291
|
return test_q1[0];
|
|
290
292
|
};
|
|
291
293
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -299,7 +301,7 @@ int main(int argc, char * argv[]) {
|
|
|
299
301
|
for (size_t size : params.test_sizes) {
|
|
300
302
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
301
303
|
auto quantize_fn = [&](void) -> float {
|
|
302
|
-
|
|
304
|
+
qfns_cpu->from_float(test_data1, test_q1, size);
|
|
303
305
|
return test_q1[0];
|
|
304
306
|
};
|
|
305
307
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -310,11 +312,11 @@ int main(int argc, char * argv[]) {
|
|
|
310
312
|
|
|
311
313
|
if (params.op_dequantize_row_q) {
|
|
312
314
|
printf(" dequantize_row_q\n");
|
|
313
|
-
|
|
315
|
+
qfns_cpu->from_float(test_data1, test_q1, largest);
|
|
314
316
|
for (size_t size : params.test_sizes) {
|
|
315
317
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
316
318
|
auto quantize_fn = [&](void) -> float {
|
|
317
|
-
qfns
|
|
319
|
+
qfns->to_float(test_q1, test_out, size);
|
|
318
320
|
return test_out[0];
|
|
319
321
|
};
|
|
320
322
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -328,8 +330,8 @@ int main(int argc, char * argv[]) {
|
|
|
328
330
|
for (size_t size : params.test_sizes) {
|
|
329
331
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
330
332
|
auto quantize_fn = [&](void) -> float {
|
|
331
|
-
auto vdot =
|
|
332
|
-
vdot
|
|
333
|
+
const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
|
|
334
|
+
vdot->from_float(test_data1, test_q1, size);
|
|
333
335
|
return test_q1[0];
|
|
334
336
|
};
|
|
335
337
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -340,13 +342,13 @@ int main(int argc, char * argv[]) {
|
|
|
340
342
|
|
|
341
343
|
if (params.op_vec_dot_q) {
|
|
342
344
|
printf(" vec_dot_q\n");
|
|
343
|
-
|
|
344
|
-
|
|
345
|
+
qfns_cpu->from_float(test_data1, test_q1, largest);
|
|
346
|
+
qfns_cpu->from_float(test_data2, test_q2, largest);
|
|
345
347
|
for (size_t size : params.test_sizes) {
|
|
346
348
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
347
349
|
auto quantize_fn = [&](void) -> float {
|
|
348
350
|
float result;
|
|
349
|
-
|
|
351
|
+
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
|
350
352
|
return result;
|
|
351
353
|
};
|
|
352
354
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#include "ggml.h"
|
|
2
|
+
#include "ggml-cpu.h"
|
|
2
3
|
|
|
3
4
|
#include <cmath>
|
|
4
5
|
#include <cstdio>
|
|
@@ -113,7 +114,7 @@ static struct ggml_tensor * get_random_tensor_f32(
|
|
|
113
114
|
}
|
|
114
115
|
|
|
115
116
|
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
|
116
|
-
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
|
117
|
+
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
|
117
118
|
|
|
118
119
|
if (plan.work_size > 0) {
|
|
119
120
|
buf.resize(plan.work_size);
|