@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
// Unit tests for quantization specific functions - quantize, dequantize and dot product
|
|
2
2
|
|
|
3
3
|
#include "ggml.h"
|
|
4
|
+
#include "ggml-cpu.h"
|
|
4
5
|
|
|
5
6
|
#undef NDEBUG
|
|
6
7
|
#include <assert.h>
|
|
@@ -44,26 +45,27 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
|
|
|
44
45
|
}
|
|
45
46
|
|
|
46
47
|
// Total quantization error on test data
|
|
47
|
-
static float total_quantization_error(
|
|
48
|
+
static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
|
|
48
49
|
std::vector<uint8_t> tmp_q(2*test_size);
|
|
49
50
|
std::vector<float> tmp_out(test_size);
|
|
50
51
|
|
|
51
|
-
|
|
52
|
-
qfns
|
|
52
|
+
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
|
|
53
|
+
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
|
53
54
|
return array_rmse(test_data, tmp_out.data(), test_size);
|
|
54
55
|
}
|
|
55
56
|
|
|
56
57
|
// Total quantization error on test data
|
|
57
|
-
static float reference_quantization_error(
|
|
58
|
+
static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
|
|
58
59
|
std::vector<uint8_t> tmp_q(2*test_size);
|
|
59
60
|
std::vector<float> tmp_out(test_size);
|
|
60
61
|
std::vector<float> tmp_out_ref(test_size);
|
|
61
62
|
|
|
62
|
-
|
|
63
|
-
|
|
63
|
+
// FIXME: why is done twice?
|
|
64
|
+
qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
|
|
65
|
+
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
|
|
64
66
|
|
|
65
|
-
qfns
|
|
66
|
-
qfns
|
|
67
|
+
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
|
|
68
|
+
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
|
67
69
|
|
|
68
70
|
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
|
69
71
|
}
|
|
@@ -77,19 +79,19 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
|
|
77
79
|
}
|
|
78
80
|
|
|
79
81
|
// Total dot product error
|
|
80
|
-
static float dot_product_error(
|
|
81
|
-
|
|
82
|
-
|
|
82
|
+
static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
|
|
83
|
+
GGML_UNUSED(qfns);
|
|
84
|
+
|
|
83
85
|
std::vector<uint8_t> tmp_q1(2*test_size);
|
|
84
86
|
std::vector<uint8_t> tmp_q2(2*test_size);
|
|
85
87
|
|
|
86
|
-
auto vdot =
|
|
88
|
+
const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
|
|
87
89
|
|
|
88
|
-
|
|
89
|
-
vdot
|
|
90
|
+
qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
|
|
91
|
+
vdot->from_float(test_data2, tmp_q2.data(), test_size);
|
|
90
92
|
|
|
91
93
|
float result = INFINITY;
|
|
92
|
-
|
|
94
|
+
qfns_cpu->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
|
|
93
95
|
|
|
94
96
|
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
|
95
97
|
|
|
@@ -131,10 +133,11 @@ int main(int argc, char * argv[]) {
|
|
|
131
133
|
|
|
132
134
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
|
133
135
|
ggml_type type = (ggml_type) i;
|
|
134
|
-
|
|
136
|
+
const auto * qfns = ggml_get_type_traits(type);
|
|
137
|
+
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
|
135
138
|
|
|
136
139
|
// deprecated - skip
|
|
137
|
-
if (qfns
|
|
140
|
+
if (qfns->blck_size == 0) {
|
|
138
141
|
continue;
|
|
139
142
|
}
|
|
140
143
|
|
|
@@ -143,8 +146,8 @@ int main(int argc, char * argv[]) {
|
|
|
143
146
|
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
|
144
147
|
ggml_quantize_init(ei);
|
|
145
148
|
|
|
146
|
-
if (
|
|
147
|
-
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
|
149
|
+
if (qfns_cpu->from_float && qfns->to_float) {
|
|
150
|
+
const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
|
|
148
151
|
const float max_quantization_error =
|
|
149
152
|
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
|
150
153
|
type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
|
|
@@ -159,14 +162,14 @@ int main(int argc, char * argv[]) {
|
|
|
159
162
|
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
|
|
160
163
|
}
|
|
161
164
|
|
|
162
|
-
const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
|
|
165
|
+
const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
|
|
163
166
|
failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
|
|
164
167
|
num_failed += failed;
|
|
165
168
|
if (failed || verbose) {
|
|
166
169
|
printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
|
|
167
170
|
}
|
|
168
171
|
|
|
169
|
-
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
|
|
172
|
+
const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
|
|
170
173
|
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
|
|
171
174
|
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
|
|
172
175
|
? MAX_DOT_PRODUCT_ERROR_LOWBIT
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
// Benchmark quantization specific functions on synthetic data
|
|
2
2
|
|
|
3
3
|
#include "ggml.h"
|
|
4
|
+
#include "ggml-cpu.h"
|
|
4
5
|
|
|
5
6
|
#undef NDEBUG
|
|
6
7
|
#include <algorithm>
|
|
7
8
|
#include <assert.h>
|
|
8
9
|
#include <functional>
|
|
9
|
-
#include <inttypes.h>
|
|
10
10
|
#include <math.h>
|
|
11
11
|
#include <memory>
|
|
12
12
|
#include <stdio.h>
|
|
@@ -122,9 +122,10 @@ static void usage(char * argv[]) {
|
|
|
122
122
|
printf(" --type TYPE set test type as");
|
|
123
123
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
|
124
124
|
ggml_type type = (ggml_type) i;
|
|
125
|
-
|
|
125
|
+
const auto * qfns = ggml_get_type_traits(type);
|
|
126
|
+
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
|
126
127
|
if (ggml_type_name(type) != NULL) {
|
|
127
|
-
if (
|
|
128
|
+
if (qfns_cpu->from_float && qfns->to_float) {
|
|
128
129
|
printf(" %s", ggml_type_name(type));
|
|
129
130
|
}
|
|
130
131
|
}
|
|
@@ -270,12 +271,13 @@ int main(int argc, char * argv[]) {
|
|
|
270
271
|
|
|
271
272
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
|
272
273
|
ggml_type type = (ggml_type) i;
|
|
273
|
-
|
|
274
|
+
const auto * qfns = ggml_get_type_traits(type);
|
|
275
|
+
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
|
274
276
|
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
|
275
277
|
continue;
|
|
276
278
|
}
|
|
277
279
|
|
|
278
|
-
if (
|
|
280
|
+
if (qfns_cpu->from_float && qfns->to_float) {
|
|
279
281
|
printf("%s\n", ggml_type_name(type));
|
|
280
282
|
|
|
281
283
|
ggml_quantize_init(type);
|
|
@@ -285,7 +287,7 @@ int main(int argc, char * argv[]) {
|
|
|
285
287
|
for (size_t size : params.test_sizes) {
|
|
286
288
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
287
289
|
auto quantize_fn = [&](void) -> float {
|
|
288
|
-
qfns
|
|
290
|
+
qfns->from_float_ref(test_data1, test_q1, size);
|
|
289
291
|
return test_q1[0];
|
|
290
292
|
};
|
|
291
293
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -299,7 +301,7 @@ int main(int argc, char * argv[]) {
|
|
|
299
301
|
for (size_t size : params.test_sizes) {
|
|
300
302
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
301
303
|
auto quantize_fn = [&](void) -> float {
|
|
302
|
-
|
|
304
|
+
qfns_cpu->from_float(test_data1, test_q1, size);
|
|
303
305
|
return test_q1[0];
|
|
304
306
|
};
|
|
305
307
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -310,11 +312,11 @@ int main(int argc, char * argv[]) {
|
|
|
310
312
|
|
|
311
313
|
if (params.op_dequantize_row_q) {
|
|
312
314
|
printf(" dequantize_row_q\n");
|
|
313
|
-
|
|
315
|
+
qfns_cpu->from_float(test_data1, test_q1, largest);
|
|
314
316
|
for (size_t size : params.test_sizes) {
|
|
315
317
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
316
318
|
auto quantize_fn = [&](void) -> float {
|
|
317
|
-
qfns
|
|
319
|
+
qfns->to_float(test_q1, test_out, size);
|
|
318
320
|
return test_out[0];
|
|
319
321
|
};
|
|
320
322
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -328,8 +330,8 @@ int main(int argc, char * argv[]) {
|
|
|
328
330
|
for (size_t size : params.test_sizes) {
|
|
329
331
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
330
332
|
auto quantize_fn = [&](void) -> float {
|
|
331
|
-
auto vdot =
|
|
332
|
-
vdot
|
|
333
|
+
const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
|
|
334
|
+
vdot->from_float(test_data1, test_q1, size);
|
|
333
335
|
return test_q1[0];
|
|
334
336
|
};
|
|
335
337
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -340,13 +342,13 @@ int main(int argc, char * argv[]) {
|
|
|
340
342
|
|
|
341
343
|
if (params.op_vec_dot_q) {
|
|
342
344
|
printf(" vec_dot_q\n");
|
|
343
|
-
|
|
344
|
-
|
|
345
|
+
qfns_cpu->from_float(test_data1, test_q1, largest);
|
|
346
|
+
qfns_cpu->from_float(test_data2, test_q2, largest);
|
|
345
347
|
for (size_t size : params.test_sizes) {
|
|
346
348
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
|
347
349
|
auto quantize_fn = [&](void) -> float {
|
|
348
350
|
float result;
|
|
349
|
-
|
|
351
|
+
qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
|
|
350
352
|
return result;
|
|
351
353
|
};
|
|
352
354
|
size_t quantized_size = ggml_row_size(type, size);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#include "ggml.h"
|
|
2
|
+
#include "ggml-cpu.h"
|
|
2
3
|
|
|
3
4
|
#include <cmath>
|
|
4
5
|
#include <cstdio>
|
|
@@ -137,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
|
|
|
137
138
|
struct ggml_tensor * x;
|
|
138
139
|
|
|
139
140
|
// rope f32
|
|
140
|
-
for (int m = 0; m <
|
|
141
|
+
for (int m = 0; m < 5; ++m) {
|
|
141
142
|
const int ndims = 4;
|
|
142
143
|
|
|
143
144
|
const int64_t n_rot = 128;
|
|
@@ -146,28 +147,69 @@ int main(int /*argc*/, const char ** /*argv*/) {
|
|
|
146
147
|
const int n_past_0 = 100;
|
|
147
148
|
const int n_past_2 = 33;
|
|
148
149
|
|
|
149
|
-
struct ggml_tensor *
|
|
150
|
-
struct ggml_tensor *
|
|
151
|
-
struct ggml_tensor *
|
|
152
|
-
|
|
153
|
-
for (int i = 0; i < ne[2]; ++i) {
|
|
154
|
-
((int32_t *) p0->data)[i] = n_past_0 + i;
|
|
155
|
-
((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
|
|
156
|
-
((int32_t *) p2->data)[i] = n_past_2 + i;
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
// test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
|
|
160
|
-
const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
|
|
161
|
-
|
|
150
|
+
struct ggml_tensor * r0;
|
|
151
|
+
struct ggml_tensor * r1;
|
|
152
|
+
struct ggml_tensor * r2;
|
|
162
153
|
x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
|
|
154
|
+
int mode = -1;
|
|
163
155
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
156
|
+
if (m < 3) {
|
|
157
|
+
struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
|
158
|
+
struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
|
159
|
+
struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
|
|
168
160
|
|
|
169
|
-
|
|
170
|
-
|
|
161
|
+
for (int i = 0; i < ne[2]; ++i) {
|
|
162
|
+
((int32_t *) p0->data)[i] = n_past_0 + i;
|
|
163
|
+
((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
|
|
164
|
+
((int32_t *) p2->data)[i] = n_past_2 + i;
|
|
165
|
+
}
|
|
166
|
+
// test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
|
|
167
|
+
mode = m == 0 ? 0 : m == 1 ? 2 : 4;
|
|
168
|
+
|
|
169
|
+
// 100, 101, 102, ..., 172
|
|
170
|
+
r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
|
|
171
|
+
// -67, -67, -67, ..., -67
|
|
172
|
+
r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
|
|
173
|
+
|
|
174
|
+
// 33, 34, 35, ..., 105
|
|
175
|
+
r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
|
|
176
|
+
} else {
|
|
177
|
+
// testing multi-dimension rope position embedding mode
|
|
178
|
+
struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
|
|
179
|
+
struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
|
|
180
|
+
struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
|
|
181
|
+
|
|
182
|
+
int sections[4] = {16, 24, 24, 0};
|
|
183
|
+
mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
|
|
184
|
+
|
|
185
|
+
for (int i = 0; i < ne[2]; ++i) {
|
|
186
|
+
for (int j = 0; j < 4; ++j) {
|
|
187
|
+
((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
|
|
188
|
+
((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
|
|
189
|
+
((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// [[100, 101, 102, ..., 172],
|
|
194
|
+
// [101, 102, 103, ..., 173],
|
|
195
|
+
// [102, 103, 104, ..., 174]]
|
|
196
|
+
r0 = ggml_rope_multi(
|
|
197
|
+
ctx0, x, p0, nullptr,
|
|
198
|
+
n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
|
|
199
|
+
// [[-67, -67, -67, ..., -67]
|
|
200
|
+
// [-67, -67, -67, ..., -67]
|
|
201
|
+
// [-67, -67, -67, ..., -67]]
|
|
202
|
+
r1 = ggml_rope_multi(
|
|
203
|
+
ctx0, r0, p1, nullptr,
|
|
204
|
+
n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
|
|
205
|
+
|
|
206
|
+
// [[33, 34, 35, ..., 105]
|
|
207
|
+
// [34, 35, 36, ..., 106]
|
|
208
|
+
// [35, 36, 37, ..., 107]]
|
|
209
|
+
r2 = ggml_rope_multi(
|
|
210
|
+
ctx0, x, p2, nullptr,
|
|
211
|
+
n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
|
|
212
|
+
}
|
|
171
213
|
|
|
172
214
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
173
215
|
|