@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
#include <string>
|
|
11
11
|
#include <vector>
|
|
12
12
|
|
|
13
|
+
extern struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers);
|
|
14
|
+
|
|
13
15
|
static void dump(const llama_token_data_array * cur_p) {
|
|
14
16
|
for (size_t i = 0; i < cur_p->size; i++) {
|
|
15
17
|
printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
|
|
@@ -18,181 +20,188 @@ static void dump(const llama_token_data_array * cur_p) {
|
|
|
18
20
|
|
|
19
21
|
#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
|
|
20
22
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
23
|
+
struct sampler_tester {
|
|
24
|
+
sampler_tester(size_t n_vocab) {
|
|
25
|
+
cur.reserve(n_vocab);
|
|
26
|
+
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
|
27
|
+
const float logit = logf(token_id);
|
|
28
|
+
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
|
29
|
+
}
|
|
26
30
|
|
|
27
|
-
|
|
28
|
-
|
|
31
|
+
cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
|
|
32
|
+
}
|
|
29
33
|
|
|
30
|
-
std::vector<
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
34
|
+
sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
|
|
35
|
+
cur.reserve(probs.size());
|
|
36
|
+
for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) {
|
|
37
|
+
const float logit = logf(probs[token_id]);
|
|
38
|
+
cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
|
|
35
42
|
}
|
|
36
43
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
APPLY(llama_sampler_init_top_k(k), &cur_p);
|
|
41
|
-
DUMP(&cur_p);
|
|
42
|
-
|
|
43
|
-
GGML_ASSERT(cur_p.size == expected_probs.size());
|
|
44
|
-
for (size_t i = 0; i < cur_p.size; i++) {
|
|
45
|
-
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
|
|
44
|
+
void apply(llama_sampler * sampler) {
|
|
45
|
+
llama_sampler_apply(sampler, &cur_p);
|
|
46
|
+
llama_sampler_free(sampler);
|
|
46
47
|
}
|
|
47
|
-
}
|
|
48
48
|
|
|
49
|
-
|
|
50
|
-
|
|
49
|
+
void check() {
|
|
50
|
+
GGML_ASSERT(cur_p.size == probs_expected.size());
|
|
51
|
+
for (size_t i = 0; i < cur_p.size; i++) {
|
|
52
|
+
GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5);
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
llama_token_data_array cur_p;
|
|
57
|
+
|
|
58
|
+
private:
|
|
59
|
+
const std::vector<float> probs_expected;
|
|
51
60
|
|
|
52
61
|
std::vector<llama_token_data> cur;
|
|
53
|
-
|
|
54
|
-
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
|
55
|
-
const float logit = logf(probs[token_id]);
|
|
56
|
-
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
|
57
|
-
}
|
|
62
|
+
};
|
|
58
63
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
}
|
|
64
|
+
static void test_temp(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp) {
|
|
65
|
+
sampler_tester tester(probs, probs_expected);
|
|
66
|
+
|
|
67
|
+
DUMP(&tester.cur_p);
|
|
68
|
+
tester.apply(llama_sampler_init_temp(temp));
|
|
69
|
+
tester.apply(llama_sampler_init_dist(0));
|
|
70
|
+
DUMP(&tester.cur_p);
|
|
71
|
+
|
|
72
|
+
tester.check();
|
|
69
73
|
}
|
|
70
74
|
|
|
71
|
-
static void
|
|
72
|
-
|
|
75
|
+
static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
|
|
76
|
+
sampler_tester tester(probs, probs_expected);
|
|
73
77
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
|
79
|
-
}
|
|
78
|
+
DUMP(&tester.cur_p);
|
|
79
|
+
tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
|
|
80
|
+
tester.apply(llama_sampler_init_dist (0));
|
|
81
|
+
DUMP(&tester.cur_p);
|
|
80
82
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
|
|
84
|
-
DUMP(&cur_p);
|
|
83
|
+
tester.check();
|
|
84
|
+
}
|
|
85
85
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
86
|
+
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & probs_expected, int k) {
|
|
87
|
+
sampler_tester tester(probs, probs_expected);
|
|
88
|
+
|
|
89
|
+
DUMP(&tester.cur_p);
|
|
90
|
+
tester.apply(llama_sampler_init_top_k(k));
|
|
91
|
+
tester.apply(llama_sampler_init_dist (0));
|
|
92
|
+
DUMP(&tester.cur_p);
|
|
93
|
+
|
|
94
|
+
tester.check();
|
|
90
95
|
}
|
|
91
96
|
|
|
92
|
-
static void
|
|
93
|
-
|
|
97
|
+
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
|
|
98
|
+
sampler_tester tester(probs, probs_expected);
|
|
94
99
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
|
100
|
-
}
|
|
100
|
+
DUMP(&tester.cur_p);
|
|
101
|
+
tester.apply(llama_sampler_init_top_p(p, 1));
|
|
102
|
+
tester.apply(llama_sampler_init_dist (0));
|
|
103
|
+
DUMP(&tester.cur_p);
|
|
101
104
|
|
|
102
|
-
|
|
103
|
-
DUMP(&cur_p);
|
|
104
|
-
APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
|
|
105
|
-
DUMP(&cur_p);
|
|
106
|
-
APPLY(llama_sampler_init_softmax(), &cur_p);
|
|
107
|
-
|
|
108
|
-
GGML_ASSERT(cur_p.size == expected_probs.size());
|
|
109
|
-
for (size_t i = 0; i < cur_p.size; i++) {
|
|
110
|
-
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
|
|
111
|
-
}
|
|
105
|
+
tester.check();
|
|
112
106
|
}
|
|
113
107
|
|
|
114
|
-
static void
|
|
115
|
-
|
|
108
|
+
static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
|
|
109
|
+
sampler_tester tester(probs, probs_expected);
|
|
116
110
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
|
122
|
-
}
|
|
111
|
+
DUMP(&tester.cur_p);
|
|
112
|
+
tester.apply(llama_sampler_init_min_p(p, 1));
|
|
113
|
+
tester.apply(llama_sampler_init_dist (0));
|
|
114
|
+
DUMP(&tester.cur_p);
|
|
123
115
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
APPLY(llama_sampler_init_typical(p, 1), &cur_p);
|
|
127
|
-
DUMP(&cur_p);
|
|
116
|
+
tester.check();
|
|
117
|
+
}
|
|
128
118
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
119
|
+
static void test_xtc(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p, float t) {
|
|
120
|
+
sampler_tester tester(probs, probs_expected);
|
|
121
|
+
|
|
122
|
+
DUMP(&tester.cur_p);
|
|
123
|
+
tester.apply(llama_sampler_init_xtc(p, t, 0, 0));
|
|
124
|
+
DUMP(&tester.cur_p);
|
|
125
|
+
|
|
126
|
+
tester.check();
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
|
|
130
|
+
sampler_tester tester(probs, probs_expected);
|
|
131
|
+
|
|
132
|
+
DUMP(&tester.cur_p);
|
|
133
|
+
tester.apply(llama_sampler_init_typical(p, 1));
|
|
134
|
+
DUMP(&tester.cur_p);
|
|
135
|
+
|
|
136
|
+
tester.check();
|
|
133
137
|
}
|
|
134
138
|
|
|
135
139
|
static void test_penalties(
|
|
136
140
|
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
|
|
137
|
-
const std::vector<float> &
|
|
141
|
+
const std::vector<float> & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence
|
|
138
142
|
) {
|
|
139
|
-
GGML_ASSERT(probs.size() ==
|
|
143
|
+
GGML_ASSERT(probs.size() == probs_expected.size());
|
|
144
|
+
|
|
145
|
+
sampler_tester tester(probs, probs_expected);
|
|
140
146
|
|
|
141
147
|
const size_t n_vocab = probs.size();
|
|
148
|
+
auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
|
|
142
149
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
|
146
|
-
const float logit = logf(probs[token_id]);
|
|
147
|
-
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
|
150
|
+
for (size_t i = 0; i < last_tokens.size(); i++) {
|
|
151
|
+
llama_sampler_accept(sampler, last_tokens[i]);
|
|
148
152
|
}
|
|
149
153
|
|
|
150
|
-
|
|
154
|
+
DUMP(&tester.cur_p);
|
|
155
|
+
tester.apply(sampler);
|
|
156
|
+
tester.apply(llama_sampler_init_dist(0));
|
|
157
|
+
DUMP(&tester.cur_p);
|
|
158
|
+
|
|
159
|
+
tester.check();
|
|
160
|
+
}
|
|
151
161
|
|
|
152
|
-
|
|
162
|
+
static void test_dry(
|
|
163
|
+
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
|
|
164
|
+
const std::vector<float> & expected_probs, float dry_multiplier, float dry_base,
|
|
165
|
+
int dry_allowed_length, int dry_penalty_last_n,
|
|
166
|
+
const std::vector<std::vector<llama_token>> & seq_breakers
|
|
167
|
+
) {
|
|
168
|
+
GGML_ASSERT(probs.size() == expected_probs.size());
|
|
169
|
+
|
|
170
|
+
sampler_tester tester(probs, expected_probs);
|
|
171
|
+
|
|
172
|
+
auto * sampler = llama_sampler_init_dry_testing(1024, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers);
|
|
153
173
|
|
|
154
174
|
for (size_t i = 0; i < last_tokens.size(); i++) {
|
|
155
175
|
llama_sampler_accept(sampler, last_tokens[i]);
|
|
156
176
|
}
|
|
157
177
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
GGML_ASSERT(cur_p.size == expected_probs.size());
|
|
165
|
-
for (size_t i = 0; i < cur_p.size; i++) {
|
|
166
|
-
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
|
|
167
|
-
}
|
|
178
|
+
DUMP(&tester.cur_p);
|
|
179
|
+
tester.apply(sampler);
|
|
180
|
+
tester.apply(llama_sampler_init_dist(0));
|
|
181
|
+
DUMP(&tester.cur_p);
|
|
182
|
+
tester.check();
|
|
168
183
|
}
|
|
169
184
|
|
|
170
185
|
static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
|
|
171
186
|
) {
|
|
172
|
-
|
|
173
|
-
cur.reserve(n_vocab);
|
|
174
|
-
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
|
175
|
-
const float logit = logf(token_id);
|
|
176
|
-
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
|
|
187
|
+
sampler_tester tester(n_vocab);
|
|
180
188
|
|
|
181
189
|
llama_token min_token_id = 0;
|
|
182
190
|
const llama_token max_token_id = n_vocab-1;
|
|
183
191
|
|
|
184
192
|
for (auto s : samplers_sequence) {
|
|
185
193
|
switch (s){
|
|
186
|
-
case 'k':
|
|
187
|
-
case 'f': GGML_ABORT("tail_free test not implemented");
|
|
194
|
+
case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
|
|
188
195
|
case 'y': GGML_ABORT("typical test not implemented");
|
|
189
|
-
case 'p':
|
|
190
|
-
case 'm':
|
|
196
|
+
case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
|
|
197
|
+
case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
|
|
191
198
|
case 't': GGML_ABORT("temperature test not implemented");
|
|
192
199
|
default : GGML_ABORT("Unknown sampler");
|
|
193
200
|
}
|
|
194
201
|
|
|
195
|
-
|
|
202
|
+
tester.apply(llama_sampler_init_dist(0));
|
|
203
|
+
|
|
204
|
+
auto & cur_p = tester.cur_p;
|
|
196
205
|
|
|
197
206
|
const int size = cur_p.size;
|
|
198
207
|
|
|
@@ -263,7 +272,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec
|
|
|
263
272
|
}
|
|
264
273
|
const int64_t t_end = ggml_time_us();
|
|
265
274
|
llama_sampler_free(cnstr);
|
|
266
|
-
printf("%-
|
|
275
|
+
printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
|
|
267
276
|
}
|
|
268
277
|
|
|
269
278
|
#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
|
|
@@ -275,30 +284,35 @@ static void test_perf() {
|
|
|
275
284
|
|
|
276
285
|
data.reserve(n_vocab);
|
|
277
286
|
for (int i = 0; i < n_vocab; i++) {
|
|
278
|
-
const float logit = 2.0f*((
|
|
287
|
+
const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5);
|
|
279
288
|
data.emplace_back(llama_token_data{i, logit, 0.0f});
|
|
280
289
|
}
|
|
281
290
|
|
|
282
|
-
BENCH(llama_sampler_init_top_k
|
|
283
|
-
BENCH(llama_sampler_init_top_p
|
|
284
|
-
BENCH(llama_sampler_init_min_p
|
|
285
|
-
BENCH(
|
|
286
|
-
BENCH(
|
|
287
|
-
BENCH(llama_sampler_init_softmax (), data, 32);
|
|
291
|
+
BENCH(llama_sampler_init_top_k (40), data, 32);
|
|
292
|
+
BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32);
|
|
293
|
+
BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32);
|
|
294
|
+
BENCH(llama_sampler_init_typical(0.5f, 1), data, 32);
|
|
295
|
+
BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32);
|
|
288
296
|
}
|
|
289
297
|
|
|
290
298
|
int main(void) {
|
|
291
299
|
ggml_time_init();
|
|
292
300
|
|
|
293
|
-
|
|
294
|
-
|
|
301
|
+
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
|
|
302
|
+
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
|
|
303
|
+
|
|
304
|
+
test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
|
|
305
|
+
test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
|
|
306
|
+
|
|
307
|
+
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
|
|
308
|
+
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
|
|
295
309
|
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
|
|
296
310
|
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
|
|
297
311
|
|
|
298
|
-
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {
|
|
299
|
-
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.
|
|
300
|
-
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.
|
|
301
|
-
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
|
|
312
|
+
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
|
|
313
|
+
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
|
|
314
|
+
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
|
|
315
|
+
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
|
|
302
316
|
|
|
303
317
|
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
|
|
304
318
|
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
|
|
@@ -309,9 +323,13 @@ int main(void) {
|
|
|
309
323
|
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
|
|
310
324
|
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
|
|
311
325
|
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
326
|
+
printf("XTC should:\n");
|
|
327
|
+
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.09f);
|
|
328
|
+
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.19f);
|
|
329
|
+
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.29f);
|
|
330
|
+
|
|
331
|
+
printf("XTC should not:\n");
|
|
332
|
+
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f);
|
|
315
333
|
|
|
316
334
|
test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
|
|
317
335
|
test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
|
|
@@ -324,6 +342,13 @@ int main(void) {
|
|
|
324
342
|
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
|
|
325
343
|
test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
|
|
326
344
|
|
|
345
|
+
|
|
346
|
+
test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1}, {0.25f, 0.25f, 0.25f, 0.25f}, 1.0f, 1.1f, 2, 4, {});
|
|
347
|
+
test_dry({0.25f, 0.25f, 0.25f, 0.25f}, {0, 1, 2, 0, 1}, {0.296923f, 0.296923f, 0.296923f, 0.109232f}, 1.0f, 1.1f, 2, 5, {});
|
|
348
|
+
test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 2, 6, {{3}});
|
|
349
|
+
test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.241818f, 0.241818f, 0.032727f}, 2.0f, 1.1f, 2, 5, {});
|
|
350
|
+
test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {});
|
|
351
|
+
|
|
327
352
|
test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
|
|
328
353
|
test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
|
|
329
354
|
test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
|
|
@@ -202,7 +202,7 @@ int main(int argc, char **argv) {
|
|
|
202
202
|
for (int i = 0; i < nthread; i++) {
|
|
203
203
|
threads[i] = std::thread([&, i]() {
|
|
204
204
|
for (const auto & test_kv : k_tests) {
|
|
205
|
-
const std::vector<llama_token> res =
|
|
205
|
+
const std::vector<llama_token> res = common_tokenize(ctx, test_kv.first, add_special, false);
|
|
206
206
|
|
|
207
207
|
// here only print the result of the first thread
|
|
208
208
|
// because the other threads are running the same tests
|
|
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
|
|
|
212
212
|
|
|
213
213
|
printf("\n");
|
|
214
214
|
printf("src: '%s'\n", test_kv.first.c_str());
|
|
215
|
-
printf("res: '%s'\n",
|
|
215
|
+
printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
|
|
216
216
|
printf("tok: ");
|
|
217
217
|
for (const auto & tok : res) {
|
|
218
218
|
printf("%d ", tok);
|
|
@@ -229,16 +229,16 @@ int main(int argc, char **argv) {
|
|
|
229
229
|
if (!correct) {
|
|
230
230
|
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
|
231
231
|
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
|
232
|
-
|
|
233
|
-
|
|
232
|
+
common_detokenize(ctx, res).c_str(),
|
|
233
|
+
common_detokenize(ctx, test_kv.second).c_str());
|
|
234
234
|
fprintf(stderr, "%s : expected tokens: ", __func__);
|
|
235
235
|
for (const auto & t : test_kv.second) {
|
|
236
|
-
fprintf(stderr, "%6d '%s', ", t,
|
|
236
|
+
fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
|
|
237
237
|
}
|
|
238
238
|
fprintf(stderr, "\n");
|
|
239
239
|
fprintf(stderr, "%s : got tokens: ", __func__);
|
|
240
240
|
for (const auto & t : res) {
|
|
241
|
-
fprintf(stderr, "%6d '%s', ", t,
|
|
241
|
+
fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
|
|
242
242
|
}
|
|
243
243
|
fprintf(stderr, "\n");
|
|
244
244
|
|
|
@@ -273,7 +273,7 @@ int main(int argc, char **argv) {
|
|
|
273
273
|
{
|
|
274
274
|
const auto t_start = ggml_time_us();
|
|
275
275
|
|
|
276
|
-
res =
|
|
276
|
+
res = common_tokenize(ctx, text, add_special, false);
|
|
277
277
|
|
|
278
278
|
const auto t_end = ggml_time_us();
|
|
279
279
|
|
|
@@ -78,10 +78,10 @@ int main(int argc, char **argv) {
|
|
|
78
78
|
const int n_vocab = llama_n_vocab(model);
|
|
79
79
|
|
|
80
80
|
for (int i = 0; i < n_vocab; ++i) {
|
|
81
|
-
std::string str =
|
|
81
|
+
std::string str = common_detokenize(ctx, std::vector<int>(1, i));
|
|
82
82
|
try {
|
|
83
83
|
auto cps = unicode_cpts_from_utf8(str);
|
|
84
|
-
std::vector<llama_token> tokens =
|
|
84
|
+
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
|
|
85
85
|
if (ignore_merges && tokens.size() > 1) {
|
|
86
86
|
fprintf(stderr,
|
|
87
87
|
"%s : error: token %d detokenizes to '%s'(%zu) but "
|
|
@@ -94,7 +94,7 @@ int main(int argc, char **argv) {
|
|
|
94
94
|
fprintf(stderr, "]\n");
|
|
95
95
|
return 2;
|
|
96
96
|
}
|
|
97
|
-
std::string check =
|
|
97
|
+
std::string check = common_detokenize(ctx, tokens);
|
|
98
98
|
if (check != str) {
|
|
99
99
|
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
|
|
100
100
|
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
|
|
@@ -123,8 +123,8 @@ int main(int argc, char **argv) {
|
|
|
123
123
|
}
|
|
124
124
|
|
|
125
125
|
std::string str = unicode_cpt_to_utf8(cp);
|
|
126
|
-
std::vector<llama_token> tokens =
|
|
127
|
-
std::string check =
|
|
126
|
+
std::vector<llama_token> tokens = common_tokenize(ctx, str, false);
|
|
127
|
+
std::string check = common_detokenize(ctx, tokens);
|
|
128
128
|
if (cp != 9601 && str != check) {
|
|
129
129
|
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
|
130
130
|
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
|
@@ -66,9 +66,9 @@ int main(int argc, char ** argv) {
|
|
|
66
66
|
const int n_vocab = llama_n_vocab(model);
|
|
67
67
|
|
|
68
68
|
for (int i = 0; i < n_vocab; ++i) {
|
|
69
|
-
std::string str =
|
|
70
|
-
std::vector<llama_token> tokens =
|
|
71
|
-
std::string check =
|
|
69
|
+
std::string str = common_detokenize(ctx, std::vector<int>(1, i), true);
|
|
70
|
+
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
|
|
71
|
+
std::string check = common_detokenize(ctx, tokens);
|
|
72
72
|
if (check != str) {
|
|
73
73
|
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
|
|
74
74
|
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
|
|
@@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
|
|
|
93
93
|
}
|
|
94
94
|
|
|
95
95
|
std::string str = unicode_cpt_to_utf8(cp);
|
|
96
|
-
std::vector<llama_token> tokens =
|
|
97
|
-
std::string check =
|
|
96
|
+
std::vector<llama_token> tokens = common_tokenize(ctx, str, false, true);
|
|
97
|
+
std::string check = common_detokenize(ctx, tokens);
|
|
98
98
|
if (cp != 9601 && str != check) {
|
|
99
99
|
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
|
100
100
|
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
name: Nix aarch64 builds
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
workflow_dispatch: # allows manual triggering
|
|
5
|
-
schedule:
|
|
6
|
-
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
|
|
7
|
-
# 1.5h instead of minutes with the cold cache).
|
|
8
|
-
#
|
|
9
|
-
# randint(0, 59), randint(0, 23)
|
|
10
|
-
- cron: '26 12 * * *'
|
|
11
|
-
# But also rebuild if we touched any of the Nix expressions:
|
|
12
|
-
push:
|
|
13
|
-
branches:
|
|
14
|
-
- master
|
|
15
|
-
paths: ['**/*.nix', 'flake.lock']
|
|
16
|
-
pull_request:
|
|
17
|
-
types: [opened, synchronize, reopened]
|
|
18
|
-
paths: ['**/*.nix', 'flake.lock']
|
|
19
|
-
|
|
20
|
-
concurrency:
|
|
21
|
-
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
|
22
|
-
cancel-in-progress: true
|
|
23
|
-
|
|
24
|
-
# Fine-grant permission
|
|
25
|
-
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
|
|
26
|
-
permissions:
|
|
27
|
-
# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
|
|
28
|
-
id-token: write
|
|
29
|
-
contents: read
|
|
30
|
-
|
|
31
|
-
jobs:
|
|
32
|
-
nix-build-aarch64:
|
|
33
|
-
runs-on: ubuntu-latest
|
|
34
|
-
steps:
|
|
35
|
-
- name: Checkout repository
|
|
36
|
-
uses: actions/checkout@v4
|
|
37
|
-
- name: Install QEMU
|
|
38
|
-
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
|
39
|
-
run: |
|
|
40
|
-
sudo apt-get update
|
|
41
|
-
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
|
42
|
-
sudo usermod -a -G kvm $USER
|
|
43
|
-
- name: Install Nix
|
|
44
|
-
uses: DeterminateSystems/nix-installer-action@v9
|
|
45
|
-
with:
|
|
46
|
-
github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
47
|
-
extra-conf: |
|
|
48
|
-
extra-platforms = aarch64-linux
|
|
49
|
-
extra-system-features = nixos-test kvm
|
|
50
|
-
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
|
51
|
-
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
|
52
|
-
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
|
53
|
-
with:
|
|
54
|
-
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
|
55
|
-
- name: Set-up cachix to push the results to
|
|
56
|
-
uses: cachix/cachix-action@v13
|
|
57
|
-
with:
|
|
58
|
-
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
|
59
|
-
name: llama-cpp
|
|
60
|
-
- name: Show all output paths
|
|
61
|
-
run: >
|
|
62
|
-
nix run github:nix-community/nix-eval-jobs
|
|
63
|
-
-- --gc-roots-dir gcroot
|
|
64
|
-
--flake
|
|
65
|
-
".#packages.aarch64-linux"
|
|
66
|
-
- name: Build
|
|
67
|
-
run: >
|
|
68
|
-
nix run github:Mic92/nix-fast-build
|
|
69
|
-
-- --skip-cached --no-nom
|
|
70
|
-
--systems aarch64-linux
|
|
71
|
-
--flake
|
|
72
|
-
".#checks.aarch64-linux"
|