@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/common/log.cpp +50 -50

@@ -8,10 +8,10 @@
 #include <thread>
 #include <vector>
 
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
-void gpt_log_set_verbosity_thold(int verbosity) {
-    gpt_log_verbosity_thold = verbosity;
+void common_log_set_verbosity_thold(int verbosity) {
+    common_log_verbosity_thold = verbosity;
 }
 
 #define LOG_COL_DEFAULT "\033[0m"
@@ -29,16 +29,16 @@ static int64_t t_us() {
 }
 
 // colors
-enum gpt_log_col : int {
-    GPT_LOG_COL_DEFAULT = 0,
-    GPT_LOG_COL_BOLD,
-    GPT_LOG_COL_RED,
-    GPT_LOG_COL_GREEN,
-    GPT_LOG_COL_YELLOW,
-    GPT_LOG_COL_BLUE,
-    GPT_LOG_COL_MAGENTA,
-    GPT_LOG_COL_CYAN,
-    GPT_LOG_COL_WHITE,
+enum common_log_col : int {
+    COMMON_LOG_COL_DEFAULT = 0,
+    COMMON_LOG_COL_BOLD,
+    COMMON_LOG_COL_RED,
+    COMMON_LOG_COL_GREEN,
+    COMMON_LOG_COL_YELLOW,
+    COMMON_LOG_COL_BLUE,
+    COMMON_LOG_COL_MAGENTA,
+    COMMON_LOG_COL_CYAN,
+    COMMON_LOG_COL_WHITE,
 };
 
 // disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
     "",
 };
 
-struct gpt_log_entry {
+struct common_log_entry {
     enum ggml_log_level level;
 
     bool prefix;
@@ -71,7 +71,7 @@ struct gpt_log_entry {
         if (!fcur) {
             // stderr displays DBG messages only when their verbosity level is not higher than the threshold
             // these messages will still be logged to a file
-            if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+            if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                 return;
             }
 
@@ -86,19 +86,19 @@ struct gpt_log_entry {
         if (timestamp) {
             // [M.s.ms.us]
             fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-                g_col[GPT_LOG_COL_BLUE],
+                g_col[COMMON_LOG_COL_BLUE],
                 (int) (timestamp / 1000000 / 60),
                 (int) (timestamp / 1000000 % 60),
                 (int) (timestamp / 1000 % 1000),
                 (int) (timestamp % 1000),
-                g_col[GPT_LOG_COL_DEFAULT]);
+                g_col[COMMON_LOG_COL_DEFAULT]);
         }
 
         switch (level) {
-            case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
-            case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
-            case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     "" ); break;
-            case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  "" ); break;
+            case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN],   g_col[COMMON_LOG_COL_DEFAULT]); break;
+            case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break;
+            case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED],     "" ); break;
+            case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW],  "" ); break;
             default:
                 break;
         }
@@ -107,18 +107,18 @@ struct gpt_log_entry {
         fprintf(fcur, "%s", msg.data());
 
         if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
-            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+            fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
         }
 
         fflush(fcur);
     }
 };
 
-struct gpt_log {
+struct common_log {
     // default capacity - will be expanded if needed
-    gpt_log() : gpt_log(256) {}
+    common_log() : common_log(256) {}
 
-    gpt_log(size_t capacity) {
+    common_log(size_t capacity) {
         file = nullptr;
         prefix = false;
         timestamps = false;
@@ -137,7 +137,7 @@ struct gpt_log {
         resume();
     }
 
-    ~gpt_log() {
+    ~common_log() {
         pause();
         if (file) {
             fclose(file);
@@ -158,12 +158,12 @@ private:
     int64_t t_start;
 
     // ring buffer of entries
-    std::vector<gpt_log_entry> entries;
+    std::vector<common_log_entry> entries;
     size_t head;
     size_t tail;
 
     // worker thread copies into this
-    gpt_log_entry cur;
+    common_log_entry cur;
 
 public:
     void add(enum ggml_log_level level, const char * fmt, va_list args) {
@@ -219,7 +219,7 @@ public:
             tail = (tail + 1) % entries.size();
             if (tail == head) {
                 // expand the buffer
-                std::vector<gpt_log_entry> new_entries(2*entries.size());
+                std::vector<common_log_entry> new_entries(2*entries.size());
 
                 size_t new_tail = 0;
 
@@ -320,15 +320,15 @@ public:
         pause();
 
         if (colors) {
-            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
-            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
-            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
-            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
-            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
-            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
+            g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+            g_col[COMMON_LOG_COL_BOLD]    = LOG_COL_BOLD;
+            g_col[COMMON_LOG_COL_RED]     = LOG_COL_RED;
+            g_col[COMMON_LOG_COL_GREEN]   = LOG_COL_GREEN;
+            g_col[COMMON_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
+            g_col[COMMON_LOG_COL_BLUE]    = LOG_COL_BLUE;
+            g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+            g_col[COMMON_LOG_COL_CYAN]    = LOG_COL_CYAN;
+            g_col[COMMON_LOG_COL_WHITE]   = LOG_COL_WHITE;
         } else {
             for (size_t i = 0; i < g_col.size(); i++) {
                 g_col[i] = "";
@@ -355,47 +355,47 @@ public:
 // public API
 //
 
-struct gpt_log * gpt_log_init() {
-    return new gpt_log;
+struct common_log * common_log_init() {
+    return new common_log;
 }
 
-struct gpt_log * gpt_log_main() {
-    static struct gpt_log log;
+struct common_log * common_log_main() {
+    static struct common_log log;
 
     return &log;
 }
 
-void gpt_log_pause(struct gpt_log * log) {
+void common_log_pause(struct common_log * log) {
     log->pause();
 }
 
-void gpt_log_resume(struct gpt_log * log) {
+void common_log_resume(struct common_log * log) {
     log->resume();
 }
 
-void gpt_log_free(struct gpt_log * log) {
+void common_log_free(struct common_log * log) {
    delete log;
 }
 
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
     va_list args;
     va_start(args, fmt);
     log->add(level, fmt, args);
     va_end(args);
 }
 
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
+void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }
 
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+void common_log_set_colors(struct common_log * log, bool colors) {
     log->set_colors(colors);
 }
 
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+void common_log_set_prefix(struct common_log * log, bool prefix) {
     log->set_prefix(prefix);
 }
 
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
     log->set_timestamps(timestamps);
 }
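The log.cpp changes are a mechanical gpt_ to common_ rename of the worker-thread logger, but they are the public logging surface that code built on common (including llama.node's vendored tree) links against. A minimal sketch of driving the renamed API, assuming the usual common/log.h include; the messages and tensor count here are invented for illustration:

#include "log.h" // common/log.h; pulls in enum ggml_log_level

int main() {
    // standalone instance; common_log_main() would return the shared singleton instead
    struct common_log * log = common_log_init();

    common_log_set_colors    (log, true); // ANSI colors via the g_col table above
    common_log_set_prefix    (log, true); // I/W/E/D level prefix on each line
    common_log_set_timestamps(log, true); // [M.s.ms.us] timestamp in the prefix

    common_log_add(log, GGML_LOG_LEVEL_INFO, "loaded %d tensors\n", 291);
    common_log_add(log, GGML_LOG_LEVEL_WARN, "no GPU found, using CPU\n");

    common_log_free(log); // ~common_log pauses the worker and closes any log file
    return 0;
}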
package/src/llama.cpp/common/log.h +18 -18

@@ -14,23 +14,23 @@
 #define LOG_DEFAULT_LLAMA 0
 
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via gpt_log_set_verbosity()
-extern int gpt_log_verbosity_thold;
+// set via common_log_set_verbosity()
+extern int common_log_verbosity_thold;
 
-void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+void common_log_set_verbosity_thold(int verbosity); // not thread-safe
 
-// the gpt_log uses an internal worker thread to print/write log messages
+// the common_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
-struct gpt_log;
+struct common_log;
 
-struct gpt_log * gpt_log_init();
-struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
-void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
-void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
-void gpt_log_free (struct gpt_log * log);
+struct common_log * common_log_init();
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
+void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
+void common_log_free (struct common_log * log);
 
 LOG_ATTRIBUTE_FORMAT(3, 4)
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
 
 // defaults: file = NULL, colors = false, prefix = false, timestamps = false
 //
@@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * f
 //  D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void gpt_log_set_file      (struct gpt_log * log, const char * file); // not thread-safe
-void gpt_log_set_colors    (struct gpt_log * log, bool colors); // not thread-safe
-void gpt_log_set_prefix    (struct gpt_log * log, bool prefix); // whether to output prefix to each log
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
+void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
+void common_log_set_colors    (struct common_log * log, bool colors); // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool prefix); // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@@ -66,13 +66,13 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
 //
 //  LOG_DBG("this is a debug message: %d\n", expensive_function());
 //
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
 //
 
 #define LOG_TMPL(level, verbosity, ...) \
     do { \
-        if ((verbosity) <= gpt_log_verbosity_thold) { \
-            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+        if ((verbosity) <= common_log_verbosity_thold) { \
+            common_log_add(common_log_main(), (level), __VA_ARGS__); \
         } \
     } while (0)
 
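The point of the LOG_TMPL guard above is lazy evaluation: the format arguments are only computed when the verbosity check passes. A small sketch of that behavior, assuming LOG_DEFAULT_DEBUG is defined alongside LOG_DEFAULT_LLAMA as in upstream llama.cpp; expensive_function() is a hypothetical stand-in taken from the header's own comment:

#include "log.h"

static int expensive_function() {
    return 42; // stand-in for work that should only run if the message is actually logged
}

void demo() {
    // verbosity 0: DEBUG messages (V = LOG_DEFAULT_DEBUG) are filtered out
    common_log_set_verbosity_thold(LOG_DEFAULT_LLAMA);

    // LOG_TMPL expands to `if ((verbosity) <= common_log_verbosity_thold) { ... }`,
    // so with the threshold at 0 expensive_function() is never evaluated here
    LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG,
             "this is a debug message: %d\n", expensive_function());
}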
package/src/llama.cpp/common/ngram-cache.cpp +36 -36

@@ -8,7 +8,7 @@
 #include <fstream>
 #include <thread>
 
-void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                               std::vector<llama_token> & inp, int nnew, bool print_progress) {
     const int64_t t_start_ms = ggml_time_ms();
     const int64_t inp_size = inp.size();
@@ -20,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
         const int64_t i_start = std::max(inp_size - nnew, ngram_size);
         for (int64_t i = i_start; i < inp_size; ++i) {
             const int64_t ngram_start = i - ngram_size;
-            llama_ngram ngram(&inp[ngram_start], ngram_size);
+            common_ngram ngram(&inp[ngram_start], ngram_size);
             const llama_token token = inp[i];
 
-            llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+            common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
             if (part_it == ngram_cache.end()) {
-                llama_ngram_cache_part part;
+                common_ngram_cache_part part;
                 part.emplace(token, 1);
                 ngram_cache.emplace(ngram, part);
             } else {
-                llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+                common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
                 if (token_count_it == part_it->second.end()) {
                     part_it->second.emplace(token, 1);
                 } else {
@@ -62,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
 constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 
 // Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
-    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
+    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
         return -1;
     }
-    const llama_ngram_cache_part part_static = part_static_it->second;
+    const common_ngram_cache_part part_static = part_static_it->second;
 
     int max_count_static = 0;
     int sum_count_static = 0;
@@ -95,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng
 
 // Try to draft a token from primary cache (context/dynamic), validate with static cache:
 static llama_token try_draft(
-    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {
 
     llama_token drafted_token = -1;
 
     for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-        const llama_ngram ngram_primary = ngrams_primary[i];
+        const common_ngram ngram_primary = ngrams_primary[i];
 
-        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
         if (part_primary_it == nc_primary.end()) {
             continue;
         }
-        const llama_ngram_cache_part part_primary = part_primary_it->second;
+        const common_ngram_cache_part part_primary = part_primary_it->second;
 
         int max_count_primary = 0;
         int max_count_static = 0;
@@ -117,7 +117,7 @@ static llama_token try_draft(
         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
 
-            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
 
             const int32_t count_primary = token_count_primary.second;
             const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@@ -142,9 +142,9 @@ static llama_token try_draft(
     return drafted_token;
 }
 
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
 ) {
     GGML_ASSERT(draft.size() == 1);
     const int inp_size = inp.size();
@@ -157,21 +157,21 @@ void llama_ngram_cache_draft(
         llama_token drafted_token = -1;
 
         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-        llama_ngram ngram_static;
+        common_ngram ngram_static;
         for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
             ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
         }
-        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-        llama_ngram_cache_part part_static;
+        common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+        common_ngram_cache_part part_static;
         if (part_static_it != nc_static.end()) {
             part_static = part_static_it->second;
         }
 
         // cd = context + dynamic
-        std::vector<llama_ngram> ngrams_cd;
+        std::vector<common_ngram> ngrams_cd;
         for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
             const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-            llama_ngram ngram_cd;
+            common_ngram ngram_cd;
             for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
                 ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
             }
@@ -196,16 +196,16 @@ void llama_ngram_cache_draft(
     }
 }
 
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
     std::ofstream file_out(filename, std::ios::binary);
-    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
-        const llama_ngram ngram = item.first;
-        llama_ngram_cache_part token_counts = item.second;
+    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
+        const common_ngram ngram = item.first;
+        common_ngram_cache_part token_counts = item.second;
         GGML_ASSERT(!token_counts.empty());
         const int32_t ntokens = token_counts.size();
         GGML_ASSERT(ntokens > 0);
 
-        file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
         file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
         for (std::pair<llama_token, int32_t> item2 : token_counts) {
             const llama_token token = item2.first;
@@ -219,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
 
 }
 
-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+common_ngram_cache common_ngram_cache_load(std::string & filename) {
     std::ifstream hashmap_file(filename, std::ios::binary);
     if (!hashmap_file) {
         throw std::ifstream::failure("Unable to open file " + filename);
     }
-    llama_ngram_cache ngram_cache;
+    common_ngram_cache ngram_cache;
 
-    llama_ngram ngram;
+    common_ngram ngram;
     int32_t ntokens;
     llama_token token;
     int32_t count;
@@ -235,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     char * ntokensc = reinterpret_cast<char*>(&ntokens);
     char * tokenc = reinterpret_cast<char*>(&token);
     char * countc = reinterpret_cast<char*>(&count);
-    while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
+    while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
         GGML_ASSERT(!hashmap_file.eof());
         GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
         GGML_ASSERT(ntokens > 0);
-        llama_ngram_cache_part token_counts;
+        common_ngram_cache_part token_counts;
 
         for (int i = 0; i < ntokens; ++i) {
             GGML_ASSERT(!hashmap_file.eof());
@@ -257,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     return ngram_cache;
 }
 
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
-    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
-        const llama_ngram ngram = ngram_part.first;
-        llama_ngram_cache_part part = ngram_part.second;
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
+    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
+        const common_ngram ngram = ngram_part.first;
+        common_ngram_cache_part part = ngram_part.second;
 
-        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
         if (part_merged_it == ngram_cache_target.end()) {
             ngram_cache_target.emplace(ngram, part);
             continue;
@@ -273,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
         const int32_t count = token_count.second;
         GGML_ASSERT(count > 0);
 
-        llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+        common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
         if (token_count_merged_it == part_merged_it->second.end()) {
             part_merged_it->second.emplace(token, count);
             continue;
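The save/load/merge trio above round-trips a cache through a binary file. A sketch of persisting a dynamic cache across runs, using only the signatures shown in this diff; the file name is hypothetical:

#include "ngram-cache.h"

#include <fstream>
#include <string>

// merge a session's cache into a dynamic cache kept on disk
void persist_dynamic_cache(common_ngram_cache & session_cache) {
    std::string path = "ngrams_dynamic.bin"; // hypothetical location

    common_ngram_cache on_disk;
    try {
        on_disk = common_ngram_cache_load(path); // throws std::ifstream::failure if missing
    } catch (const std::ifstream::failure &) {
        // first run: start from an empty cache
    }

    common_ngram_cache_merge(on_disk, session_cache); // adds session counts into on_disk
    common_ngram_cache_save(on_disk, path);
}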
package/src/llama.cpp/common/ngram-cache.h +19 -19

@@ -12,22 +12,22 @@
 
 // Data structures to map n-grams to empirical token probabilities:
 
-struct llama_ngram {
+struct common_ngram {
     llama_token tokens[LLAMA_NGRAM_MAX];
 
-    llama_ngram() {
+    common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = -1;
         }
     }
 
-    llama_ngram(const llama_token * input, const int ngram_size) {
+    common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = i < ngram_size ? input[i] : -1;
         }
     }
 
-    bool operator==(const llama_ngram & other) const {
+    bool operator==(const common_ngram & other) const {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             if (tokens[i] != other.tokens[i]) {
                 return false;
@@ -37,28 +37,28 @@ struct llama_ngram {
     }
 };
 
-struct llama_token_hash_function {
+struct common_token_hash_function {
     size_t operator()(const llama_token token) const {
         // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
         return token * 11400714819323198485llu;
     }
 };
 
-struct llama_ngram_hash_function {
-    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+struct common_ngram_hash_function {
+    size_t operator()(const common_ngram & ngram) const {
+        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
         for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
+            hash ^= common_token_hash_function{}(ngram.tokens[i]);
         }
         return hash;
     }
 };
 
 // token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
 
 // n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
 
 
 // Update an ngram cache with tokens.
@@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
 //
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
-void llama_ngram_cache_update(
-    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+void common_ngram_cache_update(
+    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
 
 // Try to draft tokens from ngram caches.
 // inp: the tokens generated so far.
@@ -81,21 +81,21 @@ void llama_ngram_cache_update(
 // nc_context: ngram cache based on current context.
 // nc_dynamic: ngram cache based on previous user generations.
 // nc_static: ngram cache generated from a large text corpus, used for validation.
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
 
 // Save an ngram cache to a file.
 // ngram_cache: the ngram cache to save.
 // filename: the path under which to save the ngram cache.
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
 
-// Load an ngram cache saved with llama_ngram_cache_save.
+// Load an ngram cache saved with common_ngram_cache_save.
 // filename: the path from which to load the ngram cache.
 // returns: an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+common_ngram_cache common_ngram_cache_load(std::string & filename);
 
 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
 // ngram_cache_add: the ngram cache to add to ngram_cache_target.
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
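These declarations back llama.cpp's lookup decoding: common_ngram_cache_update indexes observed n-grams, and common_ngram_cache_draft proposes continuation tokens from the context, dynamic, and static caches. A rough sketch of the flow; the n-gram sizes are illustrative (upstream uses its LLAMA_NGRAM_* constants), the seed-token convention follows the GGML_ASSERT(draft.size() == 1) in ngram-cache.cpp, and the empty dynamic/static caches stand in for ones normally loaded from disk or built from a corpus:

#include "ngram-cache.h"

#include <vector>

// index the tokens seen so far, then propose up to n_draft continuation tokens
std::vector<llama_token> draft_continuation(std::vector<llama_token> & inp,
                                            llama_token last_token, int n_draft) {
    common_ngram_cache nc_context;
    common_ngram_cache nc_dynamic; // empty here; normally merged across sessions
    common_ngram_cache nc_static;  // empty here; normally built from a large corpus

    // index all of inp with n-gram sizes 1..4 (illustrative bounds)
    common_ngram_cache_update(nc_context, 1, 4, inp, inp.size(), /*print_progress=*/false);

    // the draft must be seeded with exactly one token (the most recently sampled one)
    std::vector<llama_token> draft = { last_token };
    common_ngram_cache_draft(inp, draft, n_draft, 1, 4, nc_context, nc_dynamic, nc_static);

    return draft; // seed token followed by any drafted tokens
}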