@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
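Selected hunks from the bundled llama.cpp sources are reproduced below (the lookahead and lookup examples plus two example CMake files). Two changes recur throughout: the examples now call the common_-prefixed helper API (common_params, common_sampler, common_tokenize, common_ngram_cache, common_batch_*), and the example targets are built with cxx_std_17. Where a removed line's text was not preserved in this rendering, only its "-" marker is shown. A short sketch of the new call flow follows the hunks.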
package/src/llama.cpp/examples/lookahead/lookahead.cpp

```diff
@@ -37,13 +37,13 @@ struct ngram_container {
 };
 
 int main(int argc, char ** argv) {
-
+    common_params params;
 
-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-
+    common_init();
 
     const int W = 15; // lookahead window
     const int N = 5; // n-gram size
@@ -56,7 +56,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the target model
-
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> inp;
     std::vector<llama_token> all;
 
-    inp =
+    inp = common_tokenize(ctx, params.prompt, true, true);
     all = inp;
 
     const int max_context_size = llama_n_ctx(ctx);
@@ -79,7 +79,7 @@ int main(int argc, char ** argv) {
     LOG("\n\n");
 
     for (auto id : inp) {
-        LOG("%s",
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -89,8 +89,8 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();
 
     // eval the prompt
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1
-    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
     for (int s = 1; s < W + G + 1; ++s) {
         llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
@@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);
@@ -156,12 +156,12 @@ int main(int argc, char ** argv) {
 
     // sample first token
     {
-        id =
+        id = common_sampler_sample(smpl, ctx, 0);
 
-
+        common_sampler_accept(smpl, id, true);
 
         {
-            const std::string token_str =
+            const std::string token_str = common_token_to_piece(ctx, id);
 
             LOG("%s", token_str.c_str());
             fflush(stdout);
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@@ -201,10 +201,10 @@ int main(int argc, char ** argv) {
         // V V V V V V
         // id
         {
-
+            common_batch_clear(batch);
 
             // current token - first token of the first level
-
+            common_batch_add(batch, id, n_past, seq_id_all, true);
 
             // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
             {
@@ -229,7 +229,7 @@ int main(int argc, char ** argv) {
                         ngrams_cur[g].tokens [j + 1] = t;
                         ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
 
-
+                        common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
                     }
                 }
             }
@@ -241,13 +241,13 @@ int main(int argc, char ** argv) {
                         seq_id_look[j] = i + j + 1;
                     }
 
-
+                    common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
                 }
 
                // fill the rest of the levels
                for (int j = 1; j < N - 1; j++) {
                    for (int i = 0; i < W; i++) {
-
+                        common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
                    }
                }
            }
@@ -281,13 +281,13 @@ int main(int argc, char ** argv) {
            }
 
            // sample the next token
-            id =
+            id = common_sampler_sample(smpl, ctx, i_batch);
 
-
+            common_sampler_accept(smpl, id, true);
 
            // print
            {
-                const std::string token_str =
+                const std::string token_str = common_token_to_piece(ctx, id);
 
                if (v == 0) {
                    LOG("%s", token_str.c_str());
@@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
            // print known n-grams starting with token id (debug)
            if (0 && v == 0) {
                if (ngrams_observed.cnt[id] > 0) {
-                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id],
+                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
                }
 
                for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
                    const int idx = id*(N - 1)*G + i*(N - 1);
 
                    for (int j = 0; j < N - 1; j++) {
-                        const std::string token_str =
+                        const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
 
                        LOG("%s", token_str.c_str());
                    }
@@ -358,7 +358,7 @@ int main(int argc, char ** argv) {
            if (v == 0) {
                // sample from the last level
                for (int i = 0; i < W; i++) {
-                    tokens_j[N - 2][i] =
+                    tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                }
            } else {
                for (int i = 0; i < W; i++) {
@@ -466,9 +466,9 @@ int main(int argc, char ** argv) {
     LOG_INF("n_accept = %d\n", n_accept);
 
     LOG_INF("\n");
-
+    common_perf_print(ctx, smpl);
 
-
+    common_sampler_free(smpl);
 
     llama_kv_cache_view_free(&kvc_view);
 
```
package/src/llama.cpp/examples/lookup/CMakeLists.txt

```diff
@@ -2,22 +2,22 @@ set(TARGET llama-lookup)
 add_executable(${TARGET} lookup.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-create)
 add_executable(${TARGET} lookup-create.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-merge)
 add_executable(${TARGET} lookup-merge.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-lookup-stats)
 add_executable(${TARGET} lookup-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
```
package/src/llama.cpp/examples/lookup/lookup-create.cpp

```diff
@@ -12,9 +12,9 @@
 #include <vector>
 
 int main(int argc, char ** argv){
-
+    common_params params;
 
-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
@@ -23,7 +23,7 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);
 
     // load the model
-
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -31,15 +31,15 @@ int main(int argc, char ** argv){
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp =
+    inp = common_tokenize(ctx, params.prompt, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);
 
 
-
-
+    common_ngram_cache ngram_cache;
+    common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
     fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
 
-
+    common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
 
     return 0;
 }
```
package/src/llama.cpp/examples/lookup/lookup-merge.cpp

```diff
@@ -33,15 +33,15 @@
     }
 
     fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-
+    common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);
 
     for (size_t i = 1; i < args.size()-1; ++i) {
         fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-
+        common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);
 
-
+        common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
     }
 
     fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
-
+    common_ngram_cache_save(ngram_cache_merged, args.back());
 }
```
package/src/llama.cpp/examples/lookup/lookup-stats.cpp

```diff
@@ -13,33 +13,34 @@
 #include <vector>
 
 int main(int argc, char ** argv){
-
+    common_params params;
 
-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
-
+    common_init();
 
-    const int n_draft = params.
+    const int n_draft = params.speculative.n_max;
 
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
 
     // load the model
-
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp =
+    inp = common_tokenize(ctx, params.prompt, true, true);
+
+    common_ngram_cache ngram_cache_context;
+    common_ngram_cache ngram_cache_dynamic;
+    common_ngram_cache ngram_cache_static;
 
-    llama_ngram_cache ngram_cache_context;
-    llama_ngram_cache ngram_cache_dynamic;
-    llama_ngram_cache ngram_cache_static;
     int64_t t_draft_flat_us = 0;
     int64_t t_draft_us = 0;
 
@@ -48,7 +49,7 @@ int main(int argc, char ** argv){
 
     if (!params.lookup_cache_static.empty()) {
         try {
-            ngram_cache_static =
+            ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
         } catch (std::ifstream::failure const &) {
             LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
             exit(1);
@@ -57,7 +58,7 @@ int main(int argc, char ** argv){
 
     if (!params.lookup_cache_dynamic.empty()) {
         try {
-            ngram_cache_dynamic =
+            ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
         } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
     }
 
@@ -86,7 +87,7 @@ int main(int argc, char ** argv){
 
         {
             const int64_t t_start_draft_us = ggml_time_us();
-
+            common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
             t_draft_us += ggml_time_us() - t_start_draft_us;
         }
 
@@ -105,7 +106,7 @@ int main(int argc, char ** argv){
 
             {
                 const int64_t t_start_draft_us = ggml_time_us();
-
+                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
                 t_draft_us += ggml_time_us() - t_start_draft_us;
             }
         }
@@ -115,7 +116,7 @@ int main(int argc, char ** argv){
             pseudo_output.push_back(inp_slice[pseudo_output.size()]);
             {
                 const int64_t t_start_draft_us = ggml_time_us();
-
+                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
                 t_draft_us += ggml_time_us() - t_start_draft_us;
             }
         }
@@ -133,7 +134,7 @@ int main(int argc, char ** argv){
         }
 
         // After each chunk, update the dynamic ngram cache with the context ngram cache:
-
+        common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
         ngram_cache_context.clear();
     }
 
```
package/src/llama.cpp/examples/lookup/lookup.cpp

```diff
@@ -13,16 +13,16 @@
 #include <vector>
 
 int main(int argc, char ** argv){
-
+    common_params params;
 
-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
-
+    common_init();
 
     // max. number of additional tokens to draft if match is found
-    const int n_draft = params.
+    const int n_draft = params.speculative.n_max;
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
@@ -31,29 +31,29 @@ int main(int argc, char ** argv){
     llama_numa_init(params.numa);
 
     // load the model
-
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp =
+    inp = common_tokenize(ctx, params.prompt, true, true);
 
-
-
-
+    common_ngram_cache ngram_cache_context;
+    common_ngram_cache ngram_cache_dynamic;
+    common_ngram_cache ngram_cache_static;
     int64_t t_draft_flat_us = 0;
     int64_t t_draft_us = 0;
 
     {
         // Fill up context ngram cache with tokens from user input:
         const int64_t t_start_draft_us = ggml_time_us();
-
+        common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
 
         if (!params.lookup_cache_static.empty()) {
             try {
-                ngram_cache_static =
+                ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
             } catch (std::ifstream::failure const &) {
                 LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
@@ -62,7 +62,7 @@ int main(int argc, char ** argv){
 
         if (!params.lookup_cache_dynamic.empty()) {
             try {
-                ngram_cache_dynamic =
+                ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
             } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
         }
 
@@ -80,7 +80,7 @@ int main(int argc, char ** argv){
     LOG("\n\n");
 
     for (auto id : inp) {
-        LOG("%s",
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -89,8 +89,8 @@ int main(int argc, char ** argv){
 
     const auto t_enc_start = ggml_time_us();
 
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1
-    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
     const auto t_enc_end = ggml_time_us();
 
@@ -102,7 +102,7 @@ int main(int argc, char ** argv){
 
     bool has_eos = false;
 
-    struct
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
     std::vector<llama_token> draft;
 
@@ -117,7 +117,7 @@ int main(int argc, char ** argv){
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // print current draft sequence
@@ -126,11 +126,11 @@ int main(int argc, char ** argv){
         int i_dft = 0;
         while (true) {
             // sample from the target model
-            llama_token id =
+            llama_token id = common_sampler_sample(smpl, ctx, i_dft);
 
-
+            common_sampler_accept(smpl, id, true);
 
-            const std::string token_str =
+            const std::string token_str = common_token_to_piece(ctx, id);
 
             if (!params.use_color) {
                 LOG("%s", token_str.c_str());
@@ -152,7 +152,7 @@ int main(int argc, char ** argv){
             {
                 // Update context ngram cache with the newly accepted token:
                 const int64_t t_start_draft_us = ggml_time_us();
-
+                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                 t_draft_us += ggml_time_us() - t_start_draft_us;
             }
 
@@ -178,7 +178,7 @@ int main(int argc, char ** argv){
             {
                 // Update context ngram cache with the newly accepted token:
                 const int64_t t_start_draft_us = ggml_time_us();
-
+                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                 t_draft_us += ggml_time_us() - t_start_draft_us;
             }
             break;
@@ -192,18 +192,18 @@ int main(int argc, char ** argv){
         // clean the cache of draft tokens that weren't accepted
         llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
-
-
+        common_batch_clear(batch_tgt);
+        common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
 
         // Draft already contains a single token sampled from the model:
         GGML_ASSERT(draft.size() == 1);
         GGML_ASSERT(draft[0] == inp.back());
         const int64_t t_start_draft_us = ggml_time_us();
 
-
+        common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 
         for (size_t i = 1; i < draft.size(); ++i) {
-
+            common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
         }
 
         t_draft_us += ggml_time_us() - t_start_draft_us;
@@ -218,8 +218,8 @@ int main(int argc, char ** argv){
     auto t_dec_end = ggml_time_us();
 
     // Update dynamic ngram cache with context ngram cache and save it to disk:
-
-
+    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+    common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
 
     LOG("\n\n");
 
@@ -237,9 +237,9 @@ int main(int argc, char ** argv){
     LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
     LOG_INF("\ntarget:\n\n");
-
+    common_perf_print(ctx, smpl);
 
-
+    common_sampler_free(smpl);
 
     llama_batch_free(batch_tgt);
 
```
package/src/llama.cpp/examples/main/CMakeLists.txt

```diff
@@ -2,4 +2,4 @@ set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
```