@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -37,13 +37,13 @@ struct Stats {
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_params(
+    void set_params(common_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
     bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-
+    common_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
     std::vector<float> m_src1_data;

@@ -428,7 +428,7 @@ static void process_logits(
     }
 }

-static bool compute_imatrix(llama_context * ctx, const
+static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
     GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);

@@ -436,7 +436,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens =
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

     auto tim2 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

@@ -496,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);

+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size  = std::min(end - batch_start, n_batch);

@@ -508,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }

-
-
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                 LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return false;
             }

@@ -523,6 +530,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
         }
     }

+    llama_batch_free(batch);
+
     const auto t_end = std::chrono::high_resolution_clock::now();

     if (i == 0) {
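The two hunks above move imatrix from single-shot decode calls to an explicitly managed batch. A minimal sketch of that pattern, using only the calls added in this diff; the helper name `decode_in_batches` and its arguments are hypothetical and chosen for illustration:

```cpp
#include <algorithm>
#include <vector>

#include "common.h"
#include "llama.h"

// Decode a token sequence in chunks of n_batch using an explicitly managed batch.
static bool decode_in_batches(llama_context * ctx, const std::vector<llama_token> & tokens, int n_batch) {
    llama_batch batch = llama_batch_init(n_batch, 0, 1);

    for (int start = 0; start < (int) tokens.size(); start += n_batch) {
        const int batch_size = std::min((int) tokens.size() - start, n_batch);

        common_batch_clear(batch);
        for (int i = 0; i < batch_size; i++) {
            // absolute position in the sequence, single sequence id {0}, logits requested
            common_batch_add(batch, tokens[start + i], start + i, {0}, true);
        }

        if (llama_decode(ctx, batch)) {
            llama_batch_free(batch);
            return false;
        }
    }

    llama_batch_free(batch);
    return true;
}
```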
@@ -568,17 +577,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 }

 int main(int argc, char ** argv) {
-
+    common_params params;

     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;

-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
         return 1;
     }

-
+    common_init();

     params.n_batch = std::min(params.n_batch, params.n_ctx);

@@ -607,7 +616,7 @@ int main(int argc, char ** argv) {
     params.warmup = false;

     // init
-
+    common_init_result llama_init = common_init_from_params(params);

     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;

@@ -625,13 +634,22 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n",
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

-    if (
-
+    if (params.prompt.empty()) {
+        if (params.in_files.empty()) {
+            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
+            return 1;
+        }
+        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
+    } else {
+        if (!compute_imatrix(ctx, params)) {
+            return 1;
+        }
     }

+
     g_collector.save_imatrix();

     LOG("\n");
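The `main` hunks above show the initialization pattern that replaces the old `gpt_*` helpers across the examples. A rough sketch of the resulting setup and teardown, assembled from the calls visible in this diff; `LLAMA_EXAMPLE_COMMON` and the explicit `llama_backend_init()` call are assumptions rather than lines taken from these hunks:

```cpp
#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    common_params params;

    // parse CLI arguments into common_params (generic example id assumed here)
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    // common runtime init (logging, build info)
    common_init();

    llama_backend_init();

    // load the model and create a context from the parsed params
    common_init_result llama_init = common_init_from_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // ... example-specific work goes here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
```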
package/src/llama.cpp/examples/infill/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/examples/infill/infill.cpp

@@ -35,58 +35,14 @@

 static llama_context ** g_ctx;
 static llama_model ** g_model;
-static
-static
+static common_sampler ** g_smpl;
+static common_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;

 static bool is_interacting = false;

-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
-            __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: infill\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-    llama_perf_dump_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {

@@ -95,12 +51,11 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             LOG("\n");
-
-            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+            common_perf_print(*g_ctx, *g_smpl);

             // make sure all logs are flushed
             LOG("Interrupted by user\n");
-
+            common_log_pause(common_log_main());

             _exit(130);
         }

@@ -109,16 +64,16 @@ static void sigint_handler(int signo) {
 #endif

 int main(int argc, char ** argv) {
-
+    common_params params;
     g_params = &params;

-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
         return 1;
     }

-
+    common_init();

-    auto & sparams = params.
+    auto & sparams = params.sampling;

     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });

@@ -166,7 +121,7 @@ int main(int argc, char ** argv) {

     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-
+    common_sampler * smpl = nullptr;

     g_model = &model;
     g_ctx = &ctx;

@@ -174,7 +129,7 @@ int main(int argc, char ** argv) {

     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-
+    common_init_result llama_init = common_init_from_params(params);

     model = llama_init.model;
     ctx = llama_init.context;

@@ -195,21 +150,21 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n",
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
     const bool add_bos = llama_add_bos_token(model);
     GGML_ASSERT(!llama_add_eos_token(model));

     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
-    std::vector<llama_token> inp_pfx =
-    std::vector<llama_token> inp_sfx =
+    std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(llama_token_fim_pre(model) >= 0);
+    GGML_ASSERT(llama_token_fim_suf(model) >= 0);

-    inp_pfx.insert(inp_pfx.begin(),
-    inp_sfx.insert(inp_sfx.begin(),
+    inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

     embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
     embd_end = params.spm_infill ? inp_pfx : inp_sfx;

@@ -218,7 +173,7 @@ int main(int argc, char ** argv) {
     }
     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

-    const llama_token middle_token =
+    const llama_token middle_token = llama_token_fim_mid(model);
     if (middle_token >= 0) {
         embd_inp.push_back(middle_token);
     }
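The infill example now reads the fill-in-the-middle special tokens through the `llama_token_fim_pre`/`_suf`/`_mid` accessors. A condensed sketch of how the prompt is assembled from them, following the lines added above; `build_infill_prompt` is a hypothetical helper, and BOS insertion plus error checks are omitted:

```cpp
#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

// Assemble the infill prompt: <fim_pre> prefix <fim_suf> suffix <fim_mid>
// (the prefix/suffix blocks are swapped for SPM-style models).
static std::vector<llama_token> build_infill_prompt(llama_context * ctx, llama_model * model,
                                                    const std::string & prefix,
                                                    const std::string & suffix,
                                                    bool spm_infill) {
    std::vector<llama_token> inp_pfx = common_tokenize(ctx, prefix, false);
    std::vector<llama_token> inp_sfx = common_tokenize(ctx, suffix, false);

    inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
    inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

    std::vector<llama_token> embd_inp = spm_infill ? inp_sfx : inp_pfx;
    std::vector<llama_token> embd_end = spm_infill ? inp_pfx : inp_sfx;
    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

    // only append the middle token if the vocabulary defines one
    const llama_token middle_token = llama_token_fim_mid(model);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }

    return embd_inp;
}
```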
@@ -257,13 +212,13 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        LOG_INF("%6d -> '%s'\n", embd_inp[i],
+        LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
     }

     if (params.n_keep > 0) {
         LOG_INF("%s: static prompt based on n_keep: '", __func__);
         for (int i = 0; i < params.n_keep; i++) {
-            LOG_CNT("%s",
+            LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
         }
         LOG_CNT("'\n");
     }

@@ -298,11 +253,11 @@ int main(int argc, char ** argv) {
             LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-    smpl =
+    smpl = common_sampler_init(model, sparams);

-    LOG_INF("sampler seed: %u\n",
+    LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
     LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",
+    LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());

     LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

@@ -396,7 +351,7 @@ int main(int argc, char ** argv) {

             LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

-            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval
+            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                 LOG_ERR("%s : failed to eval\n", __func__);
                 return 1;
             }

@@ -411,9 +366,9 @@ int main(int argc, char ** argv) {
         embd.clear();

         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id =
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);

-
+            common_sampler_accept(smpl, id, true);

             // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

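Sampling in the rewritten example goes through the `common_sampler` wrapper instead of the removed per-example sampling code. A simplified sketch of that lifecycle, limited to calls that appear in these hunks plus `llama_batch_get_one` for the feedback step; `generate_n_tokens` is a made-up name and the loop is heavily trimmed:

```cpp
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

static void generate_n_tokens(llama_model * model, llama_context * ctx,
                              const common_params & params, int n_predict) {
    common_sampler * smpl = common_sampler_init(model, params.sampling);

    for (int i = 0; i < n_predict; i++) {
        // sample from the most recent logits (index -1)
        llama_token id = common_sampler_sample(smpl, ctx, -1);

        // record the token so repetition penalties / grammar state stay in sync
        common_sampler_accept(smpl, id, /* accept_grammar = */ true);

        if (llama_token_is_eog(model, id)) {
            break;
        }
        LOG("%s", common_token_to_piece(ctx, id).c_str());

        // feed the sampled token back in for the next decode step
        if (llama_decode(ctx, llama_batch_get_one(&id, 1))) {
            break;
        }
    }

    common_perf_print(ctx, smpl);
    common_sampler_free(smpl);
}
```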
@@ -434,7 +389,7 @@ int main(int argc, char ** argv) {

             // push the prompt in the sampling context in order to apply repetition penalties later
             // for the prompt, we don't apply grammar rules
-
+            common_sampler_accept(smpl, embd_inp[n_consumed], false);

             ++n_consumed;
             if ((int) embd.size() >= params.n_batch) {

@@ -446,7 +401,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                const std::string token_str =
+                const std::string token_str = common_token_to_piece(ctx, id);
                 LOG("%s", token_str.c_str());

                 if (embd.size() > 1) {

@@ -465,10 +420,10 @@ int main(int argc, char ** argv) {
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
             // deal with eot token in infill mode
-            if ((
+            if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
                     // print an eot token
-                    LOG("%s",
+                    LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
                 }
                 LOG("\n");
                 console::set_display(console::user_input);

@@ -505,11 +460,11 @@ int main(int argc, char ** argv) {
                 }

                 // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx =
-                std::vector<llama_token> inp_sfx =
+                std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-                inp_pfx.insert(inp_pfx.begin(),
-                inp_sfx.insert(inp_sfx.begin(),
+                inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

                 embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
                 embd_end = params.spm_infill ? inp_pfx : inp_sfx;

@@ -529,7 +484,7 @@ int main(int argc, char ** argv) {
                 is_interacting = false;
             }
             // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model,
+            else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
                 LOG_DBG("found EOS token\n");

                 if (params.interactive) {

@@ -579,7 +534,7 @@ int main(int argc, char ** argv) {

                 const size_t original_size = embd_inp.size();

-                const auto line_inp =
+                const auto line_inp = common_tokenize(ctx, buffer, false);
                 LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());

                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

@@ -587,7 +542,7 @@ int main(int argc, char ** argv) {
                 for (size_t i = original_size; i < embd_inp.size(); ++i) {
                     const llama_token token = embd_inp[i];
                     output_tokens.push_back(token);
-                    output_ss <<
+                    output_ss << common_token_to_piece(ctx, token);
                 }

                 n_remain -= line_inp.size();

@@ -601,7 +556,7 @@ int main(int argc, char ** argv) {

         if (n_past > 0) {
             if (is_interacting) {
-
+                common_sampler_reset(smpl);
             }
             is_interacting = false;
         }

@@ -620,17 +575,16 @@ int main(int argc, char ** argv) {
         }
     }
     if (!params.interactive && n_remain <= 0) {
-        LOG("%s",
+        LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
     }

     LOG("\n");
-
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    common_perf_print(ctx, smpl);

     llama_free(ctx);
     llama_free_model(model);

-
+    common_sampler_free(smpl);
     llama_backend_free();

     return 0;
package/src/llama.cpp/examples/llama-bench/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-bench)
 add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)