@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/simple/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE
-target_compile_features(${TARGET} PRIVATE
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/examples/simple/simple.cpp
@@ -1,50 +1,116 @@
-#include "arg.h"
-#include "common.h"
-#include "log.h"
 #include "llama.h"
-
+#include <cstdio>
+#include <cstring>
+#include <string>
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-
-
-
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
+    printf("\n");
 }
 
 int main(int argc, char ** argv) {
-
-
-
-
-
-
-
+    // path to the model gguf file
+    std::string model_path;
+    // prompt to generate text from
+    std::string prompt = "Hello my name is";
+    // number of layers to offload to the GPU
+    int ngl = 99;
+    // number of tokens to predict
+    int n_predict = 32;
+
+    // parse command line arguments
+
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-n") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        n_predict = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+        if (i < argc) {
+            prompt = argv[i++];
+            for (; i < argc; i++) {
+                prompt += " ";
+                prompt += argv[i];
+            }
+        }
     }
 
-
-
-    // total length of the sequence including the prompt
-    const int n_predict = params.n_predict;
+    // load dynamic backends
 
-
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
+    ggml_backend_load_all();
 
     // initialize the model
 
-    llama_model_params model_params =
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
 
-    llama_model * model = llama_load_model_from_file(
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    // tokenize the prompt
+
+    // find the number of tokens in the prompt
+    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+        return 1;
+    }
+
     // initialize the context
 
-    llama_context_params ctx_params =
+    llama_context_params ctx_params = llama_context_default_params();
+    // n_ctx is the context size
+    ctx_params.n_ctx = n_prompt + n_predict - 1;
+    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
+    ctx_params.n_batch = n_prompt;
+    // enable performance counters
+    ctx_params.no_perf = false;
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
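Editor's note: the rewritten simple.cpp above drops the common/arg/log helpers and calls the core llama.h API directly. The part most likely to trip up callers is the two-pass tokenization: a first llama_tokenize call with a NULL buffer reports the required token count (negated), and a second call fills the buffer. Below is a minimal sketch of that pattern, assuming the llama.h vendored in this package (where llama_tokenize still takes a llama_model *); the helper name tokenize_prompt is illustrative and not part of the diff.

    // Minimal sketch of the two-pass tokenization pattern used above.
    // Assumes the vendored llama.h; the helper name is illustrative only.
    #include "llama.h"

    #include <string>
    #include <vector>

    static std::vector<llama_token> tokenize_prompt(llama_model * model, const std::string & prompt) {
        // first pass: a NULL output buffer makes llama_tokenize report the
        // required token count as a negative value
        const int n_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);

        // second pass: fill the buffer; a negative result means it was too small
        std::vector<llama_token> tokens(n_tokens);
        if (llama_tokenize(model, prompt.c_str(), prompt.size(), tokens.data(), tokens.size(), true, true) < 0) {
            tokens.clear();
        }
        return tokens;
    }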
@@ -53,117 +119,87 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-
+    // initialize the sampler
 
+    auto sparams = llama_sampler_chain_default_params();
     sparams.no_perf = false;
-
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-
-    const int n_ctx = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
-
-    LOG("\n");
-    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
-
-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
-        return 1;
-    }
-
     // print the prompt token-by-token
 
-
-
-
-
-
-
-
-
-
-    llama_batch batch = llama_batch_init(512, 0, 1);
-
-    // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
+        }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
     }
 
-    //
-    batch.logits[batch.n_tokens - 1] = true;
+    // prepare a batch for the prompt
 
-
-        LOG("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
 
     // main loop
 
-
+    const auto t_main_start = ggml_time_us();
     int n_decode = 0;
+    llama_token new_token_id;
 
-
+    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+
+        n_pos += batch.n_tokens;
 
-    while (n_cur <= n_predict) {
         // sample the next token
         {
-
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
 
             // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id)
-                LOG("\n");
-
+            if (llama_token_is_eog(model, new_token_id)) {
                 break;
             }
 
-
+            char buf[128];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                return 1;
+            }
+            std::string s(buf, n);
+            printf("%s", s.c_str());
             fflush(stdout);
 
-            // prepare the next batch
-
-
-            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
 
             n_decode += 1;
         }
-
-        n_cur += 1;
-
-        // evaluate the current batch with the transformer model
-        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
-            return 1;
-        }
     }
 
-
+    printf("\n");
 
     const auto t_main_end = ggml_time_us();
 
-
+    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-
+    fprintf(stderr, "\n");
     llama_perf_sampler_print(smpl);
     llama_perf_context_print(ctx);
+    fprintf(stderr, "\n");
 
-    LOG("\n");
-
-    llama_batch_free(batch);
     llama_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
 
-    llama_backend_free();
-
     return 0;
 }
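Editor's note: the second hunk replaces the old llama_batch_init/llama_batch_add bookkeeping with llama_batch_get_one and the llama_sampler chain API. Condensed into one hypothetical helper (generate_greedy is our name, not the example's), the decode/sample loop shown above has roughly this shape, assuming the model, context, and prompt tokens were prepared as in the previous hunk.

    // Rough sketch of the new decode/sample loop; not a drop-in replacement
    // for the example, just the shape of the API usage shown above.
    #include "llama.h"

    #include <cstdio>
    #include <vector>

    static int generate_greedy(llama_model * model, llama_context * ctx,
                               std::vector<llama_token> prompt_tokens, int n_predict) {
        // sampler chain that always keeps the most likely token
        llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

        const int n_prompt = (int) prompt_tokens.size();
        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
        llama_token new_token_id;

        for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
            if (llama_decode(ctx, batch)) {                      // evaluate the current batch
                llama_sampler_free(smpl);
                return 1;
            }
            n_pos += batch.n_tokens;

            new_token_id = llama_sampler_sample(smpl, ctx, -1);  // sample from the last logits
            if (llama_token_is_eog(model, new_token_id)) {
                break;                                           // end of generation
            }

            char buf[128];
            const int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
            if (n > 0) {
                printf("%.*s", n, buf);
            }
            batch = llama_batch_get_one(&new_token_id, 1);       // feed the sampled token back
        }

        llama_sampler_free(smpl);
        return 0;
    }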
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp (new file)
@@ -0,0 +1,200 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    int ngl = 99;
+    int n_ctx = 2048;
+
+    // parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        try {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-c") == 0) {
+                if (i + 1 < argc) {
+                    n_ctx = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    ngl = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                print_usage(argc, argv);
+                return 1;
+            }
+        } catch (std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (model_path.empty()) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // load dynamic backends
+    ggml_backend_load_all();
+
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_ctx;
+    ctx_params.n_batch = n_ctx;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+    // helper function to evaluate a prompt and generate a response
+    auto generate = [&](const std::string & prompt) {
+        std::string response;
+
+        // tokenize the prompt
+        const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
+            GGML_ABORT("failed to tokenize the prompt\n");
+        }
+
+        // prepare a batch for the prompt
+        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+        llama_token new_token_id;
+        while (true) {
+            // check if we have enough space in the context to evaluate this batch
+            int n_ctx = llama_n_ctx(ctx);
+            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            if (n_ctx_used + batch.n_tokens > n_ctx) {
+                printf("\033[0m\n");
+                fprintf(stderr, "context size exceeded\n");
+                exit(0);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                GGML_ABORT("failed to decode\n");
+            }
+
+            // sample the next token
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                break;
+            }
+
+            // convert the token to a string, print it and add it to the response
+            char buf[256];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                GGML_ABORT("failed to convert token to piece\n");
+            }
+            std::string piece(buf, n);
+            printf("%s", piece.c_str());
+            fflush(stdout);
+            response += piece;
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+        }
+
+        return response;
+    };
+
+    std::vector<llama_chat_message> messages;
+    std::vector<char> formatted(llama_n_ctx(ctx));
+    int prev_len = 0;
+    while (true) {
+        // get user input
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        // add the user input to the message list and format it
+        messages.push_back({"user", strdup(user.c_str())});
+        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        if (new_len > (int)formatted.size()) {
+            formatted.resize(new_len);
+            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        }
+        if (new_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+
+        // remove previous messages to obtain the prompt to generate the response
+        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+
+        // generate a response
+        printf("\033[33m");
+        std::string response = generate(prompt);
+        printf("\n\033[0m");
+
+        // add the response to the messages
+        messages.push_back({"assistant", strdup(response.c_str())});
+        prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
+        if (prev_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+    }
+
+    // free resources
+    for (auto & msg : messages) {
+        free(const_cast<char *>(msg.content));
+    }
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    return 0;
+}
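Editor's note: the distinctive pattern in the new simple-chat.cpp is incremental chat templating. Every turn re-renders the full message list with llama_chat_apply_template, grows the buffer if the first call reports a larger size, and decodes only the slice added since the previous render. A hedged sketch of that step follows; next_prompt is our name, and it assumes the same vendored llama.h and that prev_len is advanced by the caller after the assistant reply, exactly as in the listing above.

    // Sketch of the incremental templating step from simple-chat.cpp above.
    // The helper name and signature are illustrative, not part of the package.
    #include "llama.h"

    #include <cstring>
    #include <string>
    #include <vector>

    static std::string next_prompt(llama_model * model,
                                   std::vector<llama_chat_message> & messages,
                                   std::vector<char> & formatted,
                                   int prev_len,
                                   const std::string & user) {
        // append the new user message, then re-render the whole conversation
        // with the model's built-in template (nullptr selects it)
        messages.push_back({"user", strdup(user.c_str())});

        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                                /*add_ass=*/true, formatted.data(), formatted.size());
        if (new_len > (int) formatted.size()) {
            formatted.resize(new_len);  // buffer too small: grow it and render again
            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                                true, formatted.data(), formatted.size());
        }
        if (new_len < 0) {
            return "";                  // template could not be applied
        }

        // only the part added since the previous render needs to be decoded;
        // the caller recomputes prev_len after appending the assistant reply
        return std::string(formatted.begin() + prev_len, formatted.begin() + new_len);
    }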
package/src/llama.cpp/examples/speculative/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET llama-speculative)
 add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)