@fugood/llama.node 0.3.2 → 0.3.4
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/main/main.cpp

@@ -33,8 +33,8 @@
 
 static llama_context ** g_ctx;
 static llama_model ** g_model;
-static
-static
+static common_sampler ** g_smpl;
+static common_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
@@ -62,49 +62,6 @@ static bool file_is_empty(const std::string & path) {
 return f.tellg() == 0;
 }
 
-static void write_logfile(
-const llama_context * ctx, const gpt_params & params, const llama_model * model,
-const std::vector<llama_token> & input_tokens, const std::string & output,
-const std::vector<llama_token> & output_tokens
-) {
-if (params.logdir.empty()) {
-return;
-}
-
-const std::string timestamp = string_get_sortable_timestamp();
-
-const bool success = fs_create_directory_with_parents(params.logdir);
-if (!success) {
-LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
-return;
-}
-
-const std::string logfile_path = params.logdir + timestamp + ".yml";
-FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-if (logfile == NULL) {
-LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-return;
-}
-
-fprintf(logfile, "binary: main\n");
-char model_desc[128];
-llama_model_desc(model, model_desc, sizeof(model_desc));
-yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-fprintf(logfile, "\n");
-fprintf(logfile, "######################\n");
-fprintf(logfile, "# Generation Results #\n");
-fprintf(logfile, "######################\n");
-fprintf(logfile, "\n");
-
-yaml_dump_string_multiline(logfile, "output", output.c_str());
-yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-llama_perf_dump_yaml(logfile, ctx);
-fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
 if (signo == SIGINT) {
@@ -114,12 +71,11 @@ static void sigint_handler(int signo) {
 } else {
 console::cleanup();
 LOG("\n");
-
-write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+common_perf_print(*g_ctx, *g_smpl);
 
 // make sure all logs are flushed
 LOG("Interrupted by user\n");
-
+common_log_pause(common_log_main());
 
 _exit(130);
 }
@@ -127,24 +83,24 @@ static void sigint_handler(int signo) {
 }
 #endif
 
-static std::string chat_add_and_format(struct llama_model * model, std::vector<
-
-auto formatted =
+static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+common_chat_msg new_msg{role, content};
+auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
 chat_msgs.push_back({role, content});
 LOG_DBG("formatted: '%s'\n", formatted.c_str());
 return formatted;
 }
 
 int main(int argc, char ** argv) {
-
+common_params params;
 g_params = &params;
-if (!
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
 return 1;
 }
 
-
+common_init();
 
-auto & sparams = params.
+auto & sparams = params.sampling;
 
 // save choice to use color for later
 // (note for later: this is a slightly awkward choice)
@@ -187,9 +143,9 @@ int main(int argc, char ** argv) {
 
 llama_model * model = nullptr;
 llama_context * ctx = nullptr;
-
+common_sampler * smpl = nullptr;
 
-std::vector<
+std::vector<common_chat_msg> chat_msgs;
 
 g_model = &model;
 g_ctx = &ctx;
@@ -197,7 +153,7 @@ int main(int argc, char ** argv) {
 
 // load the model and apply lora adapter, if any
 LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-
+common_init_result llama_init = common_init_from_params(params);
 
 model = llama_init.model;
 ctx = llama_init.context;
@@ -209,6 +165,10 @@ int main(int argc, char ** argv) {
 
 LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
+auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
 struct ggml_threadpool_params tpp_batch =
 ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
 struct ggml_threadpool_params tpp =
@@ -218,7 +178,7 @@
 
 struct ggml_threadpool * threadpool_batch = NULL;
 if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-threadpool_batch =
+threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
 if (!threadpool_batch) {
 LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
 return 1;
@@ -228,7 +188,7 @@
 tpp.paused = true;
 }
 
-struct ggml_threadpool * threadpool =
+struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
 if (!threadpool) {
 LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
 return 1;
@@ -246,7 +206,7 @@
 // print chat template example in conversation mode
 if (params.conversation) {
 if (params.enable_chat_template) {
-LOG_INF("%s: chat template example:\n%s\n", __func__,
+LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
 } else {
 LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
 }
@@ -255,7 +215,7 @@
 // print system information
 {
 LOG_INF("\n");
-LOG_INF("%s\n",
+LOG_INF("%s\n", common_params_get_system_info(params).c_str());
 LOG_INF("\n");
 }
 
@@ -296,7 +256,7 @@
 : params.prompt;
 if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
 LOG_DBG("tokenize the prompt\n");
-embd_inp =
+embd_inp = common_tokenize(ctx, prompt, true, true);
 } else {
 LOG_DBG("use session tokens\n");
 embd_inp = session_tokens;
@@ -379,13 +339,13 @@
 LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
 LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
 for (int i = 0; i < (int) embd_inp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", embd_inp[i],
+LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
 }
 
 if (params.n_keep > add_bos) {
 LOG_INF("%s: static prompt based on n_keep: '", __func__);
 for (int i = 0; i < params.n_keep; i++) {
-LOG_CNT("%s",
+LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
 }
 LOG_CNT("'\n");
 }
@@ -415,9 +375,9 @@
 for (const auto & antiprompt : params.antiprompt) {
 LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
 if (params.verbose_prompt) {
-auto tmp =
+auto tmp = common_tokenize(ctx, antiprompt, false, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", tmp[i],
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }
@@ -430,9 +390,9 @@
 if (!params.input_prefix.empty()) {
 LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
 if (params.verbose_prompt) {
-auto tmp =
+auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", tmp[i],
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }
@@ -440,23 +400,23 @@
 if (!params.input_suffix.empty()) {
 LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
 if (params.verbose_prompt) {
-auto tmp =
+auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", tmp[i],
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }
 }
 
-smpl =
+smpl = common_sampler_init(model, sparams);
 if (!smpl) {
 LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
 return 1;
 }
 
-LOG_INF("sampler seed: %u\n",
+LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
 LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-LOG_INF("sampler chain: %s\n",
+LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
 
 LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
@@ -521,14 +481,14 @@
 
 antiprompt_ids.reserve(params.antiprompt.size());
 for (const std::string & antiprompt : params.antiprompt) {
-antiprompt_ids.emplace_back(::
+antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
 }
 
 if (llama_model_has_encoder(model)) {
 int enc_input_size = embd_inp.size();
 llama_token * enc_input_buf = embd_inp.data();
 
-if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size
+if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
 LOG_ERR("%s : failed to eval\n", __func__);
 return 1;
 }
@@ -569,30 +529,30 @@
 if (!params.ctx_shift){
 LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
 break;
-}
-if (params.n_predict == -2) {
-LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-break;
-}
+}
 
-
-
+if (params.n_predict == -2) {
+LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+break;
+}
 
-
-
+const int n_left = n_past - params.n_keep;
+const int n_discard = n_left/2;
 
-
-
+LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+n_past, n_left, n_ctx, params.n_keep, n_discard);
 
-
+llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
 
-
+n_past -= n_discard;
 
-
+LOG_DBG("after swap: n_past = %d\n", n_past);
 
-
-
-
+LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+
+LOG_DBG("clear session path\n");
+path_session.clear();
 }
 } else {
 // context extension via Self-Extend
@@ -648,7 +608,7 @@
 
 LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval
+if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
 LOG_ERR("%s : failed to eval\n", __func__);
 return 1;
 }
@@ -679,9 +639,9 @@
 LOG_DBG("saved session to %s\n", path_session.c_str());
 }
 
-const llama_token id =
+const llama_token id = common_sampler_sample(smpl, ctx, -1);
 
-
+common_sampler_accept(smpl, id, /* accept_grammar= */ true);
 
 // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
@@ -702,7 +662,7 @@
 
 // push the prompt in the sampling context in order to apply repetition penalties later
 // for the prompt, we don't apply grammar rules
-
+common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
 
 ++n_consumed;
 if ((int) embd.size() >= params.n_batch) {
@@ -714,7 +674,7 @@
 // display text
 if (input_echo && display) {
 for (auto id : embd) {
-const std::string token_str =
+const std::string token_str = common_token_to_piece(ctx, id, params.special);
 
 // Console/Stream Output
 LOG("%s", token_str.c_str());
@@ -743,7 +703,7 @@
 // check for reverse prompt in the last n_prev tokens
 if (!params.antiprompt.empty()) {
 const int n_prev = 32;
-const std::string last_output =
+const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
 
 is_antiprompt = false;
 // Check if each of the reverse prompts appears at the end of the output.
@@ -765,7 +725,7 @@
 }
 
 // check for reverse prompt using special tokens
-llama_token last_token =
+llama_token last_token = common_sampler_last(smpl);
 for (std::vector<llama_token> ids : antiprompt_ids) {
 if (ids.size() == 1 && last_token == ids[0]) {
 if (params.interactive) {
@@ -782,13 +742,13 @@
 }
 
 // deal with end of generation tokens in interactive mode
-if (llama_token_is_eog(model,
+if (llama_token_is_eog(model, common_sampler_last(smpl))) {
 LOG_DBG("found an EOG token\n");
 
 if (params.interactive) {
 if (!params.antiprompt.empty()) {
 // tokenize and inject first reverse prompt
-const auto first_antiprompt =
+const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
 embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
 is_antiprompt = true;
 }
@@ -803,8 +763,8 @@
 
 // if current token is not EOG, we add it to current assistant message
 if (params.conversation) {
-const auto id =
-assistant_ss <<
+const auto id = common_sampler_last(smpl);
+assistant_ss << common_token_to_piece(ctx, id, false);
 }
 
 if (n_past > 0 && is_interacting) {
@@ -862,9 +822,9 @@
 ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
 : std::move(buffer);
 // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
-const auto line_pfx =
-const auto line_inp =
-const auto line_sfx =
+const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
+const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat);
+const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
 
 LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
@@ -882,7 +842,7 @@
 for (size_t i = original_size; i < embd_inp.size(); ++i) {
 const llama_token token = embd_inp[i];
 output_tokens.push_back(token);
-output_ss <<
+output_ss << common_token_to_piece(ctx, token);
 }
 
 // reset assistant message
@@ -899,7 +859,7 @@
 
 if (n_past > 0) {
 if (is_interacting) {
-
+common_sampler_reset(smpl);
 }
 is_interacting = false;
 }
@@ -925,18 +885,17 @@
 }
 
 LOG("\n\n");
-
-write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+common_perf_print(ctx, smpl);
 
-
+common_sampler_free(smpl);
 
 llama_free(ctx);
 llama_free_model(model);
 
 llama_backend_free();
 
-
-
+ggml_threadpool_free_fn(threadpool);
+ggml_threadpool_free_fn(threadpool_batch);
 
 return 0;
 }

package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt

@@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
 target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/parallel/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-parallel)
 add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/parallel/parallel.cpp

@@ -54,7 +54,7 @@ static std::vector<std::string> k_prompts = {
 struct client {
 ~client() {
 if (smpl) {
-
+common_sampler_free(smpl);
 }
 }
 
@@ -75,7 +75,7 @@ struct client {
 std::string prompt;
 std::string response;
 
-struct
+struct common_sampler * smpl = nullptr;
 };
 
 static void print_date_time() {
@@ -103,13 +103,13 @@ static std::vector<std::string> split_string(const std::string& input, char deli
 int main(int argc, char ** argv) {
 srand(1234);
 
-
+common_params params;
 
-if (!
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
 return 1;
 }
 
-
+common_init();
 
 // number of simultaneous "clients" to simulate
 const int32_t n_clients = params.n_parallel;
@@ -130,7 +130,7 @@
 llama_numa_init(params.numa);
 
 // load the target model
-
+common_init_result llama_init = common_init_from_params(params);
 
 llama_model * model = llama_init.model;
 llama_context * ctx = llama_init.context;
@@ -160,11 +160,11 @@
 for (size_t i = 0; i < clients.size(); ++i) {
 auto & client = clients[i];
 client.id = i;
-client.smpl =
+client.smpl = common_sampler_init(model, params.sampling);
 }
 
 std::vector<llama_token> tokens_system;
-tokens_system =
+tokens_system = common_tokenize(ctx, k_system, true);
 const int32_t n_tokens_system = tokens_system.size();
 
 llama_seq_id g_seq_id = 0;
@@ -189,7 +189,7 @@
 LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
 for (int32_t i = 0; i < n_tokens_system; ++i) {
-
+common_batch_add(batch, tokens_system[i], i, { 0 }, false);
 }
 
 if (llama_decode(ctx, batch) != 0) {
@@ -210,10 +210,10 @@
 while (true) {
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
-
+common_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
-
+common_batch_clear(batch);
 
 // decode any currently ongoing sequences
 for (auto & client : clients) {
@@ -223,7 +223,7 @@
 
 client.i_batch = batch.n_tokens;
 
-
+common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
 
 client.n_decoded += 1;
 }
@@ -252,14 +252,14 @@
 client.prompt = client.input + "\nAssistant:";
 client.response = "";
 
-
+common_sampler_reset(client.smpl);
 
 // do not prepend BOS because we have a system prompt!
 std::vector<llama_token> tokens_prompt;
-tokens_prompt =
+tokens_prompt = common_tokenize(ctx, client.prompt, false);
 
 for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-
+common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
 }
 
 // extract the logits only for the last token
@@ -308,7 +308,6 @@
 batch.n_seq_id + i,
 batch.seq_id + i,
 batch.logits + i,
-0, 0, 0, // unused
 };
 
 const int ret = llama_decode(ctx, batch_view);
@@ -340,9 +339,9 @@
 //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
 // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
 
-const llama_token id =
+const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
 
-
+common_sampler_accept(client.smpl, id, true);
 
 if (client.n_decoded == 1) {
 // start measuring generation time after the first token to make sure all concurrent clients
@@ -350,7 +349,7 @@
 client.t_start_gen = ggml_time_us();
 }
 
-const std::string token_str =
+const std::string token_str = common_token_to_piece(ctx, id);
 
 client.response += token_str;
 client.sampled = id;

package/src/llama.cpp/examples/passkey/CMakeLists.txt

@@ -2,4 +2,4 @@ set(TARGET llama-passkey)
 add_executable(${TARGET} passkey.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)