@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
The remainder of this diff is the rendered patch for package/src/llama.cpp/common/common.h (+204 -117). Removed lines appear truncated below (for example `-struct`) because the diff viewer only captured the unchanged prefix of each modified line.

```diff
@@ -24,22 +24,24 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct
+struct common_lora_adapter_info {
     std::string path;
     float scale;
 };
 
-struct
+struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };
 
+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char
-extern char
-extern char
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;
 
-struct
+struct common_control_vector_load_info;
 
 //
 // CPU utils
@@ -78,18 +80,23 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
 
     LLAMA_EXAMPLE_COUNT,
 };
 
-enum
-
-
-
-
-
-
-
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE = 0,
+    COMMON_SAMPLER_TYPE_DRY = 1,
+    COMMON_SAMPLER_TYPE_TOP_K = 2,
+    COMMON_SAMPLER_TYPE_TOP_P = 3,
+    COMMON_SAMPLER_TYPE_MIN_P = 4,
+    //COMMON_SAMPLER_TYPE_TFS_Z = 5,
+    COMMON_SAMPLER_TYPE_TYPICAL_P = 6,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
+    COMMON_SAMPLER_TYPE_XTC = 8,
+    COMMON_SAMPLER_TYPE_INFILL = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -98,39 +105,49 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
-//
-struct
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
-    int32_t n_prev
-    int32_t n_probs
-    int32_t min_keep
-    int32_t top_k
-    float top_p
-    float min_p
-    float
-    float
-    float
-    float
-    float
-
-
-    float
-    float
-
-    float
-    float
-
-
-
-
-
-
-
-
-
-
-
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float xtc_probability = 0.00f; // 0.0 = disabled
+    float xtc_threshold = 0.10f; // > 0.5 disables XTC
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
+    float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
+    int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
+    int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics
+    bool timing_per_token = false;
+
+    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
+
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
+        COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
     };
 
     std::string grammar; // optional BNF-like grammar to constrain sampling
```
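For orientation, here is a minimal sketch of how a caller might override a few of the new `common_params_sampling` defaults shown above. Only the declarations from this hunk are assumed; the greedy settings themselves are illustrative.

```cpp
// Sketch only: relies on the common_params_sampling fields and
// common_sampler_type values declared in the hunk above.
#include "common.h"

static common_params_sampling make_greedy_sampling() {
    common_params_sampling sparams;   // defaults: top_k = 40, top_p = 0.95f, temp = 0.80f, ...
    sparams.temp     = 0.0f;          // <= 0.0 samples greedily, per the header comment
    sparams.top_k    = 1;
    sparams.samplers = { COMMON_SAMPLER_TYPE_TEMPERATURE };
    return sparams;
}
```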
```diff
@@ -141,21 +158,39 @@ struct gpt_sampler_params {
     std::string print() const;
 };
 
-struct
+struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_ctx = 0; // draft context size
+    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float p_split = 0.1f; // speculative decoding split probability
+    float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+};
+
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model = ""; // model path // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+};
+
+struct common_params {
     int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx =
+    int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -166,28 +201,35 @@ struct gpt_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold =
+    float defrag_thold = 0.1f; // KV cache defragmentation threshold
+
+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
+    struct common_params_vocoder vocoder;
 
     std::string model = ""; // model path // NOLINT
-    std::string
-    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_alias = ""; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
     std::string hf_repo = ""; // HF repo // NOLINT
@@ -197,7 +239,6 @@ struct gpt_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
-    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT
@@ -208,9 +249,9 @@ struct gpt_params {
     std::vector<llama_model_kv_override> kv_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
-    std::vector<
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -259,8 +300,8 @@ struct gpt_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
 
-
-
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT
@@ -268,21 +309,21 @@ struct gpt_params {
 
     // embedding
     bool embedding = false; // get only sentence embedding
-    int32_t embd_normalize = 2; // normalisation for
+    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-    std::string embd_sep = "\n"; // separator of
+    std::string embd_sep = "\n"; // separator of embeddings
     bool reranking = false; // enable reranking support on server
 
     // server params
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
@@ -290,7 +331,10 @@ struct gpt_params {
     std::string ssl_file_key = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT
 
-
+    // "advanced" endpoints are disabled by default for better security
+    bool webui = true;
+    bool endpoint_slots = false;
+    bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
     bool log_json = false;
@@ -345,20 +389,31 @@ struct gpt_params {
 
 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
-void
+void common_init();
 
-std::string
+std::string common_params_get_system_info(const common_params & params);
 
-bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);
 
 //
 // String utils
 //
 
-
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
```
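A short sketch of how the reorganized `common_params` might be filled in by a consumer of this header: the draft-model options now live in the nested `common_params_speculative` member rather than in top-level fields such as `n_draft`. Field names are taken from the hunks above; the values and the draft-model path are made up for illustration.

```cpp
// Sketch only: field names come from the hunks above; values are illustrative.
#include "common.h"

static common_params make_params() {
    common_params params;
    params.n_ctx             = 8192;                  // default is now 4096
    params.n_gpu_layers      = 99;                    // moved into the "offload params" block
    params.speculative.model = "models/draft.gguf";   // hypothetical draft model path
    params.speculative.n_max = 16;                    // max tokens to draft per step
    return params;
}
```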
```diff
@@ -367,6 +422,7 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
+    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
     std::vector<T> values;
     std::istringstream str_stream(str);
     std::string token;
@@ -379,6 +435,27 @@ static std::vector<T> string_split(const std::string & str, char delim) {
     return values;
 }
 
+template<>
+std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
+{
+    std::vector<std::string> parts;
+    size_t begin_pos = 0;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
+        parts.emplace_back(part);
+        begin_pos = separator_pos + 1;
+        separator_pos = input.find(separator, begin_pos);
+    }
+    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
+    return parts;
+}
+
+static bool string_starts_with(const std::string & str,
+                               const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
```
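The generic `string_split` now rejects `std::string` via `static_assert` and routes callers to the specialization above, which splits on a single separator character; `string_starts_with` stands in for C++20's `starts_with`. A quick usage sketch, assuming only this header:

```cpp
// Sketch only: exercises the string helpers defined in the hunk above.
#include "common.h"
#include <cassert>

static void string_utils_demo() {
    // "cpu,gpu,metal" -> {"cpu", "gpu", "metal"}
    std::vector<std::string> parts = string_split<std::string>("cpu,gpu,metal", ',');
    assert(parts.size() == 3 && parts[1] == "gpu");

    // prefix check without C++20
    assert(string_starts_with("--ctx-size", "--"));
}
```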
```diff
@@ -401,48 +478,69 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-struct
+struct common_init_result {
     struct llama_model * model = nullptr;
     struct llama_context * context = nullptr;
-    std::vector<
+    std::vector<common_lora_adapter_container> lora_adapters;
 };
 
-struct
+struct common_init_result common_init_from_params(common_params & params);
 
-struct llama_model_params
-struct llama_context_params
+struct llama_model_params common_model_params_to_llama ( common_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model *
-
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
 
+//
 // Batch utils
+//
 
-void
+void common_batch_clear(struct llama_batch & batch);
 
-void
+void common_batch_add(
     struct llama_batch & batch,
     llama_token id,
     llama_pos pos,
     const std::vector<llama_seq_id> & seq_ids,
     bool logits);
 
+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longet common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
 
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
     const struct llama_context * ctx,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
     const struct llama_model * model,
     const std::string & text,
     bool add_special,
@@ -450,7 +548,7 @@ std::vector<llama_token> llama_tokenize(
 
 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string
+std::string common_token_to_piece(
     const struct llama_context * ctx,
     llama_token token,
     bool special = true);
@@ -458,7 +556,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string
+std::string common_detokenize(
     llama_context * ctx,
     const std::vector<llama_token> & tokens,
     bool special = true);
```
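Taken together, these hunks rename the old `gpt_`/`llama_`-prefixed helpers to a `common_` prefix. Below is a minimal sketch of the resulting call flow for a downstream consumer such as this package's bindings; `params.prompt`, `llama_batch_init`, and `llama_batch_free` come from elsewhere in llama.cpp and are assumptions here, not part of this diff.

```cpp
// Sketch only: uses the declarations from the hunks above plus a few llama.h
// calls (llama_batch_init/llama_batch_free) that are not part of this diff.
#include "common.h"

static int run(common_params & params) {
    common_init();                                             // logging + build info
    common_init_result init = common_init_from_params(params); // replaces the old gpt_/llama_-prefixed init helper
    if (init.model == nullptr || init.context == nullptr) {
        return 1;
    }

    // encode the prompt and round-trip it back to text
    std::vector<llama_token> tokens = common_tokenize(init.context, params.prompt, /*add_special=*/true);
    std::string round_trip = common_detokenize(init.context, tokens);

    // queue the prompt for a single sequence with the renamed batch helpers
    llama_batch batch = llama_batch_init(/*n_tokens=*/512, /*embd=*/0, /*n_seq_max=*/1);
    common_batch_clear(batch);
    for (size_t i = 0; i < tokens.size(); ++i) {
        common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, i == tokens.size() - 1);
    }
    // ... llama_decode(init.context, batch), sampling, etc.
    llama_batch_free(batch);
    return 0;
}
```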
```diff
@@ -468,31 +566,31 @@ std::string llama_detokenize(
 //
 
 // same with llama_chat_message, but uses std::string
-struct
+struct common_chat_msg {
     std::string role;
     std::string content;
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool
+bool common_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string
+std::string common_chat_apply_template(const struct llama_model * model,
     const std::string & tmpl,
-    const std::vector<
+    const std::vector<common_chat_msg> & chat,
     bool add_ass);
 
 // Format single message, while taking into account the position of that message in chat history
-std::string
+std::string common_chat_format_single(const struct llama_model * model,
     const std::string & tmpl,
-    const std::vector<
-    const
+    const std::vector<common_chat_msg> & past_msg,
+    const common_chat_msg & new_msg,
     bool add_ass);
 
 // Returns an example of formatted chat
-std::string
+std::string common_chat_format_example(const struct llama_model * model,
     const std::string & tmpl);
 
 //
```
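A sketch of the renamed chat helpers in use. Passing an empty template string is assumed here to select the model's built-in template (with the chatml fallback mentioned in the header comment).

```cpp
// Sketch only: common_chat_msg and common_chat_apply_template are declared in
// the hunk above; the empty-tmpl / chatml-fallback behaviour is an assumption.
#include "common.h"

static std::string format_chat(const llama_model * model) {
    std::vector<common_chat_msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    // add_ass = true appends the assistant prefix so generation can start right after it
    return common_chat_apply_template(model, /*tmpl=*/"", chat, /*add_ass=*/true);
}
```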
```diff
@@ -500,31 +598,32 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //
 
 // Dump the KV cache view with the number of sequences per cell.
-void
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
 
 //
 // Embedding utils
 //
 
-
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
-float
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
 //
 // Control vector utils
 //
 
-struct
+struct common_control_vector_data {
     int n_embd;
 
     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };
 
-struct
+struct common_control_vector_load_info {
     float strength;
 
     std::string fname;
@@ -532,7 +631,7 @@ struct llama_control_vector_load_info {
 
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
 
 //
 // Split utils
@@ -541,15 +640,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
```