@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/common/common.cpp:

@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <cinttypes>
+#include <climits>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
@@ -23,10 +24,10 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -362,10 +363,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
     return true;
 }
 
-void
+void common_init() {
     llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-        if (LOG_DEFAULT_LLAMA <=
-
+        if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+            common_log_add(common_log_main(), level, "%s", text);
         }
     }, NULL);
 
@@ -378,7 +379,7 @@ void gpt_init() {
     LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }
 
-std::string
+std::string common_params_get_system_info(const common_params & params) {
     std::ostringstream os;
 
     os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -400,17 +401,19 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
 // String utils
 //
 
-std::
-
-
-
-
-
-
-
-
-
-
+std::string string_format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
 }
 
 std::string string_strip(const std::string & str) {
@@ -493,7 +496,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
         first = false;
     }
 
-    auto detokenized =
+    auto detokenized = common_token_to_piece(ctx, token);
 
     detokenized.erase(
         std::remove_if(
@@ -524,7 +527,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
         first = false;
     }
 
-    auto detokenized =
+    auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
     detokenized.erase(
         std::remove_if(
@@ -533,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
             [](const unsigned char c) { return !std::isprint(c); }),
         detokenized.end());
 
-        buf << "\n"
-            << "
-            << "
-            << "
-            << "
-            << "
+        buf << "\n" << std::to_string(i)
+            << ", token '" << detokenized << "'"
+            << ", pos " << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id " << std::to_string(batch.seq_id[i][0])
+            << ", logits " << std::to_string(batch.logits[i]);
     }
 
     buf << " ]";
@@ -649,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {
 
     std::u32string filename_utf32;
     try {
+#if defined(__clang__)
+        // disable C++17 deprecation warning for std::codecvt_utf8
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
         filename_utf32 = converter.from_bytes(filename);
 
         // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
@@ -819,16 +832,16 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-struct
-
-    auto mparams =
+struct common_init_result common_init_from_params(common_params & params) {
+    common_init_result iparams;
+    auto mparams = common_model_params_to_llama(params);
 
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model =
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
     } else if (!params.model_url.empty()) {
-        model =
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -863,7 +876,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }
 
-    auto cparams =
+    auto cparams = common_context_params_to_llama(params);
 
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
@@ -872,11 +885,17 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         return iparams;
     }
 
+    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
+        llama_free_model(model);
+        return iparams;
+    }
+
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
 
-        const auto cvec =
+        const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
@@ -900,7 +919,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-
+        common_lora_adapter_container loaded_la;
         loaded_la.path = la.path;
        loaded_la.scale = la.scale;
         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -913,12 +932,31 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
     }
     if (!params.lora_init_without_apply) {
-
+        common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
-    if (params.
+    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.
+        params.sampling.ignore_eos = false;
+    }
+
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
     }
 
     if (params.warmup) {
@@ -939,7 +977,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     if (llama_model_has_encoder(model)) {
-        llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()
+        llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
         llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
         if (decoder_start_token_id == -1) {
             decoder_start_token_id = bos;
@@ -948,7 +986,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         tmp.push_back(decoder_start_token_id);
     }
     if (llama_model_has_decoder(model)) {
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)
+        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
     }
     llama_kv_cache_clear(lctx);
     llama_synchronize(lctx);
@@ -961,7 +999,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     return iparams;
 }
 
-void
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
     llama_lora_adapter_clear(ctx);
     for (auto & la : lora_adapters) {
         if (la.scale != 0.0f) {
@@ -970,9 +1008,12 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
         }
     }
 
-struct llama_model_params
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();
 
+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
@@ -993,36 +1034,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     return mparams;
 }
 
-
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Invalid cache type: " + s);
-}
-
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
     cparams.n_ctx = params.n_ctx;
@@ -1031,7 +1043,7 @@ struct llama_context_params llama_context_params_from_gpt_param(
     cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.cpuparams.n_threads;
     cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
-
+        params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1056,8 +1068,8 @@ struct llama_context_params llama_context_params_from_gpt_param(
         cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
     }
 
-    cparams.type_k =
-    cparams.type_v =
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;
 
     return cparams;
 }
@@ -1083,13 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-
-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
@@ -1112,8 +1118,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
     return false;
 }
 
-static bool
-
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1182,15 +1187,17 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     }
 
     // Send a HEAD request to retrieve the etag and last-modified headers
-    struct
+    struct common_load_model_from_url_headers {
         std::string etag;
         std::string last_modified;
     };
-
+
+    common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1326,18 +1333,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     return true;
 }
 
-struct llama_model *
-        const
-        const
-        const
+struct llama_model * common_load_model_from_url(
+        const std::string & model_url,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
-    if (
+    if (model_url.empty()) {
         LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }
 
-    if (!
+    if (!common_download_file(model_url, local_path, hf_token)) {
         return NULL;
     }
 
@@ -1348,9 +1355,9 @@ struct llama_model * llama_load_model_from_url(
         /*.no_alloc = */ true,
         /*.ctx = */ NULL,
     };
-    auto * ctx_gguf = gguf_init_from_file(
+    auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
     if (!ctx_gguf) {
-        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__,
+        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
         return NULL;
     }
 
@@ -1369,13 +1376,13 @@ struct llama_model * llama_load_model_from_url(
     // Verify the first split file format
     // and extract split URL and PATH prefixes
     {
-        if (!llama_split_prefix(split_prefix, sizeof(split_prefix),
-            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__,
+        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
             return NULL;
         }
 
-        if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+        if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
             return NULL;
         }
     }
@@ -1390,7 +1397,7 @@ struct llama_model * llama_load_model_from_url(
             char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
             llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
 
-            return
+            return common_download_file(split_url, split_path, hf_token);
         }, idx));
     }
 
@@ -1402,14 +1409,14 @@ struct llama_model * llama_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(
+    return llama_load_model_from_file(local_path.c_str(), params);
 }
 
-struct llama_model *
-        const
-        const
-        const
-        const
+struct llama_model * common_load_model_from_hf(
+        const std::string & repo,
+        const std::string & remote_path,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //
@@ -1423,27 +1430,27 @@ struct llama_model * llama_load_model_from_hf(
     std::string model_url = "https://huggingface.co/";
     model_url += repo;
     model_url += "/resolve/main/";
-    model_url +=
+    model_url += remote_path;
 
-    return
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
 #else
 
-struct llama_model *
-        const
-        const
-        const
+struct llama_model * common_load_model_from_url(
+        const std::string & /*model_url*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
        const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }
 
-struct llama_model *
-        const
-        const
-        const
-        const
+struct llama_model * common_load_model_from_hf(
+        const std::string & /*repo*/,
+        const std::string & /*remote_path*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
@@ -1455,11 +1462,11 @@ struct llama_model * llama_load_model_from_hf(
 // Batch utils
 //
 
-void
+void common_batch_clear(struct llama_batch & batch) {
     batch.n_tokens = 0;
 }
 
-void
+void common_batch_add(
         struct llama_batch & batch,
         llama_token id,
         llama_pos pos,
@@ -1478,19 +1485,79 @@ void llama_batch_add(
     batch.n_tokens++;
 }
 
+//
+// Token utils
+//
+
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
+
+    // get the lengths of the input sequences
+    size_t a_len = a.size();
+    size_t b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    size_t max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (size_t i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (size_t j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
+}
+
 //
 // Vocab utils
 //
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return
+    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
@@ -1509,7 +1576,7 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
     const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1525,7 +1592,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
     return piece;
 }
 
-std::string
+std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
     int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -1545,15 +1612,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 // Chat template utils
 //
 
-bool
+bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
-std::string
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<
+        const std::vector<common_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
@@ -1595,42 +1662,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     return formatted_chat;
 }
 
-std::string
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<
-        const
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
         bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = past_msg.empty() ? "" :
-    std::vector<
+    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+    std::vector<common_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
         ss << "\n";
     };
     // format chat with new_msg
     chat_new.push_back(new_msg);
-    auto fmt_new_msg =
+    auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
     // get the diff part
     ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
     return ss.str();
 }
 
-std::string
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl) {
-    std::vector<
+    std::vector<common_chat_msg> msgs = {
         {"system", "You are a helpful assistant"},
         {"user", "Hello"},
        {"assistant", "Hi there"},
         {"user", "How are you?"},
     };
-    return
+    return common_chat_apply_template(model, tmpl, msgs, true);
 }
 
 //
 // KV cache utils
 //
 
-void
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1653,7 +1720,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     printf("\n=== Done dumping\n");
 }
 
-void
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1705,7 +1772,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
 
-void
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
     double sum = 0.0;
 
     switch (embd_norm) {
@@ -1714,7 +1781,9 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
            for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i]))
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
@@ -1739,7 +1808,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
     }
 }
 
-float
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
     double sum = 0.0;
     double sum1 = 0.0;
     double sum2 = 0.0;
@@ -1765,8 +1834,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 // Control vector utils
 //
 
-static
-
+static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+    common_control_vector_data result = { -1, {} };
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
@@ -1850,11 +1919,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
     return result;
 }
 
-
-
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+    common_control_vector_data result = { -1, {} };
 
     for (const auto & info : load_infos) {
-        auto cur =
+        auto cur = common_control_vector_load_one(info);
 
         if (cur.n_embd == -1) {
             result.n_embd = -1;
@@ -1884,211 +1953,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
     return result;
 }
 
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%e, ", data[i]);
-    }
-    fprintf(stream, "%e]\n", data.back());
-}
-
-void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%d, ", data[i]);
-    }
-    fprintf(stream, "%d]\n", data.back());
-}
-
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
-    std::string data_str(data == NULL ? "" : data);
-
-    if (data_str.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    size_t pos_start = 0;
-    size_t pos_found = 0;
-
-    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
-        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-        data_str = "\"" + data_str + "\"";
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    if (data_str.find('\n') == std::string::npos) {
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    fprintf(stream, "%s: |\n", prop_name);
-    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-        pos_start = pos_found + 1;
-    }
-}
-
-void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
-        const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const auto & sparams = params.sparams;
-
-    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
-    fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
-    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
-    fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
-    fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
-    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
-#ifdef NDEBUG
-    fprintf(stream, "debug: false\n");
-#else
-    fprintf(stream, "debug: true\n");
-#endif // NDEBUG
-
-    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
-#ifdef __OPTIMIZE__
-    fprintf(stream, "optimize: true\n");
-#else
-    fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
-
-    fprintf(stream, "time: %s\n", timestamp.c_str());
-
-    fprintf(stream, "\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "# User Inputs #\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
-    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
-    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
-    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
-    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
-
-    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
-    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
-    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
-    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
-    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
-    fprintf(stream, "logit_bias:\n");
-    for (const auto & logit_bias : sparams.logit_bias) {
-        fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
-    }
-
-    fprintf(stream, "lora:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale == 1.0f) {
-            fprintf(stream, " - %s\n", la.path.c_str());
-        }
-    }
-    fprintf(stream, "lora_scaled:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale != 1.0f) {
-            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
-        }
-    }
-    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
-    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
-    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
-    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
-    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
-    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
-    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
-    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
-    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
-    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
-    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
-    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
-    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
-    fprintf(stream, "reverse_prompt:\n");
-    for (std::string ap : params.antiprompt) {
-        size_t pos = 0;
-        while ((pos = ap.find('\n', pos)) != std::string::npos) {
-            ap.replace(pos, 1, "\\n");
-            pos += 1;
-        }
-
-        fprintf(stream, " - %s\n", ap.c_str());
-    }
-
-    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
-
-    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
-    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
-}