@fugood/llama.node 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0

package/src/llama.cpp/include/llama-cpp.h (added)
@@ -0,0 +1,25 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include <memory>
+
+#include "llama.h"
+
+struct llama_model_deleter {
+    void operator()(llama_model * model) { llama_free_model(model); }
+};
+
+struct llama_context_deleter {
+    void operator()(llama_context * context) { llama_free(context); }
+};
+
+struct llama_sampler_deleter {
+    void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+};
+
+typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
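The new llama-cpp.h ships only the deleters and unique_ptr aliases shown above; it adds no other API. A minimal sketch of how downstream C++ code might lean on them for cleanup — the model path, default-parameter calls, and the greedy sampler below are illustrative, not taken from this package:

    #include "llama-cpp.h"

    int main() {
        // llama_model_ptr / llama_context_ptr / llama_sampler_ptr free the underlying
        // handles (llama_free_model / llama_free / llama_sampler_free) on scope exit
        llama_model_ptr model(llama_load_model_from_file("model.gguf", llama_model_default_params()));
        if (!model) {
            return 1;
        }

        llama_context_ptr ctx(llama_new_context_with_model(model.get(), llama_context_default_params()));
        llama_sampler_ptr smpl(llama_sampler_init_greedy());

        // ... pass model.get(), ctx.get() and smpl.get() to the C API as usual ...
        return 0; // no explicit llama_free* calls required
    }

The aliases only manage lifetime; the raw pointers obtained via get() are still what the C API consumes.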
package/src/llama.cpp/include/llama.h
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-backend.h"
 
 #include <stddef.h>
@@ -103,12 +104,15 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE       = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON    = 26,
+        LLAMA_VOCAB_PRE_TYPE_MINERVA      = 27,
     };
 
     enum llama_rope_type {
-        LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM =  0,
-        LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_NONE   = -1,
+        LLAMA_ROPE_TYPE_NORM   = 0,
+        LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
+        LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
     };
 
     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -170,9 +174,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_4      = 33, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_8      = 34, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // except 1d tensors
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_4    = 33, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_8    = 34, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_8_8    = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0        = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0        = 37, // except 1d tensors
 
@@ -184,7 +188,8 @@ extern "C" {
         LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
         LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
         LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
-        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
+        LLAMA_ROPE_SCALING_TYPE_LONGROPE    = 3,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
     };
 
     enum llama_pooling_type {
@@ -205,7 +210,7 @@ extern "C" {
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };
 
     // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -217,6 +222,7 @@ extern "C" {
 
     typedef struct llama_token_data_array {
         // TODO: consider SoA
+        // NOTE: this pointer can be modified by the samplers
         llama_token_data * data;
         size_t size;
         int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -232,8 +238,11 @@ extern "C" {
    // - token  : the token ids of the input (used when embd is NULL)
    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
    // - pos    : the positions of the respective token in the sequence
+   //            (if set to NULL, the token position will be tracked automatically by llama_decode)
    // - seq_id : the sequence to which the respective token belongs
+   //            (if set to NULL, the sequence ID will be assumed to be 0)
    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
+   //            (if set to NULL, only the logits for last token will be returned)
    //
    typedef struct llama_batch {
        int32_t n_tokens;
@@ -244,15 +253,6 @@ extern "C" {
        int32_t      *  n_seq_id;
        llama_seq_id ** seq_id;
        int8_t       *  logits; // TODO: rename this to "output"
-
-       // NOTE: helpers for smooth API transition - can be deprecated in the future
-       //       for future-proof code, use the above fields instead and ignore everything below
-       //
-       // pos[i] = all_pos_0 + i*all_pos_1
-       //
-       llama_pos    all_pos_0;  // used if pos == NULL
-       llama_pos    all_pos_1;  // used if pos == NULL
-       llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;
 
    enum llama_model_kv_override_type {
@@ -276,13 +276,13 @@ extern "C" {
    };
 
    struct llama_model_params {
+       // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+       ggml_backend_dev_t * devices;
+
        int32_t n_gpu_layers; // number of layers to store in VRAM
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
-       // main_gpu interpretation depends on split_mode:
-       // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
-       // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
-       // LLAMA_SPLIT_MODE_LAYER: ignored
+       // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
        int32_t main_gpu;
 
        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@@ -433,6 +433,7 @@ extern "C" {
    LLAMA_API bool llama_supports_mmap       (void);
    LLAMA_API bool llama_supports_mlock      (void);
    LLAMA_API bool llama_supports_gpu_offload(void);
+   LLAMA_API bool llama_supports_rpc        (void);
 
    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@@ -457,6 +458,7 @@ extern "C" {
    // Functions to access the model's GGUF metadata scalar values
    // - The functions return the length of the string on success, or -1 on failure
    // - The output string is always null-terminated and cleared on failure
+   // - When retrieving a string, an extra byte must be allocated to account for the null terminator
    // - GGUF array values are not supported by these functions
 
    // Get metadata value as a string by key name
@@ -480,9 +482,6 @@ extern "C" {
    // Returns the total number of parameters in the model
    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
-   // Get a llama model tensor
-   LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
-
    // Returns true if the model contains an encoder that requires llama_encode() call
    LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
 
@@ -673,6 +672,9 @@ extern "C" {
    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
 
+   // Check if the context supports KV cache shifting
+   LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+
    //
    // State / sessions
    //
@@ -775,15 +777,15 @@ extern "C" {
    // Decoding
    //
 
-   // Return batch for single sequence of tokens starting at pos_0
+   // Return batch for single sequence of tokens
+   // The sequence ID will be fixed to 0
+   // The position of the tokens will be tracked automatically by llama_decode
    //
    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
    //
    LLAMA_API struct llama_batch llama_batch_get_one(
                  llama_token * tokens,
-                     int32_t   n_tokens,
-                   llama_pos   pos_0,
-                llama_seq_id   seq_id);
+                     int32_t   n_tokens);
 
    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
    // Each token can be assigned up to n_seq_max sequence ids
@@ -803,7 +805,7 @@ extern "C" {
    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
    // Stores the encoder output internally for later use by the decoder cross-attention layers.
    //   0 - success
-   // < 0 - error
+   // < 0 - error. the KV cache state is restored to the state before this call
    LLAMA_API int32_t llama_encode(
            struct llama_context * ctx,
              struct llama_batch   batch);
@@ -811,7 +813,7 @@ extern "C" {
    // Positive return values does not mean a fatal error, but rather a warning.
    //   0 - success
    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-   // < 0 - error
+   // < 0 - error. the KV cache state is restored to the state before this call
    LLAMA_API int32_t llama_decode(
            struct llama_context * ctx,
              struct llama_batch   batch);
@@ -896,6 +898,7 @@ extern "C" {
    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+   LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -904,11 +907,17 @@ extern "C" {
    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
-   // Codellama infill tokens
-   LLAMA_API llama_token llama_token_prefix(const struct llama_model * model);
-   LLAMA_API llama_token llama_token_middle(const struct llama_model * model);
-   LLAMA_API llama_token llama_token_suffix(const struct llama_model * model);
-
+   // infill tokens
+   DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
+
+   LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
+   LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
 
    //
    // Tokenization
@@ -983,6 +992,9 @@ extern "C" {
                             char * buf,
                           int32_t   length);
 
+   // Get list of built-in chat templates
+   LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+
    //
    // Sampling API
    //
@@ -1067,12 +1079,13 @@ extern "C" {
 
    // available samplers:
 
-   LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
-   LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
+   LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
+   LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 
    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-   LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void);
+   DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
+       "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
 
    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1083,16 +1096,18 @@ extern "C" {
    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
    LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
 
-   /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-   LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
-
    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
    LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
+
+   /// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at it's original value, the rest are set to -inf
    LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
 
    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
 
+   /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
+   LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
+
    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1121,22 +1136,50 @@ extern "C" {
            const char * grammar_str,
            const char * grammar_root);
 
+   /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
    LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
-                            int32_t   n_vocab,
-                        llama_token   special_eos_id,
-                        llama_token   linefeed_id,
-                            int32_t   penalty_last_n,
-                              float   penalty_repeat,
-                              float   penalty_freq,
-                              float   penalty_present,
-                               bool   penalize_nl,
-                               bool   ignore_eos);
+           int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
+           float     penalty_repeat,   // 1.0 = disabled
+           float     penalty_freq,     // 0.0 = disabled
+           float     penalty_present); // 0.0 = disabled
+
+   /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+   LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+           const struct llama_model *  model,
+                              float    dry_multiplier,
+                              float    dry_base,
+                            int32_t    dry_allowed_length,
+                            int32_t    dry_penalty_last_n,
+                         const char ** seq_breakers,
+                             size_t    num_breakers);
 
    LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
                             int32_t   n_vocab,
                             int32_t   n_logit_bias,
            const llama_logit_bias * logit_bias);
 
+   // this sampler is meant to be used for fill-in-the-middle infilling
+   // it's supposed to be used after top_k + top_p sampling
+   //
+   // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
+   // 2. combine probs of tokens that have the same prefix
+   //
+   // example:
+   //
+   // - before:
+   //   "hel":   0.5
+   //   "hell":  0.2
+   //   "hello": 0.1
+   //   "dummy": 0.1
+   //
+   // - after:
+   //   "hel":   0.8
+   //   "dummy": 0.1
+   //
+   // 3. discard non-EOG tokens with low prob
+   // 4. if no tokens are left -> pick EOT
+   //
+   LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
 
    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
@@ -1208,8 +1251,6 @@ extern "C" {
    LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);
 
-   LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
-
 #ifdef __cplusplus
 }
 #endif
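Several of the llama.h hunks above are breaking API changes: llama_batch_get_one() loses its pos_0/seq_id arguments, llama_sampler_init_penalties() drops from nine parameters to four, tail-free sampling, llama_get_model_tensor() and llama_perf_dump_yaml() are removed, and DRY, XTC and infill samplers are added. A hedged sketch of what calling code could look like against the new header; model/context creation is assumed to have happened elsewhere, and all sampler values are placeholder tuning numbers:

    #include "llama.h"
    #include <vector>

    // sketch only: `model` and `ctx` come from the usual llama_load_model_from_file /
    // llama_new_context_with_model calls
    static llama_token sample_next(llama_model * model, llama_context * ctx,
                                   std::vector<llama_token> & prompt) {
        llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

        // new 4-argument penalties signature (n_vocab, special_eos_id, linefeed_id,
        // penalize_nl and ignore_eos are gone)
        llama_sampler_chain_add(chain, llama_sampler_init_penalties(64, 1.1f, 0.0f, 0.0f));

        // DRY and XTC samplers are new in this header
        const char * breakers[] = { "\n", ":", "\"", "*" };
        llama_sampler_chain_add(chain, llama_sampler_init_dry(model, 0.8f, 1.75f, 2, -1, breakers, 4));
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
        llama_sampler_chain_add(chain, llama_sampler_init_xtc(0.5f, 0.1f, 1, 1234));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(1234));

        // llama_batch_get_one no longer takes pos_0/seq_id: positions are tracked by
        // llama_decode and the sequence id is fixed to 0
        llama_batch batch = llama_batch_get_one(prompt.data(), (int32_t) prompt.size());
        if (llama_decode(ctx, batch) != 0) {
            // on error (< 0) the KV cache state is restored to the state before this call
            llama_sampler_free(chain);
            return -1;
        }

        llama_token next = llama_sampler_sample(chain, ctx, -1);
        llama_sampler_free(chain);
        return next;
    }

Note that penalize_nl and ignore_eos simply disappear from the penalties signature; the new header offers no replacement argument for them.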
package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp (added; whitespace-only test lines are shown empty)
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+Hello
+Hello
+__ggml_vocab_test__
+(
+__ggml_vocab_test__
+
+=
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+discards
+__ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out (added)
@@ -0,0 +1,46 @@
+ 2550 204 18430 377
+ 597 2768 298 8564
+
+ 1437
+ 1437 1437
+ 1437 1437 1437
+ 50117
+ 50118
+ 50140
+ 50140 50118
+ 50117 50118
+ 31414 232
+ 20920 232
+ 31414 623
+ 20920 623
+ 20920 623 328
+ 31414 6 232 328
+ 20920 6 232 328
+ 42 16 8103 18164 27 4 49317
+ 605 40976 262 10109 18474 385 29 36807 6455
+ 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328
+ 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171
+ 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43
+ 31414
+ 20920
+ 1437 20920
+ 1437 1437 20920
+ 1437 1437 1437 20920
+ 1437 1437 1437 20920 50118 1437 1437 1437 20920
+ 36
+ 50118 5457
+ 108 3567
+ 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772
+ 32376 12846
+ 246
+ 3103
+ 25631
+ 46152
+ 3103 25631
+ 46152 3103
+ 46152 25631
+ 46152 46152
+ 46152 3103 25631
+ 347 1376 2023 12410 102 16376 1376 2023 6382 90
+ 9553 5954
+ 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574
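The .inp/.out pair above follows the existing vocab-test convention: each chunk of the input file (delimited by __ggml_vocab_test__) must tokenize to the token IDs on the corresponding line of the .out file. Roughly the comparison the tokenizer tests perform, sketched against the public API (the helper name and buffer sizing are illustrative, not the actual test code):

    #include "llama.h"
    #include <string>
    #include <vector>

    // sketch: returns true if `text` tokenizes to `expected` for the given model
    static bool check_tokenization(const llama_model * model, const std::string & text,
                                   const std::vector<llama_token> & expected) {
        std::vector<llama_token> tokens(text.size() + 16); // generous upper bound for BPE output
        const int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                                         tokens.data(), (int32_t) tokens.size(),
                                         /*add_special=*/false, /*parse_special=*/false);
        if (n < 0) {
            return false; // buffer too small (|n| tokens would be needed)
        }
        tokens.resize(n);
        return tokens == expected;
    }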
package/src/llama.cpp/pocs/vdot/CMakeLists.txt
@@ -1,9 +1,9 @@
 set(TARGET llama-vdot)
 add_executable(${TARGET} vdot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
 set(TARGET llama-q8dot)
 add_executable(${TARGET} q8dot.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/pocs/vdot/q8dot.cpp (removed lines appear truncated in the source)
@@ -11,6 +11,7 @@
 #include <type_traits>
 
 #include <ggml.h>
+#include <ggml-cpu.h>
 
 constexpr int kVecSize = 1 << 16;
 
@@ -136,7 +137,7 @@ int main(int argc, char** argv) {
 
     auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
 
-    auto funcs =
+    const auto * funcs = ggml_get_type_traits_cpu(ggml_type);
 
     Stat simple, ggml;
 
@@ -156,8 +157,8 @@ int main(int argc, char** argv) {
 
         t1 = std::chrono::high_resolution_clock::now();
         float fs;
-        if (type == 0) funcs
-        else funcs
+        if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
+        else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
         t2 = std::chrono::high_resolution_clock::now();
         t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
         if (iloop > 3) ggml.addResult(fs, t);
package/src/llama.cpp/pocs/vdot/vdot.cpp (removed lines appear truncated in the source)
@@ -9,6 +9,7 @@
 #include <array>
 
 #include <ggml.h>
+#include <ggml-cpu.h>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -236,7 +237,7 @@ int main(int argc, char** argv) {
     int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
     int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
 
-    auto
+    const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
 
     std::vector<block_q4_0> q40;
     std::vector<block_q4_1> q41;
@@ -261,9 +262,9 @@ int main(int argc, char** argv) {
         // Note, we do not include this in the timing as in practical application
         // we already have the quantized model weights.
         if (useQ4_1) {
-
+            funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
         } else {
-
+            funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
         }
 
         // Now measure time the dot product needs using the "scalar" version above
@@ -282,10 +283,10 @@ int main(int argc, char** argv) {
             dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
         }
         else {
-            auto vdot =
-            vdot
-            if (useQ4_1)
-            else
+            const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
+            vdot->from_float(y1.data(), q8.data(), kVecSize);
+            if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
+            else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
         }
         sumq += result;
         t2 = std::chrono::high_resolution_clock::now();
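Both pocs now obtain their quantization routines through ggml_get_type_traits_cpu(), which returns a pointer whose from_float / vec_dot members replace the old value-returning traits. A standalone sketch of the same pattern under the new headers (vector sizes and the Q4_0 choice are arbitrary; error handling omitted):

    #include <ggml.h>
    #include <ggml-cpu.h>
    #include <cstdint>
    #include <vector>

    int main() {
        const int n = 32 * 64; // multiple of the Q4_0 block size (32)

        std::vector<float> a(n, 0.5f), b(n, 0.25f);

        // CPU type traits now come back as a pointer
        const auto * q4 = ggml_get_type_traits_cpu(GGML_TYPE_Q4_0);
        const auto * q8 = ggml_get_type_traits_cpu(q4->vec_dot_type); // companion type for the second operand

        std::vector<uint8_t> qa(ggml_row_size(GGML_TYPE_Q4_0, n));
        std::vector<uint8_t> qb(ggml_row_size(q4->vec_dot_type, n));

        q4->from_float(a.data(), qa.data(), n);   // quantize a to Q4_0
        q8->from_float(b.data(), qb.data(), n);   // quantize b to the vec_dot companion type

        float result = 0.0f;
        q4->vec_dot(n, &result, 0, qa.data(), 0, qb.data(), 0, 1);
        return result > 0.0f ? 0 : 1;
    }

vec_dot_type names the quantization expected for the second operand of vec_dot, which is why the sketch quantizes b through a second traits lookup, mirroring what vdot.cpp does above.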
package/src/llama.cpp/src/CMakeLists.txt
@@ -1,9 +1,4 @@
-# TODO: should not use this
-if (WIN32)
-    if (BUILD_SHARED_LIBS)
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
-endif()
+llama_add_compile_flags()
 
 #
 # libraries
@@ -23,11 +18,12 @@ add_library(llama
     )
 
 target_include_directories(llama PUBLIC . ../include)
-target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
+target_compile_features   (llama PUBLIC cxx_std_17) # don't bump
 
 target_link_libraries(llama PUBLIC ggml)
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    target_compile_definitions(llama PRIVATE LLAMA_BUILD)
+    target_compile_definitions(llama PUBLIC  LLAMA_SHARED)
 endif()