@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

```diff
@@ -23,6 +23,10 @@
 #include "ggml-cuda.h"
 #include "ggml-sycl.h"
 
+#ifdef GGML_USE_CANN
+#include "ggml-cann.h"
+#endif
+
 // utils
 static uint64_t get_time_ns() {
     using clock = std::chrono::high_resolution_clock;
@@ -41,20 +45,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
     return str.str();
 }
 
-template<class T>
-static std::vector<T> split(const std::string & str, char delim) {
-    std::vector<T> values;
-    std::istringstream str_stream(str);
-    std::string token;
-    while (std::getline(str_stream, token, delim)) {
-        T value;
-        std::istringstream token_stream(token);
-        token_stream >> value;
-        values.push_back(value);
-    }
-    return values;
-}
-
 template<typename T, typename F>
 static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
     std::vector<std::string> str_values;
@@ -134,22 +124,51 @@ static std::string get_gpu_info() {
             id += "/";
         }
     }
+#endif
+#ifdef GGML_USE_CANN
+    uint32_t count = ggml_backend_cann_get_device_count();
+    for (uint32_t i = 0; i < count; i++) {
+        char buf[128];
+        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
 #endif
     // TODO: other backends
     return id;
 }
 
 // command line params
-enum output_formats {CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
 
 static const char * output_format_str(output_formats format) {
     switch (format) {
+        case NONE:     return "none";
         case CSV:      return "csv";
         case JSON:     return "json";
         case MARKDOWN: return "md";
         case SQL:      return "sql";
-        default:
+        default: GGML_ABORT("invalid output format");
+    }
+}
+
+static bool output_format_from_str(const std::string & s, output_formats & format) {
+    if (s == "none") {
+        format = NONE;
+    } else if (s == "csv") {
+        format = CSV;
+    } else if (s == "json") {
+        format = JSON;
+    } else if (s == "md") {
+        format = MARKDOWN;
+    } else if (s == "sql") {
+        format = SQL;
+    } else {
+        return false;
     }
+    return true;
 }
 
 static const char * split_mode_str(llama_split_mode mode) {
@@ -157,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
         case LLAMA_SPLIT_MODE_NONE:  return "none";
         case LLAMA_SPLIT_MODE_LAYER: return "layer";
         case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default:
+        default: GGML_ABORT("invalid split mode");
     }
 }
 
@@ -178,6 +197,7 @@ struct cmd_params {
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
     std::vector<int> n_gpu_layers;
+    std::vector<std::string> rpc_servers;
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
@@ -189,30 +209,33 @@ struct cmd_params {
     int reps;
     bool verbose;
     output_formats output_format;
+    output_formats output_format_stderr;
 };
 
 static const cmd_params cmd_params_defaults = {
-    /* model
-    /* n_prompt
-    /* n_gen
-    /* n_pg
-    /* n_batch
-    /* n_ubatch
-    /* type_k
-    /* type_v
-    /* n_threads
-    /* n_gpu_layers
-    /*
-    /*
-    /*
-    /*
-    /*
-    /*
-    /*
-    /*
-    /*
-    /*
-    /*
+    /* model                */ {"models/7B/ggml-model-q4_0.gguf"},
+    /* n_prompt             */ {512},
+    /* n_gen                */ {128},
+    /* n_pg                 */ {},
+    /* n_batch              */ {2048},
+    /* n_ubatch             */ {512},
+    /* type_k               */ {GGML_TYPE_F16},
+    /* type_v               */ {GGML_TYPE_F16},
+    /* n_threads            */ {cpu_get_num_math()},
+    /* n_gpu_layers         */ {99},
+    /* rpc_servers          */ {""},
+    /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
+    /* main_gpu             */ {0},
+    /* no_kv_offload        */ {false},
+    /* flash_attn           */ {false},
+    /* tensor_split         */ {std::vector<float>(llama_max_devices(), 0.0f)},
+    /* use_mmap             */ {true},
+    /* embeddings           */ {false},
+    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
+    /* reps                 */ 5,
+    /* verbose              */ false,
+    /* output_format        */ MARKDOWN,
+    /* output_format_stderr */ NONE,
 };
 
 static void print_usage(int /* argc */, char ** argv) {
@@ -230,6 +253,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
     printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -240,6 +264,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
     printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+    printf("  -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
     printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
     printf("\n");
     printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -281,7 +306,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
     params.verbose = cmd_params_defaults.verbose;
     params.output_format = cmd_params_defaults.output_format;
+    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps = cmd_params_defaults.reps;
+    params.numa = cmd_params_defaults.numa;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -297,28 +324,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
             params.model.insert(params.model.end(), p.begin(), p.end());
         } else if (arg == "-p" || arg == "--n-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
         } else if (arg == "-n" || arg == "--n-gen") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
         } else if (arg == "-pg") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], ',');
+            auto p = string_split<std::string>(argv[i], ',');
             if (p.size() != 2) {
                 invalid_param = true;
                 break;
@@ -329,21 +356,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
         } else if (arg == "-ub" || arg == "--ubatch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
         } else if (arg == "-ctk" || arg == "--cache-type-k") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
             for (const auto & t : p) {
                 ggml_type gt = ggml_type_from_name(t);
@@ -359,7 +386,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
             std::vector<ggml_type> types;
             for (const auto & t : p) {
                 ggml_type gt = ggml_type_from_name(t);
@@ -375,21 +402,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
         } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = string_split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+        } else if (arg == "-rpc" || arg == "--rpc") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rpc_servers.push_back(argv[i]);
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<std::string>(argv[i], split_delim);
+            auto p = string_split<std::string>(argv[i], split_delim);
             std::vector<llama_split_mode> modes;
             for (const auto & m : p) {
                 llama_split_mode mode;
@@ -411,13 +444,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            params.main_gpu = split<int>(argv[i], split_delim);
+            params.main_gpu = string_split<int>(argv[i], split_delim);
         } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
         } else if (arg == "--numa") {
             if (++i >= argc) {
@@ -435,28 +468,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
             params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
         } else if (arg == "-mmp" || arg == "--mmap") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
             params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
         } else if (arg == "-embd" || arg == "--embeddings") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            auto p = split<bool>(argv[i], split_delim);
+            auto p = string_split<bool>(argv[i], split_delim);
             params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
         } else if (arg == "-ts" || arg == "--tensor-split") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            for (auto ts : split<std::string>(argv[i], split_delim)) {
+            for (auto ts : string_split<std::string>(argv[i], split_delim)) {
                 // split string by ; and /
                 const std::regex regex{R"([;/]+)"};
                 std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
@@ -484,18 +517,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            if (argv[i] == std::string("csv")) {
-                params.output_format = CSV;
-            } else if (argv[i] == std::string("json")) {
-                params.output_format = JSON;
-            } else if (argv[i] == std::string("md")) {
-                params.output_format = MARKDOWN;
-            } else if (argv[i] == std::string("sql")) {
-                params.output_format = SQL;
-            } else {
+            invalid_param = !output_format_from_str(argv[i], params.output_format);
+        } else if (arg == "-oe" || arg == "--output-err") {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
+            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
         } else if (arg == "-v" || arg == "--verbose") {
             params.verbose = true;
         } else {
@@ -519,6 +547,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
     if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
     if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+    if (params.rpc_servers.empty())  { params.rpc_servers = cmd_params_defaults.rpc_servers; }
     if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
     if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@@ -541,6 +570,7 @@ struct cmd_params_instance {
     ggml_type type_v;
     int n_threads;
     int n_gpu_layers;
+    std::string rpc_servers;
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
@@ -553,6 +583,9 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();
 
         mparams.n_gpu_layers = n_gpu_layers;
+        if (!rpc_servers.empty()) {
+            mparams.rpc_servers = rpc_servers.c_str();
+        }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
@@ -564,6 +597,7 @@ struct cmd_params_instance {
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model &&
                n_gpu_layers == other.n_gpu_layers &&
+               rpc_servers == other.rpc_servers &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               use_mmap == other.use_mmap &&
@@ -592,6 +626,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     // this ordering minimizes the number of times that each model needs to be reloaded
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
+    for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
@@ -618,6 +653,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
@@ -643,6 +679,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
@@ -668,6 +705,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_v       = */ tv,
                 /* .n_threads    = */ nt,
                 /* .n_gpu_layers = */ nl,
+                /* .rpc_servers  = */ rpc,
                 /* .split_mode   = */ sm,
                 /* .main_gpu     = */ mg,
                 /* .no_kv_offload= */ nkvo,
@@ -687,7 +725,6 @@ struct test {
     static const std::string build_commit;
     static const int build_number;
     static const bool cuda;
-    static const bool opencl;
     static const bool vulkan;
     static const bool kompute;
     static const bool metal;
@@ -703,6 +740,7 @@ struct test {
     int n_batch;
     int n_ubatch;
     int n_threads;
+    bool has_rpc;
    ggml_type type_k;
    ggml_type type_v;
    int n_gpu_layers;
@@ -728,6 +766,7 @@ struct test {
         n_batch = inst.n_batch;
         n_ubatch = inst.n_ubatch;
         n_threads = inst.n_threads;
+        has_rpc = !inst.rpc_servers.empty();
         type_k = inst.type_k;
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
@@ -775,9 +814,6 @@ struct test {
         if (cuda) {
             return GGML_CUDA_NAME;
         }
-        if (opencl) {
-            return "OpenCL";
-        }
         if (vulkan) {
             return "Vulkan";
         }
@@ -803,7 +839,7 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
+            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",
@@ -829,7 +865,7 @@ struct test {
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
+        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
@@ -858,8 +894,8 @@ struct test {
         }
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute),
-            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
+            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),
@@ -887,7 +923,6 @@ struct test {
 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
 const bool        test::cuda         = !!ggml_cpu_has_cuda();
-const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
 const bool        test::kompute      = !!ggml_cpu_has_kompute();
 const bool        test::metal        = !!ggml_cpu_has_metal();
@@ -1011,6 +1046,27 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "n_threads") {
+            return 7;
+        }
+        if (field == "n_batch") {
+            return 7;
+        }
+        if (field == "n_ubatch") {
+            return 8;
+        }
+        if (field == "type_k" || field == "type_v") {
+            return 6;
+        }
+        if (field == "split_mode") {
+            return 5;
+        }
+        if (field == "flash_attn") {
+            return 2;
+        }
+        if (field == "use_mmap") {
+            return 4;
+        }
         if (field == "test") {
             return 13;
         }
@@ -1138,6 +1194,9 @@ struct markdown_printer : public printer {
                 value = buf;
             } else if (field == "backend") {
                 value = test::get_backend();
+                if (t.has_rpc) {
+                    value += "+RPC";
+                }
             } else if (field == "test") {
                 if (t.n_prompt > 0 && t.n_gen == 0) {
                     snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
@@ -1254,6 +1313,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
     (void) user_data;
 }
 
+static std::unique_ptr<printer> create_printer(output_formats format) {
+    switch (format) {
+        case NONE:
+            return nullptr;
+        case CSV:
+            return std::unique_ptr<printer>(new csv_printer());
+        case JSON:
+            return std::unique_ptr<printer>(new json_printer());
+        case MARKDOWN:
+            return std::unique_ptr<printer>(new markdown_printer());
+        case SQL:
+            return std::unique_ptr<printer>(new sql_printer());
+    }
+    GGML_ABORT("fatal error");
+}
+
 int main(int argc, char ** argv) {
     // try to set locale for unicode characters in markdown
     setlocale(LC_CTYPE, ".UTF-8");
@@ -1280,26 +1355,18 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // initialize printer
-    std::unique_ptr<printer> p;
-    switch (params.output_format) {
-        case CSV:
-            p.reset(new csv_printer());
-            break;
-        case JSON:
-            p.reset(new json_printer());
-            break;
-        case MARKDOWN:
-            p.reset(new markdown_printer());
-            break;
-        case SQL:
-            p.reset(new sql_printer());
-            break;
-        default:
-            assert(false);
-            exit(1);
+    std::unique_ptr<printer> p     = create_printer(params.output_format);
+    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
+
+    if (p) {
+        p->fout = stdout;
+        p->print_header(params);
+    }
+
+    if (p_err) {
+        p_err->fout = stderr;
+        p_err->print_header(params);
     }
-    p->fout = stdout;
-    p->print_header(params);
 
     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
 
@@ -1357,7 +1424,15 @@ int main(int argc, char ** argv) {
             t.samples_ns.push_back(t_ns);
         }
 
-        p->print_test(t);
+        if (p) {
+            p->print_test(t);
+            fflush(p->fout);
+        }
+
+        if (p_err) {
+            p_err->print_test(t);
+            fflush(p_err->fout);
+        }
 
         llama_print_timings(ctx);
 
@@ -1366,7 +1441,13 @@ int main(int argc, char ** argv) {
 
     llama_free_model(lmodel);
 
-    p->print_footer();
+    if (p) {
+        p->print_footer();
+    }
+
+    if (p_err) {
+        p_err->print_footer();
+    }
 
     llama_backend_free();
 
```
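A note on the helper swap above: the local `split<T>` template deleted from llama-bench.cpp is replaced at each call site by a shared `string_split<T>` helper (the common headers change in the same release, per the `common/common.h +191 -77` entry in the file list). Below is a rough sketch of what those call sites rely on; it mirrors the deleted code rather than quoting the shipped implementation, which may differ in detail:

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Sketch of the string_split<T> helper the new call sites assume; it mirrors
// the local split<T> this diff deletes. The real definition lives in
// common/common.h and may differ in detail.
template <class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;  // parse each delimited token as a T
        values.push_back(value);
    }
    return values;
}

int main() {
    // mirrors how llama-bench parses "-p 512,1024" style argument lists
    for (int n : string_split<int>("512,1024", ',')) {
        std::cout << n << "\n";  // prints 512 then 1024
    }
    return 0;
}
```

The same `>>` extraction path handles the boolean lists too, which is why the diff can parse flag lists such as `-mmp 0,1` with this one helper.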
package/src/llama.cpp/examples/llama.android/app/build.gradle.kts (new file)

```diff
@@ -0,0 +1,65 @@
+plugins {
+    id("com.android.application")
+    id("org.jetbrains.kotlin.android")
+}
+
+android {
+    namespace = "com.example.llama"
+    compileSdk = 34
+
+    defaultConfig {
+        applicationId = "com.example.llama"
+        minSdk = 33
+        targetSdk = 34
+        versionCode = 1
+        versionName = "1.0"
+
+        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+        vectorDrawables {
+            useSupportLibrary = true
+        }
+    }
+
+    buildTypes {
+        release {
+            isMinifyEnabled = false
+            proguardFiles(
+                getDefaultProguardFile("proguard-android-optimize.txt"),
+                "proguard-rules.pro"
+            )
+        }
+    }
+    compileOptions {
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = "1.8"
+    }
+    buildFeatures {
+        compose = true
+    }
+    composeOptions {
+        kotlinCompilerExtensionVersion = "1.5.1"
+    }
+}
+
+dependencies {
+
+    implementation("androidx.core:core-ktx:1.12.0")
+    implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
+    implementation("androidx.activity:activity-compose:1.8.2")
+    implementation(platform("androidx.compose:compose-bom:2023.08.00"))
+    implementation("androidx.compose.ui:ui")
+    implementation("androidx.compose.ui:ui-graphics")
+    implementation("androidx.compose.ui:ui-tooling-preview")
+    implementation("androidx.compose.material3:material3")
+    implementation(project(":llama"))
+    testImplementation("junit:junit:4.13.2")
+    androidTestImplementation("androidx.test.ext:junit:1.1.5")
+    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+    androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
+    androidTestImplementation("androidx.compose.ui:ui-test-junit4")
+    debugImplementation("androidx.compose.ui:ui-tooling")
+    debugImplementation("androidx.compose.ui:ui-test-manifest")
+}
```
package/src/llama.cpp/examples/llama.android/build.gradle.kts (new file)

```diff
@@ -0,0 +1,6 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+plugins {
+    id("com.android.application") version "8.2.0" apply false
+    id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+    id("com.android.library") version "8.2.0" apply false
+}
```
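For context on the `rpc_servers` plumbing added to llama-bench above: the comma-separated endpoint list is handed straight to `llama_model_params` (`mparams.rpc_servers = rpc_servers.c_str()`). Here is a minimal sketch of a client of the bundled llama.cpp doing the same thing; the endpoint addresses and model path are placeholders, and it assumes a build with the RPC backend enabled:

```cpp
#include "llama.h"

// Sketch: pass RPC worker endpoints through llama_model_params, the same way
// the llama-bench change above does. The endpoints and model path below are
// placeholders, not real hosts or files.
int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;
    // comma-separated host:port list, forwarded to the RPC backend
    mparams.rpc_servers  = "192.0.2.1:50052,192.0.2.2:50052";

    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

In llama-bench itself this is what the new `-rpc` flag feeds, and runs that use it are marked by appending "+RPC" to the backend column, as the markdown-printer hunk above shows.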