@fugood/llama.node 0.2.3 → 0.3.1
This diff reflects the changes between publicly released versions of this package as they appear in the public registry, and is provided for informational purposes only.
- package/CMakeLists.txt +6 -3
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +3 -3
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0

package/src/llama.cpp/examples/server/utils.hpp

@@ -116,45 +116,37 @@ static inline void server_log(const char * level, const char * function, int lin
 // chat template utils
 //

-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-inline bool verify_custom_template(const std::string & tmpl) {
-    llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
-    return res >= 0;
-}
-
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+    std::vector<llama_chat_msg> chat;

     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
-        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
-        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
-        alloc_size += str[i*2 + 1].length();
-        chat[i].role = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
-    }

-
-
-
-
-
+        std::string role = json_value(curr_msg, "role", std::string(""));
+
+        std::string content;
+        if (curr_msg.contains("content")) {
+            if (curr_msg["content"].is_string()) {
+                content = curr_msg["content"].get<std::string>();
+            } else if (curr_msg["content"].is_array()) {
+                for (const auto & part : curr_msg["content"]) {
+                    if (part.contains("text")) {
+                        content += "\n" + part["text"].get<std::string>();
+                    }
+                }
+            } else {
+                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+            }
+        } else {
+            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+        }

-
-        if ((size_t) res > buf.size()) {
-            buf.resize(res);
-            res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+        chat.push_back({role, content});
     }

-
-
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
     return formatted_chat;
 }

@@ -260,6 +252,13 @@ static size_t common_part(const std::vector<llama_token> & a, const std::vector<
     return i;
 }

+static size_t common_part(const std::string & a, const std::string & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
 static bool ends_with(const std::string & str, const std::string & suffix) {
     return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }

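The refactored `format_chat` now collects messages into a `std::vector<llama_chat_msg>` and hands buffer management to the `llama_chat_apply_template` wrapper from the vendored `common` library. Below is a minimal sketch of how a caller can drive that path; only the `llama_chat_msg` fields and the wrapper call visible in the hunk above come from the package, the surrounding function and message text are illustrative assumptions.

```cpp
// Sketch only: exercises the llama_chat_msg + llama_chat_apply_template path
// shown in the hunk above. Assumes the "common" headers bundled in
// package/src/llama.cpp; the helper function itself is made up.
#include "common.h"
#include "llama.h"

#include <string>
#include <vector>

static std::string format_example(const llama_model * model) {
    std::vector<llama_chat_msg> chat;
    chat.push_back({"system", "You are a helpful assistant."});
    chat.push_back({"user",   "Hello!"});

    // An empty template string falls back to the chat template stored in the
    // model metadata; the last argument appends the assistant prefix so that
    // generation can start right after the formatted prompt.
    return llama_chat_apply_template(model, "", chat, /*add_ass=*/true);
}
```
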
package/src/llama.cpp/examples/simple/simple.cpp

@@ -6,28 +6,27 @@
 #include <string>
 #include <vector>

-
-
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);

-
-
-
-
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+    LOG_TEE("\n");
+}

-
-
-}
+int main(int argc, char ** argv) {
+    gpt_params params;

-
-
-}
+    params.prompt = "Hello my name is";
+    params.n_predict = 32;

-    if (params
-
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
     }

     // total length of the sequence including the prompt
-    const int
+    const int n_predict = params.n_predict;

     // init LLM

@@ -36,9 +35,7 @@ int main(int argc, char ** argv) {

     // initialize the model

-    llama_model_params model_params =
-
-    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);

     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -49,12 +46,7 @@ int main(int argc, char ** argv) {

     // initialize the context

-    llama_context_params ctx_params =
-
-    ctx_params.seed = 1234;
-    ctx_params.n_ctx = 2048;
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -69,14 +61,14 @@ int main(int argc, char ** argv) {
     tokens_list = ::llama_tokenize(ctx, params.prompt, true);

     const int n_ctx = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (
+    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());

-    LOG_TEE("\n%s:
+    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);

     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
         LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s: either reduce
+        LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
         return 1;
     }

@@ -115,7 +107,7 @@ int main(int argc, char ** argv) {

     const auto t_main_start = ggml_time_us();

-    while (n_cur <=
+    while (n_cur <= n_predict) {
         // sample the next token
         {
             auto n_vocab = llama_n_vocab(model);

@@ -134,7 +126,7 @@ int main(int argc, char ** argv) {
             const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

             // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id) || n_cur ==
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 LOG_TEE("\n");

                 break;

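The rewritten example derives all model and context settings from `gpt_params` instead of hard-coding them, which is why the seed, n_ctx, and thread assignments disappear above. A condensed sketch of that initialization pattern follows, assuming the `common` helpers of this vendored llama.cpp revision; the trimmed-down main() is illustrative and omits decoding and sampling.

```cpp
// Sketch of the gpt_params-driven setup used by the updated simple example.
// gpt_params_parse, llama_model_params_from_gpt_params and
// llama_context_params_from_gpt_params come from the bundled common library.
#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    params.prompt    = "Hello my name is";
    params.n_predict = 32;

    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_backend_init();

    // model and context parameters are filled in from the parsed gpt_params
    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);

    llama_context_params cparams = llama_context_params_from_gpt_params(params);
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... tokenize params.prompt, decode, and sample up to params.n_predict tokens ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```
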
package/src/llama.cpp/examples/sycl/CMakeLists.txt

@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT

-set(TARGET ls-sycl-device)
+set(TARGET llama-ls-sycl-device)
 add_executable(${TARGET} ls-sycl-device.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

|
|
|
1
|
+
|
|
2
|
+
# MIT license
|
|
3
|
+
# Copyright (C) 2024 Intel Corporation
|
|
4
|
+
# SPDX-License-Identifier: MIT
|
|
5
|
+
|
|
6
|
+
mkdir -p build
|
|
7
|
+
cd build
|
|
8
|
+
source /opt/intel/oneapi/setvars.sh
|
|
9
|
+
|
|
10
|
+
#for FP16
|
|
11
|
+
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
|
|
12
|
+
|
|
13
|
+
#for FP32
|
|
14
|
+
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
|
15
|
+
|
|
16
|
+
#build example/main
|
|
17
|
+
#cmake --build . --config Release --target main
|
|
18
|
+
|
|
19
|
+
#build example/llama-bench
|
|
20
|
+
#cmake --build . --config Release --target llama-bench
|
|
21
|
+
|
|
22
|
+
#build all binary
|
|
23
|
+
cmake --build . --config Release -j -v
|
|
package/src/llama.cpp/examples/sycl/run-llama2.sh

@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+source /opt/intel/oneapi/setvars.sh
+
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
+    GGML_SYCL_SINGLE_GPU=1
+else
+    GGML_SYCL_DEVICE=0
+    GGML_SYCL_SINGLE_GPU=0
+fi
+
+#export GGML_SYCL_DEBUG=1
+
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+    echo "use $GGML_SYCL_DEVICE as main GPU"
+    #use signle GPU only
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+fi
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+
+#use multiple GPUs with same max compute units
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0

package/src/llama.cpp/examples/sycl/win-build-sycl.bat

@@ -0,0 +1,33 @@
+
+:: MIT license
+:: Copyright (C) 2024 Intel Corporation
+:: SPDX-License-Identifier: MIT
+
+
+IF not exist build (mkdir build)
+cd build
+if %errorlevel% neq 0 goto ERROR
+
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+if %errorlevel% neq 0 goto ERROR
+
+:: for FP16
+:: faster for long-prompt inference
+:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+
+:: for FP32
+cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+if %errorlevel% neq 0 goto ERROR
+:: build example/main only
+:: make main
+
+:: build all binary
+cmake --build . -j
+if %errorlevel% neq 0 goto ERROR
+
+cd ..
+exit /B 0
+
+:ERROR
+echo comomand error: %errorlevel%
+exit /B %errorlevel%

package/src/llama.cpp/examples/sycl/win-run-llama2.bat

@@ -0,0 +1,9 @@
+:: MIT license
+:: Copyright (C) 2024 Intel Corporation
+:: SPDX-License-Identifier: MIT
+
+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+
+.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0

package/src/llama.cpp/examples/tokenize/tokenize.cpp

@@ -29,7 +29,9 @@ static void print_usage_information(const char * argv0, FILE * stream) {
     fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
     fprintf(stream, " --stdin read prompt from standard input.\n");
     fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    fprintf(stream, " --no-parse-special do not parse control tokens.\n");
     fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
+    fprintf(stream, " --show-count print the total number of tokens.\n");
 }

 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {

@@ -161,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
             printf(">");
             return;
         }
-
+        GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
     }

     LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));

@@ -194,7 +196,9 @@ int main(int raw_argc, char ** raw_argv) {
     // variables where to put any arguments we see.
     bool printing_ids = false;
     bool no_bos = false;
+    bool no_parse_special = false;
     bool disable_logging = false;
+    bool show_token_count = false;
     const char * model_path = NULL;
     const char * prompt_path = NULL;
     const char * prompt_arg = NULL;

@@ -227,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
         else if (arg == "--no-bos") {
             no_bos = true;
         }
+        else if (arg == "--no-parse-special") {
+            no_parse_special = true;
+        }
         else if (arg == "-p" || arg == "--prompt") {
             if (prompt_set) {
                 fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");

@@ -249,6 +256,9 @@ int main(int raw_argc, char ** raw_argv) {
         else if (arg == "--log-disable") {
             disable_logging = true;
         }
+        else if (arg == "--show-count") {
+            show_token_count = true;
+        }
         else {
             fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
             return 1;

@@ -354,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {

     const bool model_wants_add_bos = llama_should_add_bos_token(model);
     const bool add_bos = model_wants_add_bos && !no_bos;
+    const bool parse_special = !no_parse_special;

     std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, add_bos,
+    tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);

     if (printing_ids) {
         printf("[");

@@ -384,6 +395,9 @@ int main(int raw_argc, char ** raw_argv) {
         printf("]\n");
     }

+    if (show_token_count) {
+        printf("Total number of tokens: %ld\n", tokens.size());
+    }
     // silence valgrind
     llama_free(ctx);
     llama_free_model(model);

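The two new tokenizer flags map directly onto the tokenizer call: `--no-parse-special` flips the final `parse_special` argument of `::llama_tokenize`, and `--show-count` prints the size of the resulting token vector. Below is a minimal sketch of that logic, assuming the bundled `common` helpers; the wrapper function itself is illustrative and model loading is omitted.

```cpp
// Sketch: how --no-parse-special and --show-count feed into tokenization.
// ::llama_tokenize(model, text, add_special, parse_special) and
// llama_should_add_bos_token are the helpers referenced in the hunks above;
// tokenize_and_report is an illustrative wrapper, not part of the package.
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

static void tokenize_and_report(const llama_model * model, const std::string & prompt,
                                bool no_bos, bool no_parse_special) {
    const bool add_bos       = llama_should_add_bos_token(model) && !no_bos;
    const bool parse_special = !no_parse_special; // leave control tokens as plain text when disabled

    std::vector<llama_token> tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);

    // Equivalent of the new --show-count output.
    printf("Total number of tokens: %zu\n", tokens.size());
}
```
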
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -0,0 +1,253 @@
+cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
+project("ggml" C CXX)
+include(CheckIncludeFileCXX)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    set(GGML_STANDALONE ON)
+
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+    # configure project version
+    # TODO
+else()
+    set(GGML_STANDALONE OFF)
+endif()
+
+if (EMSCRIPTEN)
+    set(BUILD_SHARED_LIBS_DEFAULT OFF)
+
+    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
+else()
+    if (MINGW)
+        set(BUILD_SHARED_LIBS_DEFAULT OFF)
+    else()
+        set(BUILD_SHARED_LIBS_DEFAULT ON)
+    endif()
+endif()
+
+option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+
+#
+# option list
+#
+
+# TODO: mark all options as advanced when not GGML_STANDALONE
+
+if (APPLE)
+    set(GGML_METAL_DEFAULT ON)
+    set(GGML_BLAS_DEFAULT ON)
+    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
+else()
+    set(GGML_METAL_DEFAULT OFF)
+    set(GGML_BLAS_DEFAULT OFF)
+    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
+endif()
+
+if (CMAKE_CROSSCOMPILING)
+    set(GGML_NATIVE_DEFAULT OFF)
+else()
+    set(GGML_NATIVE_DEFAULT ON)
+endif()
+
+# general
+option(GGML_STATIC "ggml: static link libraries" OFF)
+option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
+option(GGML_LTO "ggml: enable link time optimization" OFF)
+option(GGML_CCACHE "ggml: use ccache if available" ON)
+
+# debug
+option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
+option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
+option(GGML_GPROF "ggml: enable gprof" OFF)
+
+# build
+option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
+
+# sanitizers
+option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
+option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
+option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
+
+# instruction set specific
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
+
+option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
+option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
+option(GGML_AVX512 "ggml: enable AVX512" OFF)
+option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
+option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
+option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
+option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
+if (NOT MSVC)
+    option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
+endif()
+option(GGML_LASX "ggml: enable lasx" ON)
+option(GGML_LSX "ggml: enable lsx" ON)
+option(GGML_SVE "ggml: enable SVE" OFF)
+
+if (WIN32)
+    set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
+endif()
+
+# ggml core
+set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+
+# 3rd party libs / backends
+option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
+option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
+set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
+    "ggml: BLAS library vendor")
+option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
+
+option(GGML_CUDA "ggml: use CUDA" OFF)
+option(GGML_MUSA "ggml: use MUSA" OFF)
+option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
+option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
+option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
+set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
+set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
+option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
+set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
+    "ggml: iters./thread per block for Q2_K/Q6_K")
+set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+    "ggml: max. batch size for using peer access")
+option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
+option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
+option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
+option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
+
+option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
+option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
+option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
+option(GGML_VULKAN "ggml: use Vulkan" OFF)
+option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
+option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
+option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
+option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
+option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
+option(GGML_KOMPUTE "ggml: use Kompute" OFF)
+option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
+option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
+option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
+option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
+set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+    "ggml: metal minimum macOS version")
+set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
+option(GGML_OPENMP "ggml: use OpenMP" ON)
+option(GGML_RPC "ggml: use RPC" OFF)
+option(GGML_SYCL "ggml: use SYCL" OFF)
+option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
+set (GGML_SYCL_TARGET "INTEL" CACHE STRING
+    "ggml: sycl target device")
+
+# extra artifacts
+option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
+option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
+
+#
+# dependencies
+#
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED true)
+
+if (GGML_SYCL)
+    set(CMAKE_CXX_STANDARD 17)
+else()
+    set(CMAKE_CXX_STANDARD 11)
+endif()
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
+find_package(Threads REQUIRED)
+
+#
+# build the library
+#
+
+add_subdirectory(src)
+
+#
+# tests and examples
+#
+
+if (GGML_BUILD_TESTS)
+    enable_testing()
+    add_subdirectory(tests)
+endif ()
+
+if (GGML_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+endif ()
+
+#
+# install
+#
+
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+# all public headers
+set(GGML_PUBLIC_HEADERS
+    include/ggml.h
+    include/ggml-alloc.h
+    include/ggml-backend.h
+    include/ggml-blas.h
+    include/ggml-cuda.h
+    include/ggml.h
+    include/ggml-kompute.h
+    include/ggml-metal.h
+    include/ggml-rpc.h
+    include/ggml-sycl.h
+    include/ggml-vulkan.h)
+
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+#if (GGML_METAL)
+#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
+#endif()
+install(TARGETS ggml PUBLIC_HEADER)
+
+if (BUILD_SHARED_LIBS)
+    install(TARGETS ggml LIBRARY)
+endif()
+
+if (GGML_METAL)
+    install(
+        FILES src/ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+    if (NOT GGML_METAL_EMBED_LIBRARY)
+        install(
+            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+        )
+    endif()
+endif()
+
+if (GGML_STANDALONE)
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+        @ONLY)
+
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
+        DESTINATION share/pkgconfig)
+endif()