@fugood/llama.node 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +1 -1
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
|
@@ -12,25 +12,25 @@ extern "C" {
|
|
|
12
12
|
#endif
|
|
13
13
|
|
|
14
14
|
// Quantization
|
|
15
|
-
void
|
|
16
|
-
void
|
|
17
|
-
void
|
|
18
|
-
void
|
|
19
|
-
void
|
|
20
|
-
void
|
|
21
|
-
|
|
22
|
-
void
|
|
23
|
-
void
|
|
24
|
-
void
|
|
25
|
-
void
|
|
26
|
-
void
|
|
27
|
-
void
|
|
28
|
-
|
|
29
|
-
void
|
|
30
|
-
void
|
|
31
|
-
void
|
|
32
|
-
void
|
|
33
|
-
void
|
|
15
|
+
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
|
16
|
+
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
|
17
|
+
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
|
18
|
+
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
|
19
|
+
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
|
20
|
+
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
|
21
|
+
|
|
22
|
+
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
|
23
|
+
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
|
24
|
+
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
|
25
|
+
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
|
26
|
+
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
|
27
|
+
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
|
28
|
+
|
|
29
|
+
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
|
30
|
+
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
|
31
|
+
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
|
32
|
+
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
|
33
|
+
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
|
34
34
|
|
|
35
35
|
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
36
36
|
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
@@ -130,4 +130,3 @@ void iq3xs_free_impl(int grid_size);
|
|
|
130
130
|
#ifdef __cplusplus
|
|
131
131
|
}
|
|
132
132
|
#endif
|
|
133
|
-
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
#include <string>
|
|
7
7
|
#include <vector>
|
|
8
8
|
#include <memory>
|
|
9
|
+
#include <mutex>
|
|
9
10
|
#include <unordered_map>
|
|
10
11
|
#include <unordered_set>
|
|
11
12
|
#ifdef _WIN32
|
|
@@ -47,6 +48,7 @@ struct socket_t {
|
|
|
47
48
|
sockfd_t fd;
|
|
48
49
|
socket_t(sockfd_t fd) : fd(fd) {}
|
|
49
50
|
~socket_t() {
|
|
51
|
+
GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
|
|
50
52
|
#ifdef _WIN32
|
|
51
53
|
closesocket(this->fd);
|
|
52
54
|
#else
|
|
@@ -71,9 +73,13 @@ struct rpc_tensor {
|
|
|
71
73
|
uint64_t view_offs;
|
|
72
74
|
uint64_t data;
|
|
73
75
|
char name[GGML_MAX_NAME];
|
|
76
|
+
|
|
77
|
+
char padding[4];
|
|
74
78
|
};
|
|
75
79
|
#pragma pack(pop)
|
|
76
80
|
|
|
81
|
+
static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
|
|
82
|
+
|
|
77
83
|
// RPC commands
|
|
78
84
|
enum rpc_cmd {
|
|
79
85
|
ALLOC_BUFFER = 0,
|
|
@@ -97,7 +103,7 @@ static ggml_guid_t ggml_backend_rpc_guid() {
|
|
|
97
103
|
}
|
|
98
104
|
|
|
99
105
|
struct ggml_backend_rpc_buffer_type_context {
|
|
100
|
-
std::
|
|
106
|
+
std::string endpoint;
|
|
101
107
|
std::string name;
|
|
102
108
|
size_t alignment;
|
|
103
109
|
size_t max_size;
|
|
@@ -106,8 +112,6 @@ struct ggml_backend_rpc_buffer_type_context {
|
|
|
106
112
|
struct ggml_backend_rpc_context {
|
|
107
113
|
std::string endpoint;
|
|
108
114
|
std::string name;
|
|
109
|
-
std::shared_ptr<socket_t> sock;
|
|
110
|
-
ggml_backend_buffer_type_t buft;
|
|
111
115
|
};
|
|
112
116
|
|
|
113
117
|
struct ggml_backend_rpc_buffer_context {
|
|
@@ -231,14 +235,13 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
|
|
231
235
|
return true;
|
|
232
236
|
}
|
|
233
237
|
|
|
234
|
-
static bool parse_endpoint(const
|
|
235
|
-
|
|
236
|
-
size_t pos = str.find(':');
|
|
238
|
+
static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
|
|
239
|
+
size_t pos = endpoint.find(':');
|
|
237
240
|
if (pos == std::string::npos) {
|
|
238
241
|
return false;
|
|
239
242
|
}
|
|
240
|
-
host =
|
|
241
|
-
port = std::stoi(
|
|
243
|
+
host = endpoint.substr(0, pos);
|
|
244
|
+
port = std::stoi(endpoint.substr(pos + 1));
|
|
242
245
|
return true;
|
|
243
246
|
}
|
|
244
247
|
|
|
@@ -273,6 +276,44 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
|
|
273
276
|
|
|
274
277
|
// RPC client-side implementation
|
|
275
278
|
|
|
279
|
+
static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
|
280
|
+
static std::mutex mutex;
|
|
281
|
+
std::lock_guard<std::mutex> lock(mutex);
|
|
282
|
+
static std::unordered_map<std::string, std::weak_ptr<socket_t>> sockets;
|
|
283
|
+
static bool initialized = false;
|
|
284
|
+
|
|
285
|
+
auto it = sockets.find(endpoint);
|
|
286
|
+
if (it != sockets.end()) {
|
|
287
|
+
if (auto sock = it->second.lock()) {
|
|
288
|
+
return sock;
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
std::string host;
|
|
292
|
+
int port;
|
|
293
|
+
if (!parse_endpoint(endpoint, host, port)) {
|
|
294
|
+
return nullptr;
|
|
295
|
+
}
|
|
296
|
+
#ifdef _WIN32
|
|
297
|
+
if (!initialized) {
|
|
298
|
+
WSADATA wsaData;
|
|
299
|
+
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
|
300
|
+
if (res != 0) {
|
|
301
|
+
return nullptr;
|
|
302
|
+
}
|
|
303
|
+
initialized = true;
|
|
304
|
+
}
|
|
305
|
+
#else
|
|
306
|
+
UNUSED(initialized);
|
|
307
|
+
#endif
|
|
308
|
+
auto sock = socket_connect(host.c_str(), port);
|
|
309
|
+
if (sock == nullptr) {
|
|
310
|
+
return nullptr;
|
|
311
|
+
}
|
|
312
|
+
GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
|
313
|
+
sockets[endpoint] = sock;
|
|
314
|
+
return sock;
|
|
315
|
+
}
|
|
316
|
+
|
|
276
317
|
GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
|
277
318
|
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
|
278
319
|
return ctx->name.c_str();
|
|
@@ -442,7 +483,8 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|
|
442
483
|
std::vector<uint8_t> input(input_size, 0);
|
|
443
484
|
memcpy(input.data(), &size, sizeof(size));
|
|
444
485
|
std::vector<uint8_t> output;
|
|
445
|
-
|
|
486
|
+
auto sock = get_socket(buft_ctx->endpoint);
|
|
487
|
+
bool status = send_rpc_cmd(sock, ALLOC_BUFFER, input, output);
|
|
446
488
|
GGML_ASSERT(status);
|
|
447
489
|
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
|
|
448
490
|
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
|
@@ -453,7 +495,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
|
|
453
495
|
if (remote_ptr != 0) {
|
|
454
496
|
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
|
455
497
|
ggml_backend_rpc_buffer_interface,
|
|
456
|
-
new ggml_backend_rpc_buffer_context{
|
|
498
|
+
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
|
|
457
499
|
remote_size);
|
|
458
500
|
return buffer;
|
|
459
501
|
} else {
|
|
@@ -502,26 +544,15 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
|
|
|
502
544
|
return ggml_nbytes(tensor);
|
|
503
545
|
}
|
|
504
546
|
|
|
505
|
-
GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
506
|
-
if (!ggml_backend_is_rpc(backend)) {
|
|
507
|
-
return false;
|
|
508
|
-
}
|
|
509
|
-
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
|
510
|
-
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
|
511
|
-
return buft_ctx->sock == rpc_ctx->sock;
|
|
512
|
-
}
|
|
513
|
-
|
|
514
547
|
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
|
515
548
|
/* .get_name = */ ggml_backend_rpc_buffer_type_name,
|
|
516
549
|
/* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
|
|
517
550
|
/* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
|
|
518
551
|
/* .get_max_size = */ ggml_backend_rpc_get_max_size,
|
|
519
552
|
/* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
|
|
520
|
-
/* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
|
|
521
553
|
/* .is_host = */ NULL,
|
|
522
554
|
};
|
|
523
555
|
|
|
524
|
-
|
|
525
556
|
GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
|
526
557
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
|
527
558
|
|
|
@@ -530,16 +561,13 @@ GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
|
|
530
561
|
|
|
531
562
|
GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
|
532
563
|
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
|
533
|
-
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)rpc_ctx->buft->context;
|
|
534
|
-
delete buft_ctx;
|
|
535
|
-
delete rpc_ctx->buft;
|
|
536
564
|
delete rpc_ctx;
|
|
537
565
|
delete backend;
|
|
538
566
|
}
|
|
539
567
|
|
|
540
568
|
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
|
541
569
|
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
|
542
|
-
return ctx->
|
|
570
|
+
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
|
543
571
|
}
|
|
544
572
|
|
|
545
573
|
GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
|
@@ -575,9 +603,8 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
|
|
|
575
603
|
int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
|
|
576
604
|
output.resize(output_size, 0);
|
|
577
605
|
memcpy(output.data(), &n_nodes, sizeof(n_nodes));
|
|
578
|
-
uint64_t * out_nodes = (uint64_t *)(output.data() + sizeof(n_nodes));
|
|
579
606
|
for (uint32_t i = 0; i < n_nodes; i++) {
|
|
580
|
-
|
|
607
|
+
memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
|
|
581
608
|
}
|
|
582
609
|
uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
|
|
583
610
|
*out_ntensors = n_tensors;
|
|
@@ -590,7 +617,8 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
|
|
|
590
617
|
std::vector<uint8_t> input;
|
|
591
618
|
serialize_graph(cgraph, input);
|
|
592
619
|
std::vector<uint8_t> output;
|
|
593
|
-
|
|
620
|
+
auto sock = get_socket(rpc_ctx->endpoint);
|
|
621
|
+
bool status = send_rpc_cmd(sock, GRAPH_COMPUTE, input, output);
|
|
594
622
|
GGML_ASSERT(status);
|
|
595
623
|
GGML_ASSERT(output.size() == 1);
|
|
596
624
|
return (enum ggml_status)output[0];
|
|
@@ -599,8 +627,17 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
|
|
|
599
627
|
GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
|
600
628
|
UNUSED(backend);
|
|
601
629
|
UNUSED(op);
|
|
602
|
-
|
|
603
|
-
return
|
|
630
|
+
//TODO: call the remote backend and cache the results
|
|
631
|
+
return true;
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
|
635
|
+
if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
|
|
636
|
+
return false;
|
|
637
|
+
}
|
|
638
|
+
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
|
639
|
+
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
|
640
|
+
return buft_ctx->endpoint == rpc_ctx->endpoint;
|
|
604
641
|
}
|
|
605
642
|
|
|
606
643
|
static ggml_backend_i ggml_backend_rpc_interface = {
|
|
@@ -613,9 +650,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|
|
613
650
|
/* .synchronize = */ ggml_backend_rpc_synchronize,
|
|
614
651
|
/* .graph_plan_create = */ NULL,
|
|
615
652
|
/* .graph_plan_free = */ NULL,
|
|
653
|
+
/* .graph_plan_update = */ NULL,
|
|
616
654
|
/* .graph_plan_compute = */ NULL,
|
|
617
655
|
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
|
618
656
|
/* .supports_op = */ ggml_backend_rpc_supports_op,
|
|
657
|
+
/* .supports_buft = */ ggml_backend_rpc_supports_buft,
|
|
619
658
|
/* .offload_op = */ NULL,
|
|
620
659
|
/* .event_new = */ NULL,
|
|
621
660
|
/* .event_free = */ NULL,
|
|
@@ -624,65 +663,48 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|
|
624
663
|
/* .event_synchronize = */ NULL,
|
|
625
664
|
};
|
|
626
665
|
|
|
627
|
-
static std::unordered_map<std::string, ggml_backend_t> instances;
|
|
628
|
-
|
|
629
666
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
#ifdef _WIN32
|
|
640
|
-
{
|
|
641
|
-
WSADATA wsaData;
|
|
642
|
-
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
|
643
|
-
if (res != 0) {
|
|
644
|
-
return nullptr;
|
|
645
|
-
}
|
|
646
|
-
}
|
|
647
|
-
#endif
|
|
648
|
-
fprintf(stderr, "Connecting to %s\n", endpoint);
|
|
649
|
-
std::string host;
|
|
650
|
-
int port;
|
|
651
|
-
if (!parse_endpoint(endpoint, host, port)) {
|
|
652
|
-
return nullptr;
|
|
653
|
-
}
|
|
654
|
-
auto sock = socket_connect(host.c_str(), port);
|
|
667
|
+
static std::mutex mutex;
|
|
668
|
+
std::lock_guard<std::mutex> lock(mutex);
|
|
669
|
+
// NOTE: buffer types are allocated and never freed; this is by design
|
|
670
|
+
static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
|
|
671
|
+
auto it = buft_map.find(endpoint);
|
|
672
|
+
if (it != buft_map.end()) {
|
|
673
|
+
return it->second;
|
|
674
|
+
}
|
|
675
|
+
auto sock = get_socket(endpoint);
|
|
655
676
|
if (sock == nullptr) {
|
|
656
677
|
return nullptr;
|
|
657
678
|
}
|
|
658
679
|
size_t alignment = get_alignment(sock);
|
|
659
680
|
size_t max_size = get_max_size(sock);
|
|
660
681
|
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
|
661
|
-
/* .
|
|
662
|
-
/* .name
|
|
682
|
+
/* .endpoint = */ endpoint,
|
|
683
|
+
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
|
663
684
|
/* .alignment = */ alignment,
|
|
664
|
-
/* .max_size
|
|
685
|
+
/* .max_size = */ max_size
|
|
665
686
|
};
|
|
666
687
|
|
|
667
688
|
ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
|
|
668
689
|
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
|
669
690
|
/* .context = */ buft_ctx
|
|
670
691
|
};
|
|
692
|
+
buft_map[endpoint] = buft;
|
|
693
|
+
return buft;
|
|
694
|
+
}
|
|
671
695
|
|
|
696
|
+
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
|
672
697
|
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
|
673
|
-
/* .endpoint
|
|
674
|
-
/* .name
|
|
675
|
-
/* .sock = */ sock,
|
|
676
|
-
/* .buft = */ buft
|
|
698
|
+
/* .endpoint = */ endpoint,
|
|
699
|
+
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
|
677
700
|
};
|
|
678
701
|
|
|
679
|
-
|
|
702
|
+
ggml_backend_t backend = new ggml_backend {
|
|
680
703
|
/* .guid = */ ggml_backend_rpc_guid(),
|
|
681
704
|
/* .interface = */ ggml_backend_rpc_interface,
|
|
682
705
|
/* .context = */ ctx
|
|
683
706
|
};
|
|
684
|
-
|
|
685
|
-
return instances[endpoint];
|
|
707
|
+
return backend;
|
|
686
708
|
}
|
|
687
709
|
|
|
688
710
|
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
|
@@ -706,14 +728,13 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
|
|
706
728
|
}
|
|
707
729
|
|
|
708
730
|
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
|
709
|
-
|
|
710
|
-
if (
|
|
731
|
+
auto sock = get_socket(endpoint);
|
|
732
|
+
if (sock == nullptr) {
|
|
711
733
|
*free = 0;
|
|
712
734
|
*total = 0;
|
|
713
735
|
return;
|
|
714
736
|
}
|
|
715
|
-
|
|
716
|
-
get_device_memory(ctx->sock, free, total);
|
|
737
|
+
get_device_memory(sock, free, total);
|
|
717
738
|
}
|
|
718
739
|
|
|
719
740
|
// RPC server-side implementation
|
|
@@ -1018,7 +1039,9 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
|
|
|
1018
1039
|
}
|
|
1019
1040
|
std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
|
|
1020
1041
|
for (uint32_t i = 0; i < n_nodes; i++) {
|
|
1021
|
-
|
|
1042
|
+
int64_t id;
|
|
1043
|
+
memcpy(&id, &nodes[i], sizeof(id));
|
|
1044
|
+
graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
|
|
1022
1045
|
}
|
|
1023
1046
|
ggml_status status = ggml_backend_graph_compute(backend, graph);
|
|
1024
1047
|
// output serialization format: | status (1 byte) |
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
//
|
|
2
|
+
// MIT license
|
|
3
|
+
// Copyright (C) 2024 Intel Corporation
|
|
4
|
+
// SPDX-License-Identifier: MIT
|
|
5
|
+
//
|
|
6
|
+
|
|
7
|
+
//
|
|
8
|
+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
9
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
10
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
11
|
+
//
|
|
12
|
+
|
|
13
|
+
#ifndef GGML_SYCL_BACKEND_HPP
|
|
14
|
+
#define GGML_SYCL_BACKEND_HPP
|
|
15
|
+
|
|
16
|
+
#include "concat.hpp"
|
|
17
|
+
#include "common.hpp"
|
|
18
|
+
#include "convert.hpp"
|
|
19
|
+
#include "dequantize.hpp"
|
|
20
|
+
#include "dmmv.hpp"
|
|
21
|
+
#include "mmq.hpp"
|
|
22
|
+
#include "mmvq.hpp"
|
|
23
|
+
#include "rope.hpp"
|
|
24
|
+
#include "norm.hpp"
|
|
25
|
+
#include "softmax.hpp"
|
|
26
|
+
|
|
27
|
+
#endif // GGML_SYCL_BACKEND_HPP
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
//
|
|
2
|
+
// MIT license
|
|
3
|
+
// Copyright (C) 2024 Intel Corporation
|
|
4
|
+
// SPDX-License-Identifier: MIT
|
|
5
|
+
//
|
|
6
|
+
|
|
7
|
+
//
|
|
8
|
+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
9
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
10
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
11
|
+
//
|
|
12
|
+
|
|
13
|
+
#include "common.hpp"
|
|
14
|
+
|
|
15
|
+
int get_current_device_id() {
|
|
16
|
+
return dpct::dev_mgr::instance().current_device_id();
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
void* ggml_sycl_host_malloc(size_t size) try {
|
|
20
|
+
if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
|
|
21
|
+
return nullptr;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
void* ptr = nullptr;
|
|
25
|
+
// allow to use dpct::get_in_order_queue() for host malloc
|
|
26
|
+
dpct::err0 err = CHECK_TRY_ERROR(
|
|
27
|
+
ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue()));
|
|
28
|
+
|
|
29
|
+
if (err != 0) {
|
|
30
|
+
// clear the error
|
|
31
|
+
fprintf(
|
|
32
|
+
stderr,
|
|
33
|
+
"WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
|
34
|
+
size / 1024.0 / 1024.0,
|
|
35
|
+
"syclGetErrorString is not supported");
|
|
36
|
+
return nullptr;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
return ptr;
|
|
40
|
+
} catch (sycl::exception const& exc) {
|
|
41
|
+
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
42
|
+
<< ", line:" << __LINE__ << std::endl;
|
|
43
|
+
std::exit(1);
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
void ggml_sycl_host_free(void* ptr) try {
|
|
47
|
+
// allow to use dpct::get_in_order_queue() for host malloc
|
|
48
|
+
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
|
|
49
|
+
} catch (sycl::exception const& exc) {
|
|
50
|
+
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
|
51
|
+
<< ", line:" << __LINE__ << std::endl;
|
|
52
|
+
std::exit(1);
|
|
53
|
+
}
|