@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/quantize/CMakeLists.txt

@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
     { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+    { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
+    { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },

@@ -61,6 +63,16 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";

+static bool striequals(const char * a, const char * b) {
+    while (*a && *b) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+        a++; b++;
+    }
+    return *a == *b;
+}
+
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;

@@ -68,7 +80,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
         ftype_str.push_back(std::toupper(ch));
     }
     for (auto & it : QUANT_OPTIONS) {
-        if (it.name
+        if (striequals(it.name.c_str(), ftype_str.c_str())) {
             ftype = it.ftype;
             ftype_str_out = it.name;
             return true;

@@ -91,7 +103,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }

 // usage:
-// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {

@@ -104,7 +116,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf(" --keep-split: will generate
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");

@@ -223,15 +235,15 @@ static int prepare_imatrix(const std::string & imatrix_file,
 }

 static ggml_type parse_ggml_type(const char * arg) {
-
-
-        auto type = ggml_type(j);
+    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
+        auto type = (ggml_type)i;
         const auto * name = ggml_type_name(type);
-        if (name &&
-
+        if (name && striequals(name, arg)) {
+            return type;
         }
     }
-
+    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    return GGML_TYPE_COUNT;
 }

 int main(int argc, char ** argv) {

@@ -252,12 +264,18 @@ int main(int argc, char ** argv) {
         } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
             if (arg_idx < argc-1) {
                 params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.output_tensor_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
         } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
             if (arg_idx < argc-1) {
                 params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.token_embedding_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
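
The quantize.cpp changes above add the TQ1_0/TQ2_0 ternary types, make ftype and ggml_type name matching case-insensitive via the new striequals() helper, and make parse_ggml_type() reject unknown --output-tensor-type / --token-embedding-type values instead of accepting them silently. A minimal standalone sketch of the same matching behaviour (the main() here is illustrative only, not taken from the package):

    #include <cctype>
    #include <cstdio>

    // Case-insensitive ASCII comparison, mirroring the striequals() helper added in quantize.cpp.
    static bool striequals(const char * a, const char * b) {
        while (*a && *b) {
            if (std::tolower(*a) != std::tolower(*b)) {
                return false;
            }
            a++; b++;
        }
        return *a == *b;
    }

    int main() {
        std::printf("%d\n", striequals("q4_k_m", "Q4_K_M")); // 1: type names now match regardless of case
        std::printf("%d\n", striequals("q4_k_m", "Q4_K_S")); // 0: unknown names fail and trigger usage()
        return 0;
    }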

package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp

@@ -1,7 +1,7 @@
-#define LLAMA_API_INTERNAL
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-impl.h"

 #include <algorithm>
 #include <cassert>

@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }

 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {

@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
         if (use_reference) {
             qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
         } else {
-
+            qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
         }
         qfns.to_float(quantized_scratch, output_scratch, chunk_size);

@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(

 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {

@@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;

     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
             output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
-        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
             &quantized_scratch, &output_scratch, chunk_size] () {
             error_stats local_stats {};
             while (true) {

@@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
                 }
                 lock.unlock();
                 uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                     quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
         };

@@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
     }

     auto cparams = llama_context_default_params();
-    cparams.n_ctx
-    cparams.seed = 1;
+    cparams.n_ctx = 256;

     ctx = llama_new_context_with_model(model, cparams);

@@ -372,8 +371,9 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
         }
-
-
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
            if (params.verbose) {
                printf("testing %s ...\n", ggml_type_name(type));
            }

@@ -394,7 +394,7 @@ int main(int argc, char ** argv) {
            test_roundtrip_on_layer(
                layer_name,
                params.per_layer_stats,
-               qfns,
+               *qfns, *qfns_cpu,
                params.reference,
                kv_tensor.second,
                input_scratch,
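
The quantize-stats changes track the ggml CPU-backend split: from_float now comes from the CPU type traits returned by ggml_get_type_traits_cpu(), while to_float (and from_float_ref) stay on ggml_get_type_traits(). A rough round-trip sketch under that assumption, against the ggml.h/ggml-cpu.h headers vendored in this package (buffer sizing via ggml_row_size is my choice, not something the diff shows):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "ggml.h"
    #include "ggml-cpu.h"

    int main() {
        const ggml_type type = GGML_TYPE_Q8_0;
        const int64_t n = 32; // one Q8_0 block

        const auto * traits     = ggml_get_type_traits(type);     // generic traits: to_float
        const auto * traits_cpu = ggml_get_type_traits_cpu(type); // CPU traits: from_float

        std::vector<float> src(n, 0.5f), dst(n, 0.0f);
        std::vector<char>  buf(ggml_row_size(type, n));           // quantized storage for n elements

        traits_cpu->from_float(src.data(), buf.data(), n);        // quantize with the CPU kernel
        traits->to_float(buf.data(), dst.data(), n);              // dequantize back to f32

        std::printf("round-trip of 0.5: %f\n", dst[0]);
        return 0;
    }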

package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -1,15 +1,16 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <algorithm>
 #include <fstream>
+#include <iostream> // TODO: remove me

-static void print_usage(int
-
-
-
-    LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+    LOG("\n");
 }

 struct chunk {

@@ -18,7 +19,7 @@ struct chunk {
     // original file position
     size_t filepos;
     // original text data
-    std::string textdata
+    std::string textdata;
     // tokenized text data
     std::vector<llama_token> tokens;
     // embedding

@@ -32,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     std::ifstream f(filename.c_str());

     if (!f.is_open()) {
-
+        LOG_ERR("could not open file %s\n", filename.c_str());
         return chunks;
     }

     chunk current_chunk;
     char buffer[1024];
     int64_t filepos = 0;
-    std::string current
+    std::string current;
     while (f.read(buffer, 1024)) {
         current += std::string(buffer, f.gcount());
         size_t pos;

@@ -76,7 +77,7 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
-
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -85,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     llama_kv_cache_clear(ctx);

     // run model
-
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
     if (llama_decode(ctx, batch) < 0) {
-
+        LOG_ERR("%s : failed to decode\n", __func__);
     }

     for (int i = 0; i < batch.n_tokens; i++) {

@@ -100,42 +101,41 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         if (embd == NULL) {
             embd = llama_get_embeddings_ith(ctx, i);
             if (embd == NULL) {
-
+                LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
                 continue;
             }
         }

         float * out = output + batch.seq_id[i][0] * n_embd;
-
+        common_embd_normalize(embd, out, n_embd);
     }
 }

 int main(int argc, char ** argv) {
-
+    common_params params;

-    if (!
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
         return 1;
     }

+    common_init();
+
     // For BERT models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
     params.embedding = true;

     if (params.chunk_size <= 0) {
-
+        LOG_ERR("chunk_size must be positive\n");
         return 1;
     }
     if (params.context_files.empty()) {
-
+        LOG_ERR("context_files must be specified\n");
         return 1;
     }

-
-
-    printf("processing files:\n");
+    LOG_INF("processing files:\n");
     for (auto & context_file : params.context_files) {
-
+        LOG_INF("%s\n", context_file.c_str());
     }

     std::vector<chunk> chunks;

@@ -143,18 +143,19 @@ int main(int argc, char ** argv) {
         std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
     }
-
+    LOG_INF("Number of chunks: %ld\n", chunks.size());

     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
+
     if (model == NULL) {
-
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }

@@ -163,19 +164,19 @@ int main(int argc, char ** argv) {

     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-
+        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
         return 1;
     }

     if (n_ctx > n_ctx_train) {
-
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
             __func__, n_ctx_train, n_ctx);
     }

     // print system information
     {
-
-
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

     // max batch size

@@ -184,9 +185,9 @@ int main(int argc, char ** argv) {

     // tokenize the prompts and trim
     for (auto & chunk : chunks) {
-        auto inp =
+        auto inp = common_tokenize(ctx, chunk.textdata, true, false);
         if (inp.size() > n_batch) {
-
+            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                 __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }

@@ -200,12 +201,12 @@ int main(int argc, char ** argv) {
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) chunks.size(); i++) {
-
-
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-
+                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
             }
-
+            LOG_INF("\n\n");
         }
     }

@@ -231,7 +232,7 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
             batch_decode(ctx, batch, out, s, n_embd);
-
+            common_batch_clear(batch);
             p += s;
             s = 0;
         }

@@ -252,26 +253,27 @@ int main(int argc, char ** argv) {
         chunks[i].tokens.clear();
     }

+    struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+
     // start loop, receive query and return top k similar chunks based on cosine similarity
     std::string query;
     while (true) {
-
+        LOG("Enter query: ");
         std::getline(std::cin, query);
-        std::vector<int32_t> query_tokens =
+        std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);

-        struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
         batch_add_seq(query_batch, query_tokens, 0);

         std::vector<float> query_emb(n_embd, 0);
         batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);

-
+        common_batch_clear(query_batch);

         // compute cosine similarities
         {
             std::vector<std::pair<int, float>> similarities;
             for (int i = 0; i < n_chunks; i++) {
-                float sim =
+                float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
                 similarities.push_back(std::make_pair(i, sim));
             }

@@ -280,19 +282,22 @@ int main(int argc, char ** argv) {
                 return a.second > b.second;
             });

-
+            LOG("Top %d similar chunks:\n", params.sparams.top_k);
             for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
-
-
-
-
-
+                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                LOG("similarity: %f\n", similarities[i].second);
+                LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                LOG("--------------------\n");
             }
         }
     }

+    LOG("\n");
+    llama_perf_context_print(ctx);
+
     // clean up
-
+    llama_batch_free(query_batch);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
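
retrieval.cpp is ported here from the old gpt_params/LOG_TEE helpers to the renamed common_* API split across arg.h, common.h and log.h. A trimmed-down sketch of that bootstrap pattern, mirroring only calls visible in the diff above (error handling shortened; not a drop-in replacement for the example):

    #include "arg.h"
    #include "common.h"
    #include "log.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        common_params params;

        // parse the example-specific CLI flags; fails (and prints usage) on bad input
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, nullptr)) {
            return 1;
        }

        common_init(); // set up the common logging/runtime state

        llama_backend_init();
        llama_numa_init(params.numa);

        // model and context are now created together
        common_init_result llama_init = common_init_from_params(params);
        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;

        if (model == NULL) {
            LOG_ERR("%s: unable to load model\n", __func__);
            return 1;
        }

        LOG_INF("%s: model and context ready\n", __func__);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }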

package/src/llama.cpp/examples/rpc/rpc-server.cpp

@@ -1,3 +1,5 @@
+#include "ggml-cpu.h"
+
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif

@@ -6,6 +8,10 @@
 #include "ggml-metal.h"
 #endif

+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 # include <windows.h>

@@ -16,7 +22,7 @@
 #include <stdio.h>

 struct rpc_server_params {
-    std::string host = "
+    std::string host = "127.0.0.1";
     int port = 50052;
     size_t backend_mem = 0;
 };

@@ -79,6 +85,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#elif GGML_USE_VULKAN
+    fprintf(stderr, "%s: using Vulkan backend\n", __func__);
+    backend = ggml_backend_vk_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
+    }
 #endif

     // if there aren't GPU Backends fallback to CPU backend

@@ -92,6 +104,8 @@ static ggml_backend_t create_backend() {
 static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_VULKAN
+    ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
 #else
 #ifdef _WIN32
     MEMORYSTATUSEX status;

@@ -114,6 +128,17 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "Invalid parameters\n");
         return 1;
     }
+
+    if (params.host != "127.0.0.1") {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+        fprintf(stderr, " Never expose the RPC server to an open network!\n");
+        fprintf(stderr, " This is an experimental feature and is not secure!\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "\n");
+    }
+
     ggml_backend_t backend = create_backend();
     if (!backend) {
         fprintf(stderr, "Failed to create backend\n");

@@ -128,7 +153,7 @@ int main(int argc, char * argv[]) {
         get_backend_memory(&free_mem, &total_mem);
     }
     printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
 }
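
The rpc-server changes add a Vulkan code path and switch the default bind address to 127.0.0.1, warning loudly when anything else is used. A minimal sketch of starting a CPU-backed RPC server on loopback, assuming the ggml headers vendored in this package (the advertised memory figure is a placeholder, not taken from the diff):

    #include <cstdio>
    #include <string>

    #include "ggml-backend.h"
    #include "ggml-cpu.h"
    #include "ggml-rpc.h"

    int main() {
        // GPU backends are chosen at compile time in create_backend(); the CPU backend is always available.
        ggml_backend_t backend = ggml_backend_cpu_init();
        if (!backend) {
            std::fprintf(stderr, "failed to create backend\n");
            return 1;
        }

        // bind to loopback only, matching the new default; never expose the server to an open network
        const std::string endpoint = "127.0.0.1:50052";
        const size_t mem = 1024u * 1024u * 1024u; // advertise 1 GiB to clients (placeholder figure)

        std::printf("Starting RPC server on %s\n", endpoint.c_str());
        ggml_backend_rpc_start_server(backend, endpoint.c_str(), mem, mem); // blocks while serving requests

        ggml_backend_free(backend);
        return 0;
    }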