@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/quantize/CMakeLists.txt

@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
     { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+    { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
+    { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },

@@ -61,6 +63,16 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";

+static bool striequals(const char * a, const char * b) {
+    while (*a && *b) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+        a++; b++;
+    }
+    return *a == *b;
+}
+
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;

@@ -68,7 +80,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
         ftype_str.push_back(std::toupper(ch));
     }
     for (auto & it : QUANT_OPTIONS) {
-        if (it.name
+        if (striequals(it.name.c_str(), ftype_str.c_str())) {
             ftype = it.ftype;
             ftype_str_out = it.name;
             return true;

@@ -91,7 +103,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 }

 // usage:
-// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {

@@ -104,7 +116,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf(" --keep-split: will generate
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");

@@ -223,15 +235,15 @@ static int prepare_imatrix(const std::string & imatrix_file,
 }

 static ggml_type parse_ggml_type(const char * arg) {
-
-
-        auto type = ggml_type(j);
+    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
+        auto type = (ggml_type)i;
         const auto * name = ggml_type_name(type);
-        if (name &&
-
+        if (name && striequals(name, arg)) {
+            return type;
         }
     }
-
+    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    return GGML_TYPE_COUNT;
 }

 int main(int argc, char ** argv) {

@@ -252,12 +264,18 @@ int main(int argc, char ** argv) {
         } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
             if (arg_idx < argc-1) {
                 params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.output_tensor_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
         } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
             if (arg_idx < argc-1) {
                 params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.token_embedding_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
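
The quantize.cpp changes above add the TQ1_0/TQ2_0 ternary types, make ftype and ggml_type name matching case-insensitive via the new striequals() helper, and make parse_ggml_type() reject unknown --output-tensor-type / --token-embedding-type values instead of accepting them silently. A minimal standalone sketch of the same matching behaviour (the main() here is illustrative only, not taken from the package):

    #include <cctype>
    #include <cstdio>

    // Case-insensitive ASCII comparison, mirroring the striequals() helper added in quantize.cpp.
    static bool striequals(const char * a, const char * b) {
        while (*a && *b) {
            if (std::tolower(*a) != std::tolower(*b)) {
                return false;
            }
            a++; b++;
        }
        return *a == *b;
    }

    int main() {
        std::printf("%d\n", striequals("q4_k_m", "Q4_K_M")); // 1: type names now match regardless of case
        std::printf("%d\n", striequals("q4_k_m", "Q4_K_S")); // 0: unknown names fail and trigger usage()
        return 0;
    }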

package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp

@@ -1,7 +1,7 @@
-#define LLAMA_API_INTERNAL
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-impl.h"

 #include <algorithm>
 #include <cassert>

@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }

 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {

@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
         if (use_reference) {
             qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
         } else {
-
+            qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
         }
         qfns.to_float(quantized_scratch, output_scratch, chunk_size);

@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(

 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {

@@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;

     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
             output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
-        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
             &quantized_scratch, &output_scratch, chunk_size] () {
             error_stats local_stats {};
             while (true) {

@@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
                 }
                 lock.unlock();
                 uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                     quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
         };

@@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
     }

     auto cparams = llama_context_default_params();
-    cparams.n_ctx
-    cparams.seed = 1;
+    cparams.n_ctx = 256;

     ctx = llama_new_context_with_model(model, cparams);

@@ -372,8 +371,9 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
         }
-
-
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
            if (params.verbose) {
                printf("testing %s ...\n", ggml_type_name(type));
            }

@@ -394,7 +394,7 @@ int main(int argc, char ** argv) {
            test_roundtrip_on_layer(
                layer_name,
                params.per_layer_stats,
-               qfns,
+               *qfns, *qfns_cpu,
                params.reference,
                kv_tensor.second,
                input_scratch,
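
The quantize-stats changes track the ggml CPU-backend split: from_float now comes from the CPU type traits returned by ggml_get_type_traits_cpu(), while to_float (and from_float_ref) stay on ggml_get_type_traits(). A rough round-trip sketch under that assumption, against the ggml.h/ggml-cpu.h headers vendored in this package (buffer sizing via ggml_row_size is my choice, not something the diff shows):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    #include "ggml.h"
    #include "ggml-cpu.h"

    int main() {
        const ggml_type type = GGML_TYPE_Q8_0;
        const int64_t n = 32; // one Q8_0 block

        const auto * traits     = ggml_get_type_traits(type);     // generic traits: to_float
        const auto * traits_cpu = ggml_get_type_traits_cpu(type); // CPU traits: from_float

        std::vector<float> src(n, 0.5f), dst(n, 0.0f);
        std::vector<char>  buf(ggml_row_size(type, n));           // quantized storage for n elements

        traits_cpu->from_float(src.data(), buf.data(), n);        // quantize with the CPU kernel
        traits->to_float(buf.data(), dst.data(), n);              // dequantize back to f32

        std::printf("round-trip of 0.5: %f\n", dst[0]);
        return 0;
    }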

package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -1,15 +1,16 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <algorithm>
 #include <fstream>
+#include <iostream> // TODO: remove me

-static void print_usage(int
-
-
-
-    LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+    LOG("\n");
 }

 struct chunk {

@@ -18,7 +19,7 @@ struct chunk {
     // original file position
     size_t filepos;
     // original text data
-    std::string textdata
+    std::string textdata;
     // tokenized text data
     std::vector<llama_token> tokens;
     // embedding

@@ -32,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     std::ifstream f(filename.c_str());

     if (!f.is_open()) {
-
+        LOG_ERR("could not open file %s\n", filename.c_str());
         return chunks;
     }

     chunk current_chunk;
     char buffer[1024];
     int64_t filepos = 0;
-    std::string current
+    std::string current;
     while (f.read(buffer, 1024)) {
         current += std::string(buffer, f.gcount());
         size_t pos;

@@ -76,7 +77,7 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
-
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -85,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     llama_kv_cache_clear(ctx);

     // run model
-
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
     if (llama_decode(ctx, batch) < 0) {
-
+        LOG_ERR("%s : failed to decode\n", __func__);
     }

     for (int i = 0; i < batch.n_tokens; i++) {

@@ -100,42 +101,41 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         if (embd == NULL) {
             embd = llama_get_embeddings_ith(ctx, i);
             if (embd == NULL) {
-
+                LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
                 continue;
             }
         }

         float * out = output + batch.seq_id[i][0] * n_embd;
-
+        common_embd_normalize(embd, out, n_embd);
     }
 }

 int main(int argc, char ** argv) {
-
+    common_params params;

-    if (!
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
         return 1;
     }

+    common_init();
+
     // For BERT models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
     params.embedding = true;

     if (params.chunk_size <= 0) {
-
+        LOG_ERR("chunk_size must be positive\n");
         return 1;
     }
     if (params.context_files.empty()) {
-
+        LOG_ERR("context_files must be specified\n");
         return 1;
     }

-
-
-    printf("processing files:\n");
+    LOG_INF("processing files:\n");
     for (auto & context_file : params.context_files) {
-
+        LOG_INF("%s\n", context_file.c_str());
     }

     std::vector<chunk> chunks;

@@ -143,18 +143,19 @@ int main(int argc, char ** argv) {
         std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
     }
-
+    LOG_INF("Number of chunks: %ld\n", chunks.size());

     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
+
     if (model == NULL) {
-
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }

@@ -163,19 +164,19 @@ int main(int argc, char ** argv) {

     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-
+        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
         return 1;
     }

     if (n_ctx > n_ctx_train) {
-
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
             __func__, n_ctx_train, n_ctx);
     }

     // print system information
     {
-
-
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

     // max batch size

@@ -184,9 +185,9 @@ int main(int argc, char ** argv) {

     // tokenize the prompts and trim
     for (auto & chunk : chunks) {
-        auto inp =
+        auto inp = common_tokenize(ctx, chunk.textdata, true, false);
         if (inp.size() > n_batch) {
-
+            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                 __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }

@@ -200,12 +201,12 @@ int main(int argc, char ** argv) {
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) chunks.size(); i++) {
-
-
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-
+                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
             }
-
+            LOG_INF("\n\n");
         }
     }

@@ -231,7 +232,7 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
             batch_decode(ctx, batch, out, s, n_embd);
-
+            common_batch_clear(batch);
             p += s;
             s = 0;
         }

@@ -252,26 +253,27 @@ int main(int argc, char ** argv) {
         chunks[i].tokens.clear();
     }

+    struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+
     // start loop, receive query and return top k similar chunks based on cosine similarity
     std::string query;
     while (true) {
-
+        LOG("Enter query: ");
         std::getline(std::cin, query);
-        std::vector<int32_t> query_tokens =
+        std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);

-        struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
         batch_add_seq(query_batch, query_tokens, 0);

         std::vector<float> query_emb(n_embd, 0);
         batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);

-
+        common_batch_clear(query_batch);

         // compute cosine similarities
         {
             std::vector<std::pair<int, float>> similarities;
             for (int i = 0; i < n_chunks; i++) {
-                float sim =
+                float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
                 similarities.push_back(std::make_pair(i, sim));
             }

@@ -280,19 +282,22 @@ int main(int argc, char ** argv) {
                 return a.second > b.second;
             });

-
+            LOG("Top %d similar chunks:\n", params.sparams.top_k);
             for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
-
-
-
-
-
+                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                LOG("similarity: %f\n", similarities[i].second);
+                LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                LOG("--------------------\n");
             }
         }
     }

+    LOG("\n");
+    llama_perf_context_print(ctx);
+
     // clean up
-
+    llama_batch_free(query_batch);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
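
retrieval.cpp is ported here from the old gpt_params/LOG_TEE helpers to the renamed common_* API split across arg.h, common.h and log.h. A trimmed-down sketch of that bootstrap pattern, mirroring only calls visible in the diff above (error handling shortened; not a drop-in replacement for the example):

    #include "arg.h"
    #include "common.h"
    #include "log.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        common_params params;

        // parse the example-specific CLI flags; fails (and prints usage) on bad input
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, nullptr)) {
            return 1;
        }

        common_init(); // set up the common logging/runtime state

        llama_backend_init();
        llama_numa_init(params.numa);

        // model and context are now created together
        common_init_result llama_init = common_init_from_params(params);
        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;

        if (model == NULL) {
            LOG_ERR("%s: unable to load model\n", __func__);
            return 1;
        }

        LOG_INF("%s: model and context ready\n", __func__);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }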

package/src/llama.cpp/examples/rpc/rpc-server.cpp

@@ -1,3 +1,5 @@
+#include "ggml-cpu.h"
+
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif

@@ -6,6 +8,10 @@
 #include "ggml-metal.h"
 #endif

+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 # include <windows.h>

@@ -16,7 +22,7 @@
 #include <stdio.h>

 struct rpc_server_params {
-    std::string host = "
+    std::string host = "127.0.0.1";
     int port = 50052;
     size_t backend_mem = 0;
 };

@@ -79,6 +85,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#elif GGML_USE_VULKAN
+    fprintf(stderr, "%s: using Vulkan backend\n", __func__);
+    backend = ggml_backend_vk_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
+    }
 #endif

     // if there aren't GPU Backends fallback to CPU backend

@@ -92,6 +104,8 @@ static ggml_backend_t create_backend() {
 static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_VULKAN
+    ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
 #else
 #ifdef _WIN32
     MEMORYSTATUSEX status;

@@ -114,6 +128,17 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "Invalid parameters\n");
         return 1;
     }
+
+    if (params.host != "127.0.0.1") {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+        fprintf(stderr, " Never expose the RPC server to an open network!\n");
+        fprintf(stderr, " This is an experimental feature and is not secure!\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "\n");
+    }
+
     ggml_backend_t backend = create_backend();
     if (!backend) {
         fprintf(stderr, "Failed to create backend\n");

@@ -128,7 +153,7 @@ int main(int argc, char * argv[]) {
         get_backend_memory(&free_mem, &total_mem);
     }
     printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
 }
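
The rpc-server changes add a Vulkan code path and switch the default bind address to 127.0.0.1, warning loudly when anything else is used. A minimal sketch of starting a CPU-backed RPC server on loopback, assuming the ggml headers vendored in this package (the advertised memory figure is a placeholder, not taken from the diff):

    #include <cstdio>
    #include <string>

    #include "ggml-backend.h"
    #include "ggml-cpu.h"
    #include "ggml-rpc.h"

    int main() {
        // GPU backends are chosen at compile time in create_backend(); the CPU backend is always available.
        ggml_backend_t backend = ggml_backend_cpu_init();
        if (!backend) {
            std::fprintf(stderr, "failed to create backend\n");
            return 1;
        }

        // bind to loopback only, matching the new default; never expose the server to an open network
        const std::string endpoint = "127.0.0.1:50052";
        const size_t mem = 1024u * 1024u * 1024u; // advertise 1 GiB to clients (placeholder figure)

        std::printf("Starting RPC server on %s\n", endpoint.c_str());
        ggml_backend_rpc_start_server(backend, endpoint.c_str(), mem, mem); // blocks while serving requests

        ggml_backend_free(backend);
        return 0;
    }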