@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -1,8 +1,4 @@
 find_package (Threads REQUIRED)
-find_program(GLSLC_EXECUTABLE glslc)
-if(NOT GLSLC_EXECUTABLE)
-    message(FATAL_ERROR "glslc not found.")
-endif()
 
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -325,11 +325,17 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
     string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
 
     for (const auto& tname : type_names) {
+        std::string load_vec_quant = "2";
+        if ((tname == "q4_0") || (tname == "q4_1"))
+            load_vec_quant = "8";
+        else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq4_nl"))
+            load_vec_quant = "4";
+
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
         // For unaligned, load one at a time for f32/f16, or two at a time for quants
-        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2";
+        std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : load_vec_quant;
         // For aligned matmul loads
-        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2";
+        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : load_vec_quant;
 
         // don't generate f32 variants for coopmat2
         if (!coopmat2) {
@@ -396,7 +402,7 @@ void process_shaders() {
     for (const auto& tname : type_names) {
         // mul mat vec
         std::string data_a_key = "DATA_A_" + to_uppercase(tname);
-        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
+        std::string shader = (string_ends_with(tname, "_k") || string_starts_with(tname, "iq1_") || string_starts_with(tname, "iq2_") || string_starts_with(tname, "iq3_")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
 
         string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC2", "vec2"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
         string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC2", "f16vec2"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
@@ -427,6 +433,8 @@ void process_shaders() {
     string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
@@ -477,14 +485,17 @@ void process_shaders() {
     string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
     string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
@@ -518,6 +529,8 @@ void process_shaders() {
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
 
+    string_to_spv("rwkv_wkv7_f32", "wkv7.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
     string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
 
     for (auto &c : compiles) {
package/src/llama.cpp/ggml/src/ggml.c
@@ -240,7 +240,11 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 
 
 void * ggml_aligned_malloc(size_t size) {
+#if defined(__s390x__)
+    const int alignment = 256;
+#else
     const int alignment = 64;
+#endif
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
     return _aligned_malloc(size, alignment);
@@ -561,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
 #endif
 
 }
-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
@@ -925,6 +929,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "RMS_NORM",
     "RMS_NORM_BACK",
     "GROUP_NORM",
+    "L2_NORM",
 
     "MUL_MAT",
     "MUL_MAT_ID",
@@ -973,6 +978,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ADD_REL_POS",
     "RWKV_WKV6",
     "GATED_LINEAR_ATTN",
+    "RWKV_WKV7",
 
     "UNARY",
 
@@ -992,7 +998,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1022,6 +1028,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rms_norm(x)",
     "rms_norm_back(x)",
     "group_norm(x)",
+    "l2_norm(x)",
 
     "X*Y",
     "X[i]*Y",
@@ -1070,6 +1077,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "add_rel_pos(x)",
     "rwkv_wkv6(k, v, r, tf, td, s)",
     "gated_linear_attn(k, v, q, gate, s)",
+    "rwkv_wkv7(r, w, k, v, a, b, s)",
 
     "unary(x)",
 
@@ -1089,7 +1097,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2328,6 +2336,7 @@ struct ggml_tensor * ggml_concat(
         struct ggml_tensor * b,
         int dim) {
     GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+    GGML_ASSERT(a->type == b->type);
 
     int64_t ne[GGML_MAX_DIMS];
     for (int d = 0; d < GGML_MAX_DIMS; ++d) {
@@ -2681,6 +2690,37 @@ struct ggml_tensor * ggml_group_norm_inplace(
     return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
+// ggml_l2_norm
+
+static struct ggml_tensor * ggml_l2_norm_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps,
+        bool                  inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_f32(result, 0, eps);
+
+    result->op     = GGML_OP_L2_NORM;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_l2_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_l2_norm_impl(ctx, a, eps, false);
+}
+
+struct ggml_tensor * ggml_l2_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        float                 eps) {
+    return ggml_l2_norm_impl(ctx, a, eps, true);
+}
+
 // ggml_mul_mat
 
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
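A minimal usage sketch (not part of the package) for the new ggml_l2_norm op added above, assuming the standard ggml CPU graph-building flow; in this revision ggml_graph_compute_with_ctx is declared in ggml-cpu.h:

    #include "ggml.h"
    #include "ggml-cpu.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        for (int i = 0; i < 8; ++i) {
            ((float *) a->data)[i] = (float) (i + 1);
        }

        // normalize the vector to unit L2 norm; eps guards against division by zero
        struct ggml_tensor * out = ggml_l2_norm(ctx, a, 1e-12f);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        ggml_free(ctx);
        return 0;
    }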
@@ -4715,6 +4755,54 @@ struct ggml_tensor * ggml_gated_linear_attn(
     return result;
 }
 
+// ggml_rwkv_wkv7
+
+struct ggml_tensor * ggml_rwkv_wkv7(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * r,
+        struct ggml_tensor  * w,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * state) {
+    GGML_ASSERT(ggml_is_contiguous(r));
+    GGML_ASSERT(ggml_is_contiguous(w));
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(b));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S = k->ne[0];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
+    const int64_t n_seqs = state->ne[1];
+    {
+        GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens);
+        GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens);
+        GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens);
+        GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens);
+        GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
+    }
+
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op     = GGML_OP_RWKV_WKV7;
+    result->src[0] = r;
+    result->src[1] = w;
+    result->src[2] = k;
+    result->src[3] = v;
+    result->src[4] = a;
+    result->src[5] = b;
+    result->src[6] = state;
+
+    return result;
+}
+
 // ggml_unary
 
 static struct ggml_tensor * ggml_unary_impl(
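A short sketch (illustrative sizes, not from the package) of the tensor shapes the new ggml_rwkv_wkv7 op expects, following the asserts above: r/w/k/v/a/b are [S, H, n_tokens], state holds S*S*H elements per sequence, and the result packs the per-token outputs and the updated state into a single [S*H, n_tokens + S*n_seqs] tensor:

    // hypothetical sizes for illustration only; ctx is an existing ggml_context
    static struct ggml_tensor * wkv7_example(struct ggml_context * ctx) {
        const int64_t S = 64, H = 8, n_tokens = 4, n_seqs = 1;
        struct ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
        struct ggml_tensor * w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
        struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
        struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
        struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
        struct ggml_tensor * s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S * S * H, n_seqs);
        // rows 0..n_tokens-1 of the result are the output; the rest is the new state
        return ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, s);
    }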
package/src/llama.cpp/include/llama.h
@@ -60,6 +60,7 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
+    struct llama_kv_cache;
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -105,6 +106,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
+        LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
     };
 
     enum llama_rope_type {
@@ -468,7 +470,8 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
 
     LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API struct llama_kv_cache *    llama_get_kv_self (      struct llama_context * ctx);
+    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@@ -477,6 +480,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
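A small sketch (not from the package) pairing the new llama_model_n_head_kv accessor with the existing llama_model_n_head, e.g. to detect grouped-query attention:

    // assumes model is a loaded llama_model
    static int32_t gqa_ratio(const struct llama_model * model) {
        const int32_t n_head    = llama_model_n_head   (model);
        const int32_t n_head_kv = llama_model_n_head_kv(model);
        return n_head_kv > 0 ? n_head / n_head_kv : 0; // > 1 implies GQA
    }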
@@ -584,7 +588,7 @@ extern "C" {
     // KV cache
     //
 
-    // TODO: remove llama_kv_cache_view_* API
+    // TODO: start using struct llama_kv_cache
 
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
@@ -639,13 +643,19 @@ extern "C" {
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "use llama_kv_self_n_tokens instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+            "use llama_kv_self_used_cells instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
+    LLAMA_API void llama_kv_self_clear(
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@@ -653,7 +663,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_self_seq_rm(
            struct llama_context * ctx,
            llama_seq_id seq_id,
            llama_pos p0,
@@ -663,7 +673,7 @@ extern "C" {
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_cp(
+    LLAMA_API void llama_kv_self_seq_cp(
            struct llama_context * ctx,
            llama_seq_id seq_id_src,
            llama_seq_id seq_id_dst,
@@ -671,17 +681,17 @@ extern "C" {
            llama_pos p1);
 
     // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_cache_seq_keep(
+    LLAMA_API void llama_kv_self_seq_keep(
            struct llama_context * ctx,
            llama_seq_id seq_id);
 
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
+    LLAMA_API void llama_kv_self_seq_add(
            struct llama_context * ctx,
            llama_seq_id seq_id,
            llama_pos p0,
@@ -691,10 +701,10 @@ extern "C" {
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
+    LLAMA_API void llama_kv_self_seq_div(
            struct llama_context * ctx,
            llama_seq_id seq_id,
            llama_pos p0,
@@ -702,24 +712,76 @@ extern "C" {
            int d);
 
     // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
            struct llama_context * ctx,
-           llama_seq_id seq_id);
-
-    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
-    //       how to avoid this?
+           llama_seq_id seq_id);
 
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+    //   - explicitly with llama_kv_self_update()
+    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+
+    // Check if the context supports KV cache shifting
+    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
 
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx),
+        "use llama_kv_self_clear instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1),
+        "use llama_kv_self_seq_rm instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id seq_id_src,
+            llama_seq_id seq_id_dst,
+            llama_pos p0,
+            llama_pos p1),
+        "use llama_kv_self_seq_cp instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+            llama_seq_id seq_id),
+        "use llama_kv_self_seq_keep instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            llama_pos delta),
+        "use llama_kv_self_seq_add instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d),
+        "use llama_kv_self_seq_div instead");
+
+    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+            llama_seq_id seq_id),
+        "use llama_kv_self_seq_pos_max instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
+        "use llama_kv_self_defrag instead");
+
+    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
+        "use llama_kv_self_can_shift instead");
+
+    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
+        "use llama_kv_self_update instead");
 
-    // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
 
     //
     // State / sessions
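A hedged migration sketch for downstream callers: the llama_kv_cache_* names above still compile but are now marked DEPRECATED, and each llama_kv_self_* replacement is a drop-in rename.

    #include "llama.h"

    // assumes ctx is a valid llama_context created elsewhere
    static void trim_to_sequence(struct llama_context * ctx, llama_seq_id seq) {
        // was: llama_kv_cache_seq_keep(ctx, seq);
        llama_kv_self_seq_keep(ctx, seq);

        // was: llama_kv_cache_defrag(ctx); llama_kv_cache_update(ctx);
        llama_kv_self_defrag(ctx);
        llama_kv_self_update(ctx); // applies pending K-shifts/defragmentation now
    }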
@@ -883,6 +945,10 @@ extern "C" {
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
 
+    // Set whether the model is in warmup mode or not
+    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
@@ -1203,17 +1269,29 @@ extern "C" {
             const char * grammar_str,
             const char * grammar_root);
 
-    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
-    /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
-    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
-    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
+    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
             const struct llama_vocab * vocab,
             const char * grammar_str,
             const char * grammar_root,
             const char ** trigger_words,
             size_t num_trigger_words,
             const llama_token * trigger_tokens,
-            size_t num_trigger_tokens);
+            size_t num_trigger_tokens),
+        "use llama_sampler_init_grammar_lazy_patterns instead");
+
+
+    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
+    /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group.
+    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included.
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
+            const struct llama_vocab * vocab,
+            const char * grammar_str,
+            const char * grammar_root,
+            const char ** trigger_patterns,
+            size_t num_trigger_patterns,
+            const llama_token * trigger_tokens,
+            size_t num_trigger_tokens);
+
 
     /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
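Two hedged sketches for the new context/sampler entry points above. First, the warmup toggle, following its doc comment: enable it for one throwaway decode so every tensor is touched once, then disable it for real inference (batch setup and error handling omitted):

    // assumes ctx and batch are prepared elsewhere
    static void warm_up(struct llama_context * ctx, struct llama_batch batch) {
        llama_set_warmup(ctx, true);
        llama_decode(ctx, batch);      // activates all model tensors once
        llama_set_warmup(ctx, false);
        llama_kv_self_clear(ctx);      // drop the warmup tokens from the cache
    }

Second, the pattern-based lazy grammar sampler that replaces the trigger-word variant; the grammar string and trigger pattern here are illustrative, and passing NULL/0 for the token triggers is an assumption:

    // hypothetical trigger: activate the grammar once "<tool_call>" appears
    static struct llama_sampler * make_lazy_grammar(const struct llama_vocab * vocab,
                                                    const char * grammar_str) {
        const char * patterns[] = { "<tool_call>([\\s\\S]*)" };
        return llama_sampler_init_grammar_lazy_patterns(
                vocab, grammar_str, "root",
                patterns, /*num_trigger_patterns =*/ 1,
                /*trigger_tokens =*/ NULL, /*num_trigger_tokens =*/ 0);
    }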
package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp
@@ -0,0 +1,112 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+__ggml_vocab_test__
+!!!!!!
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+Cửa Việt
+__ggml_vocab_test__
+ discards
+__ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out
@@ -0,0 +1,46 @@
+ 1165 220 19 220 27124 5503
+ 37 19194 259
+
+ 220
+ 256
+ 271
+ 197
+ 198
+ 279
+ 2499
+ 2775
+ 13225 2375
+ 32949 2375
+ 13225 5922
+ 32949 5922
+ 32949 5922 0
+ 13225 11 2375 0
+ 32949 11 2375 0
+ 495 382 9552 99 247 13 17159
+ 86 45404 220 22 10191 2852 22924 4750 6916
+ 3907 53641 1235 185386 8118
+ 11400 107516 15867 20804 22851 134178 77431 32010 104312 37984 16329 27751 89335
+ 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 350 7393 74471 484 853 1617 2316 6602 8
+ 13225
+ 32949
+ 220 32949
+ 256 32949
+ 271 32949
+ 271 32949 198 271 32949
+ 350
+ 198 314
+ 6 6837
+ 13225 11 342 70653 0 3253 553 481 22861 223 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208
+ 147475
+ 18
+ 2546
+ 15517
+ 15517 18
+ 15517 2546
+ 15517 15517
+ 15517 15517 18
+ 15517 15517 2546
+ 15517 15517 15517
+ 34 60213 53904
+ 2960 3098
+ 126470 25980 160432 16609 2775 4066 172261 19432 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 9552 99 247 4103 99 247 220 18 220 2546 220 15517 220 15517 18 220 15517 2546 220 15517 15517 220 15517 15517 18 220 15517 15517 2546 220 18 13 18 220 18 485 18 220 18 1008 18 44735 107516 15867 20804 22851 134178 77431 32010 104312 156437 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208 105024 106657 1967 53641 1235 185386 8118 22434 39336 26178 26178 168394 194663 27271 147475 25883 6961 9790 1339 461 83 1280 19016 1354 11 461 1099 481 3239 30 461 44 625 3239 17291 1520 480 11 461 35 481 1299 1236 17966 30 1416 6 27493 261 54602 43
package/src/llama.cpp/requirements/requirements-all.txt
@@ -10,3 +10,4 @@
 -r ./requirements-convert_hf_to_gguf_update.txt
 -r ./requirements-convert_legacy_llama.txt
 -r ./requirements-convert_llama_ggml_to_gguf.txt
+-r ./requirements-tool_bench.txt
package/src/llama.cpp/requirements/requirements-tool_bench.txt
@@ -0,0 +1,12 @@
+aiohttp~=3.9.3
+pytest~=8.3.3
+huggingface_hub~=0.23.2
+matplotlib~=3.10.0
+numpy~=1.26.4
+openai~=1.55.3
+pandas~=2.2.3
+prometheus-client~=0.20.0
+requests~=2.32.3
+wget~=3.2
+typer~=0.15.1
+seaborn~=0.13.2