@fugood/llama.node 1.2.0-rc.0 → 1.2.0

package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.0-rc.0",
+  "version": "1.2.0",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-arm64": "1.2.0-rc.0",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.0-rc.0",
-    "@fugood/node-llama-darwin-x64": "1.2.0-rc.0",
-    "@fugood/node-llama-darwin-arm64": "1.2.0-rc.0"
+    "@fugood/node-llama-linux-x64": "1.2.0",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.0",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.0",
+    "@fugood/node-llama-linux-arm64": "1.2.0",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.0",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.0",
+    "@fugood/node-llama-win32-x64": "1.2.0",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.0",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.0",
+    "@fugood/node-llama-win32-arm64": "1.2.0",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.0",
+    "@fugood/node-llama-darwin-x64": "1.2.0",
+    "@fugood/node-llama-darwin-arm64": "1.2.0"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -618,6 +618,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
@@ -685,11 +686,13 @@ static void parse_json_tool_calls(
     size_t from = std::string::npos;
     auto first = true;
     while (true) {
+        auto start_pos = builder.pos();
         auto res = function_regex_start_only && first
             ? builder.try_consume_regex(*function_regex_start_only)
             : function_regex
                 ? builder.try_find_regex(*function_regex, from)
                 : std::nullopt;
+
         if (res) {
             std::string name;
             if (get_function_name) {
@@ -724,6 +727,8 @@ static void parse_json_tool_calls(
                 return;
             }
             throw common_chat_msg_partial_exception("incomplete tool call");
+        } else {
+            builder.move_to(start_pos);
         }
         break;
     }
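The two hunks above capture the parser position before the speculative regex match and rewind to it when no tool call is found, so text scanned past during a failed search can still be handed back as regular content. A minimal sketch of the pattern, using a toy parser in place of common_chat_msg_parser (only pos()/move_to() mirror the real builder API; everything else is illustrative):

```cpp
#include <optional>
#include <string>

struct toy_parser {
    std::string input;
    size_t      cursor = 0;

    size_t pos() const { return cursor; }
    void   move_to(size_t p) { cursor = p; }

    // Speculative scan: searches forward for `lit` and leaves the cursor at
    // the end of the scan even when nothing was found.
    std::optional<size_t> try_find_literal(const std::string & lit) {
        size_t hit = input.find(lit, cursor);
        if (hit == std::string::npos) {
            cursor = input.size(); // consumed everything while scanning
            return std::nullopt;
        }
        cursor = hit + lit.size();
        return cursor;
    }
};

// Without the rewind, a failed scan would swallow the remaining text and it
// could never be re-emitted as plain content.
void demo(toy_parser & p) {
    auto start_pos = p.pos();
    if (!p.try_find_literal("<tool_call>")) {
        p.move_to(start_pos); // hand the unmatched text back to the caller
    }
}
```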
@@ -1374,6 +1379,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for DeepSeek V3.1 template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    auto prompt = apply(tmpl, inputs,
+                        /* messages_override= */ inputs.messages,
+                        /* tools_override= */ std::nullopt,
+                        additional_context);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "( \"<｜tool▁call▁begin｜>\" )? \"" + name + "<｜tool▁sep｜>"
+                    "\" " + builder.add_schema(name + "-args", parameters) + " "
+                    "\"<｜tool▁call▁end｜>\""));
+            });
+            // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+            // so we accept common variants (then it's all constrained)
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "( \"<｜tool▁calls▁begin｜>\" | \"<｜tool_calls_begin｜>\" | \"<｜tool calls begin｜>\" | \"<｜tool\\\\_calls\\\\_begin｜>\" | \"<｜tool▁calls｜>\" ) "
+                "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+                "\"<｜tool▁calls▁end｜>\""
+                " space");
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<｜tool▁calls▁begin｜>|<｜tool_calls_begin｜>|<｜tool calls begin｜>|<｜tool\\\\_calls\\\\_begin｜>|<｜tool▁calls｜>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<｜tool▁calls▁begin｜>",
+                "<｜tool▁call▁begin｜>",
+                "<｜tool▁sep｜>",
+                "<｜tool▁call▁end｜>",
+                "<｜tool▁calls▁end｜>",
+            };
+        });
+    }
+    return data;
+}
+
 static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     if (!builder.syntax().parse_tool_calls) {
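The prompt-suffix handling above is the crux of thinking_forced_open: when the rendered template ends in a dangling `<think>`, the model starts out inside a reasoning block. A standalone sketch of just that decision (string_ends_with mirrors the helper used in the hunk; the struct is illustrative, not the real common_chat_params):

```cpp
#include <string>

static bool string_ends_with(const std::string & s, const std::string & suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

struct thinking_state {
    std::string prompt;
    bool        forced_open = false;
};

// If the template left a dangling "<think>" at the end of the prompt, either
// close it immediately (thinking disabled) or remember that generation will
// begin inside a reasoning block (thinking enabled).
static thinking_state resolve_thinking(std::string prompt, bool enable_thinking) {
    thinking_state st { std::move(prompt) };
    if (string_ends_with(st.prompt, "<think>")) {
        if (!enable_thinking) {
            st.prompt += "</think>";
        } else {
            st.forced_open = true;
        }
    }
    return st;
}
```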
@@ -1395,6 +1465,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<｜tool▁call▁begin｜>)?([^\\n<]+)(?:<｜tool▁sep｜>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<｜tool▁call▁end｜>");
+    static const common_regex tool_calls_begin("(?:<｜tool▁calls▁begin｜>|<｜tool_calls_begin｜>|<｜tool calls begin｜>|<｜tool\\\\_calls\\\\_begin｜>|<｜tool▁calls｜>)");
+    static const common_regex tool_calls_end("<｜tool▁calls▁end｜>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    parse_json_tool_calls(
+        builder,
+        /* block_open= */ tool_calls_begin,
+        /* function_regex_start_only= */ std::nullopt,
+        function_regex,
+        close_regex,
+        tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        // </think><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>NAME\n```json\nJSON\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜>
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_deepseek_v3_1_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            // <｜tool▁call▁begin｜>NAME<｜tool▁sep｜>JSON<｜tool▁call▁end｜>
+            common_chat_parse_deepseek_v3_1_content(builder);
+        }
+    }
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
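A hedged usage sketch of the new parser via the public entry point in common/chat.h (common_chat_parse and the common_chat_syntax fields below reflect the upstream llama.cpp API of this period; exact field names can drift between versions):

```cpp
#include "chat.h" // common/chat.h in llama.cpp

common_chat_msg parse_v3_1_example() {
    common_chat_syntax syntax;
    syntax.format               = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
    syntax.reasoning_format     = COMMON_REASONING_FORMAT_DEEPSEEK;
    syntax.thinking_forced_open = true; // the rendered prompt ended with "<think>"

    // Reasoning first, then a regular answer. With tool calls, the text after
    // "</think>" would instead start with "<｜tool▁calls▁begin｜>".
    const std::string output =
        "The user asked for the capital of France.</think>Paris.";

    // Splits the output into msg.reasoning_content and msg.content.
    return common_chat_parse(output, /* is_partial= */ false, syntax);
}
```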
@@ -2351,6 +2481,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         }
     }
 
+    // DeepSeek V3.1: detect based on specific patterns in the template
+    if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+        params.json_schema.is_null()) {
+        return common_chat_params_init_deepseek_v3_1(tmpl, params);
+    }
+
     // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
     if (src.find("<｜tool▁calls▁begin｜>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2523,6 +2659,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
             common_chat_parse_deepseek_r1(builder);
             break;
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+            common_chat_parse_deepseek_v3_1(builder);
+            break;
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
             common_chat_parse_functionary_v3_2(builder);
             break;
@@ -118,6 +118,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
    COMMON_CHAT_FORMAT_GRANITE,
@@ -843,9 +843,10 @@ public:
                 _build_object_rule(
                     properties, required, name,
                     schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
             std::string hybrid_name = name;
             std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                 if (comp_schema.contains("$ref")) {
@@ -857,6 +858,14 @@ public:
                             required.insert(prop.key());
                         }
                     }
+                } else if (comp_schema.contains("enum")) {
+                    for (const auto & v : comp_schema["enum"]) {
+                        const auto rule = _generate_constant_rule(v);
+                        if (enum_values.find(rule) == enum_values.end()) {
+                            enum_values[rule] = 0;
+                        }
+                        enum_values[rule] += 1;
+                    }
                 } else {
                     // todo warning
                 }
@@ -870,6 +879,17 @@ public:
                     add_component(t, true);
                 }
             }
+            if (!enum_values.empty()) {
+                std::vector<std::string> enum_intersection;
+                for (const auto & p : enum_values) {
+                    if (p.second == schema["allOf"].size()) {
+                        enum_intersection.push_back(p.first);
+                    }
+                }
+                if (!enum_intersection.empty()) {
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                }
+            }
             return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
         } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
             json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
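The three hunks above extend the allOf handling: enum constants are tallied per component, and when every allOf component agreed on some constants, the schema collapses to an alternation over that intersection instead of an object rule. For example, `{"allOf": [{"enum": ["a", "b"]}, {"enum": ["b", "c"]}]}` keeps only `"b"`. A standalone sketch of the counting logic using nlohmann::json (the `json` type in this file); `enum_intersection` here is a hypothetical helper name, and `v.dump()` stands in for `_generate_constant_rule(v)`:

```cpp
#include <map>
#include <string>
#include <vector>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Returns the serialized constants present in *every* allOf component,
// mirroring the enum_values map in the hunk above.
std::vector<std::string> enum_intersection(const json & all_of) {
    std::map<std::string, size_t> counts; // serialized constant -> occurrence count
    for (const auto & comp : all_of) {
        if (!comp.contains("enum")) {
            continue; // this component never increments, so it vetoes the shortcut
        }
        for (const auto & v : comp["enum"]) {
            counts[v.dump()] += 1;
        }
    }
    std::vector<std::string> result;
    for (const auto & [constant, n] : counts) {
        if (n == all_of.size()) { // seen in every component, as in the hunk
            result.push_back(constant);
        }
    }
    return result;
}
```

Note that, as in the hunk, a component without an enum keeps every count below the allOf size, so the intersection comes out empty and the object rule is used instead.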
@@ -132,6 +132,8 @@ extern "C" {
        GGML_BACKEND_DEVICE_TYPE_CPU,
        // GPU device using dedicated memory
        GGML_BACKEND_DEVICE_TYPE_GPU,
+       // integrated GPU device using host memory
+       GGML_BACKEND_DEVICE_TYPE_IGPU,
        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
        GGML_BACKEND_DEVICE_TYPE_ACCEL
    };
@@ -150,11 +152,21 @@ extern "C" {
 
    // all the device properties
    struct ggml_backend_dev_props {
+       // device name
        const char * name;
+       // device description
        const char * description;
+       // device free memory in bytes
        size_t memory_free;
+       // device total memory in bytes
        size_t memory_total;
+       // device type
        enum ggml_backend_dev_type type;
+       // device id
+       // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+       // if the id is unknown, this should be NULL
+       const char * device_id;
+       // device capabilities
        struct ggml_backend_dev_caps caps;
    };
 
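A short sketch printing the extended properties for every registered device (ggml_backend_dev_count, ggml_backend_dev_get and ggml_backend_dev_get_props are existing ggml-backend API; only the loop itself is illustrative):

```cpp
#include <cstdio>
#include "ggml-backend.h"

static void list_devices(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        // device_id may be NULL when the id is unknown, per the comment above
        printf("%s (%s): type=%d id=%s free=%zu total=%zu\n",
               props.name, props.description, (int) props.type,
               props.device_id ? props.device_id : "(unknown)",
               props.memory_free, props.memory_total);
    }
}
```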
@@ -134,6 +134,7 @@ extern "C" {
    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
+   GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
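A hedged sketch of the new conversion helper: like the other ggml_cpu_fp32_to_* routines it converts n contiguous elements, and per the note added to ggml.h further down, the fractional part is discarded (C truncation semantics):

```cpp
#include <cstdint>
#include <cstdio>
#include "ggml-cpu.h"

int main() {
    const float src[4] = { 1.9f, -2.5f, 0.1f, 7.0f };
    int32_t     dst[4];

    // truncates toward zero: 1, -2, 0, 7
    ggml_cpu_fp32_to_i32(src, dst, 4);

    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}
```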
@@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
-GGML_DEPRECATED(
-        GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-        "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
-
 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
@@ -1404,6 +1404,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
 
+   // note: casting from f32 to i32 will discard the fractional part
    GGML_API struct ggml_tensor * ggml_cast(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
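A minimal graph exercising the documented cast (illustrative sizes; ggml_init, ggml_cast, ggml_new_graph and ggml_graph_compute_with_ctx are existing API, and the f32 -> i32 path on CPU is what the new ggml_cpu_fp32_to_i32 above plausibly backs):

```cpp
#include <cstdio>
#include "ggml.h"
#include "ggml-cpu.h"

int main() {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    float * ad = (float *) a->data;
    ad[0] = 3.7f; ad[1] = -1.2f; ad[2] = 0.5f; ad[3] = 2.0f;

    // per the note above, the fractional part is discarded: 3, -1, 0, 2
    struct ggml_tensor * b = ggml_cast(ctx, a, GGML_TYPE_I32);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, b);
    ggml_graph_compute_with_ctx(ctx, gf, /* n_threads = */ 1);

    const int32_t * bd = (const int32_t *) b->data;
    printf("%d %d %d %d\n", bd[0], bd[1], bd[2], bd[3]);

    ggml_free(ctx);
    return 0;
}
```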
@@ -1528,7 +1529,11 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
 
-   // supports 3D: a->ne[2] == b->ne[1]
+   // supports 4D a:
+   //      a      [n_embd, ne1, ne2, ne3]
+   //      b      I32 [n_rows, ne2, ne3, 1]
+   //
+   //      return [n_embd, n_rows, ne2, ne3]
    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a, // data
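A shape-only sketch of the widened contract (tensor sizes are made up; the ne layout follows the comment above):

```cpp
#include "ggml.h"

// gather 4 rows from each of the 2*3 inner matrices of a
struct ggml_tensor * gather_rows_4d(struct ggml_context * ctx) {
    // a: [n_embd = 8, ne1 = 10, ne2 = 2, ne3 = 3]
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 10, 2, 3);
    // b: I32 [n_rows = 4, ne2 = 2, ne3 = 3, 1] -- row indices per (ne2, ne3) slice
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_I32, 4, 2, 3, 1);
    // result: [n_embd = 8, n_rows = 4, ne2 = 2, ne3 = 3]
    return ggml_get_rows(ctx, a, b);
}
```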
@@ -224,7 +224,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
                string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
                if (NOT ${feature_pos} EQUAL -1)
-                   message(STATUS "ARM feature ${feature} enabled")
+                   # Special handling for MATMUL_INT8 when machine doesn't support i8mm
+                   if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
+                       message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
+                       list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
+                   else()
+                       message(STATUS "ARM feature ${feature} enabled")
+                   endif()
                endif()
            endforeach()
        endif()
@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
-       __vector float srcv [8];
-       __vector float asrcv[8];
-       __vector float amaxv[8];
+       float32x4_t srcv [8];
+       float32x4_t asrcv[8];
+       float32x4_t amaxv[8];
 
        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
        for (int j = 0; j < 8; j++) {
-           const __vector float v = vec_mul(srcv[j], vec_splats(id));
-           const __vector int32_t vi = vec_signed(v);
+           const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+           const int32x4_t vi = vec_signed(v);
 
            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
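The renames in this file read naturally if NEON-style aliases exist for the z/Architecture vector types; a plausible set of typedefs, written as an assumption for illustration (the real definitions live in the backend's shared headers, not in this file):

```cpp
#include <stdint.h>
#include <vecintrin.h> // z/Architecture vector intrinsics (requires -mzvector)

typedef __vector float    float32x4_t;
typedef __vector int32_t  int32x4_t;
typedef __vector int16_t  int16x8_t;
typedef __vector int8_t   int8x16_t;
typedef __vector uint8_t  uint8x16_t;
```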
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
-       __vector float srcv [8];
-       __vector float asrcv[8];
-       __vector float amaxv[8];
+       float32x4_t srcv [8];
+       float32x4_t asrcv[8];
+       float32x4_t amaxv[8];
 
        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
-       __vector int32_t acc = vec_splats(0);
+       int32x4_t acc = vec_splats(0);
 
        for (int j = 0; j < 8; j++) {
-           const __vector float v = vec_mul(srcv[j], vec_splats(id));
-           const __vector int32_t vi = vec_signed(v);
+           const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+           const int32x4_t vi = vec_signed(v);
 
            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    float sumf = 0;
 
 #if defined(__VXE__) || defined(__VXE2__)
-   __vector float acc = vec_splats(0.0f);
+   float32x4_t acc = vec_splats(0.0f);
 
-   const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
-   const __vector int8_t  v_s = vec_splats( (const int8_t)0x08);
+   const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
+   const int8x16_t  v_s = vec_splats( (const int8_t)0x08);
 
    for (; ib < nb; ++ib) {
-       const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
-       const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
-       const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+       const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+       const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+       const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
 
-       const __vector int8_t v_xls = vec_sub(v_xl, v_s);
-       const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+       const int8x16_t v_xls = vec_sub(v_xl, v_s);
+       const int8x16_t v_xhs = vec_sub(v_xh, v_s);
 
-       const __vector int8_t v_yl = vec_xl(0      , y[ib].qs);
-       const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+       const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
+       const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
 
-       const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
-       const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
-       const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
-       const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+       const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
+       const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
+       const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
+       const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
 
-       __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+       int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
 
-       const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
-       const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
+       const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
+       const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
 
        acc = vec_madd(v_xy, v_d, acc);
    }
 
-   sumf = acc[0] + acc[1] + acc[2] + acc[3];
-
+   sumf = vec_hsum_f32x4(acc);
    *s = sumf;
 #else
    UNUSED(nb);
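From here on, every horizontal reduction funnels through vec_hsum_f32x4/vec_hsum_i32x4 instead of spelling out the four-lane sum. A plausible shape for these helpers, assuming the typedefs sketched earlier (per-lane indexing on vector types is the same GCC extension the removed code relied on):

```cpp
static inline float vec_hsum_f32x4(float32x4_t v) {
    return v[0] + v[1] + v[2] + v[3];
}

static inline int32_t vec_hsum_i32x4(int32x4_t v) {
    return v[0] + v[1] + v[2] + v[3];
}
```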
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        acc = vec_madd(v_xy, v_d, acc);
    }
 
-   sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
-
+   sumf = vec_hsum_f32x4(acc) + summs;
    *s = sumf;
 #else
    UNUSED(nb);
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
    }
 
-   sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
+   sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
 
    #pragma GCC unroll 4
    for (; ib < nb; ++ib) {
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
 
-       sumf += vec_hsum(v_acc);
+       sumf += vec_hsum_f32x4(v_acc);
    }
 
    *s = sumf;
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
    }
 
-   sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
+   sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
 
    #pragma GCC unroll 4
    for (; ib < nb; ++ib) {
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
 
-       sumf += vec_hsum(v_acc) + summs;
+       sumf += vec_hsum_f32x4(v_acc) + summs;
    }
 
    *s = sumf;
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    float sumf = 0;
 
 #if defined(__VXE__) || defined(__VXE2__)
-   __vector float acc = vec_splats(0.0f);
+   float32x4_t acc = vec_splats(0.0f);
 
    #pragma GCC unroll 8
    for (; ib < nb; ++ib) {
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        acc = vec_madd(v_xy, v_d, acc);
    }
 
-   sumf = acc[0] + acc[1] + acc[2] + acc[3];
+   sumf = vec_hsum_f32x4(acc);
 
    *s = sumf;
 #else
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
 
-           isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
-           isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
-           isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
-           isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+           isum += vec_hsum_i32x4(isum0) * scale[0];
+           isum += vec_hsum_i32x4(isum1) * scale[1];
+           isum += vec_hsum_i32x4(isum2) * scale[2];
+           isum += vec_hsum_i32x4(isum3) * scale[3];
 
            scale += 4;
 
@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
 
            const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-           sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
+           sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
 
            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
 
            const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-           sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
+           sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
        }
 
        sumf += d * (sumi1 + sumi2);
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
        const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
        const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
-       const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+       const int32_t mins = vec_hsum_i32x4(v_mins);
 
        const uint8_t * scales = (const uint8_t *)utmp;
        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
            int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
 
-           sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
-           sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
+           sumi += vec_hsum_i32x4(sumi0) * *scales++;
+           sumi += vec_hsum_i32x4(sumi1) * *scales++;
        }
 
        sumf += d * sumi - dmin * mins;
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
        const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
 
-       const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+       const int32_t mins = vec_hsum_i32x4(v_mins);
 
        int32_t isum = 0;
        for (int j = 0; j < QK_K/128; ++j) {
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
 
-           isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
-                   (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
-                   (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
-                   (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+           isum += vec_hsum_i32x4(summs0) * scale[0] +
+                   vec_hsum_i32x4(summs1) * scale[1] +
+                   vec_hsum_i32x4(summs2) * scale[2] +
+                   vec_hsum_i32x4(summs3) * scale[3];
 
            scale += 4;
 
@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
 
-           isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
-                   (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
-                   (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
-                   (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+           isum += vec_hsum_i32x4(summs0) * scale[0] +
+                   vec_hsum_i32x4(summs1) * scale[1] +
+                   vec_hsum_i32x4(summs2) * scale[2] +
+                   vec_hsum_i32x4(summs3) * scale[3];
 
            scale += 4;
        }
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
 
-       sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
+       sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
    }
 
    *s = sumf;
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
            h >>= 4;
 
-           sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
-           sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+           sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
+           sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
        }
 
        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);