@fugood/llama.node 1.3.0-rc.6 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/CMakeLists.txt +12 -2
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +8 -9
  4. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  5. package/src/llama.cpp/common/arg.cpp +39 -1001
  6. package/src/llama.cpp/common/arg.h +2 -2
  7. package/src/llama.cpp/common/chat.cpp +216 -2
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +33 -0
  10. package/src/llama.cpp/common/common.h +13 -0
  11. package/src/llama.cpp/common/download.cpp +1054 -0
  12. package/src/llama.cpp/common/download.h +55 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
  14. package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
  15. package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  16. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  27. package/src/llama.cpp/include/llama.h +7 -3
  28. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  29. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  30. package/src/llama.cpp/src/llama-arch.h +11 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  32. package/src/llama.cpp/src/llama-batch.h +12 -1
  33. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  34. package/src/llama.cpp/src/llama-chat.h +1 -0
  35. package/src/llama.cpp/src/llama-context.cpp +44 -16
  36. package/src/llama.cpp/src/llama-context.h +5 -5
  37. package/src/llama.cpp/src/llama-cparams.h +1 -0
  38. package/src/llama.cpp/src/llama-graph.cpp +12 -7
  39. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  40. package/src/llama.cpp/src/llama-hparams.h +6 -0
  41. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  42. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
  43. package/src/llama.cpp/src/llama-kv-cache.h +2 -4
  44. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  45. package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
  46. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  47. package/src/llama.cpp/src/llama-model.cpp +350 -13194
  48. package/src/llama.cpp/src/llama-model.h +9 -2
  49. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  50. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  51. package/src/llama.cpp/src/llama-vocab.h +1 -0
  52. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  53. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  54. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  55. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  56. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  57. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  58. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  59. package/src/llama.cpp/src/models/bert.cpp +176 -0
  60. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  61. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  62. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  63. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  64. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  65. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  66. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  67. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  68. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  69. package/src/llama.cpp/src/models/deci.cpp +135 -0
  70. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  71. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  72. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  73. package/src/llama.cpp/src/models/dream.cpp +105 -0
  74. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  75. package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
  76. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  77. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  78. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  79. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  80. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  81. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  82. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  83. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  84. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  85. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  86. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  87. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  88. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  89. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  90. package/src/llama.cpp/src/models/granite.cpp +211 -0
  91. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  92. package/src/llama.cpp/src/models/grok.cpp +159 -0
  93. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  94. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  95. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  96. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  97. package/src/llama.cpp/src/models/jais.cpp +86 -0
  98. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  99. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  100. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  101. package/src/llama.cpp/src/models/llada.cpp +99 -0
  102. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  103. package/src/llama.cpp/src/models/llama.cpp +155 -0
  104. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  105. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  106. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  107. package/src/llama.cpp/src/models/models.h +481 -0
  108. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  109. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  110. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  111. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  112. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  113. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  114. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  115. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
  116. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  117. package/src/llama.cpp/src/models/orion.cpp +123 -0
  118. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  119. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  120. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  121. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  122. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  123. package/src/llama.cpp/src/models/plm.cpp +168 -0
  124. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  125. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  126. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  127. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  128. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  129. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  130. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  131. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  132. package/src/llama.cpp/src/models/refact.cpp +94 -0
  133. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  134. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  135. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  136. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  137. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  138. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  139. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  140. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  141. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  142. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  143. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  144. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  145. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  146. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  147. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/common/download.h (new file)
@@ -0,0 +1,55 @@
+ #pragma once
+
+ #include <string>
+
+ struct common_params_model;
+
+ //
+ // download functionalities
+ //
+
+ struct common_cached_model_info {
+     std::string manifest_path;
+     std::string user;
+     std::string model;
+     std::string tag;
+     size_t size = 0; // GGUF size in bytes
+     std::string to_string() const {
+         return user + "/" + model + ":" + tag;
+     }
+ };
+
+ struct common_hf_file_res {
+     std::string repo; // repo name with ":tag" removed
+     std::string ggufFile;
+     std::string mmprojFile;
+ };
+
+ /**
+  * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+  * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+  *
+  * Return pair of <repo, file> (with "repo" already having tag removed)
+  *
+  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+  */
+ common_hf_file_res common_get_hf_file(
+     const std::string & hf_repo_with_tag,
+     const std::string & bearer_token,
+     bool offline);
+
+ // returns true if download succeeded
+ bool common_download_model(
+     const common_params_model & model,
+     const std::string & bearer_token,
+     bool offline);
+
+ // returns list of cached models
+ std::vector<common_cached_model_info> common_list_cached_models();
+
+ // resolve and download model from Docker registry
+ // return local path to downloaded model file
+ std::string common_docker_resolve_model(const std::string & docker);
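For orientation, a minimal usage sketch of the new header (not part of the package; it assumes the program is compiled and linked against llama.cpp's common library so the implementations in download.cpp are available, and the example repo string is taken from the doc comment above):

    #include <cstdio>
    #include "download.h"

    int main() {
        // resolve an ollama-style "repo:tag" to a concrete GGUF file via the HF API
        common_hf_file_res res = common_get_hf_file(
            "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",
            /* bearer_token */ "",
            /* offline      */ false);
        std::printf("repo: %s  file: %s\n", res.repo.c_str(), res.ggufFile.c_str());

        // enumerate models already present in the local cache
        for (const common_cached_model_info & m : common_list_cached_models()) {
            std::printf("cached: %s (%zu bytes)\n", m.to_string().c_str(), m.size);
        }
        return 0;
    }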
package/src/llama.cpp/common/json-schema-to-grammar.cpp
@@ -601,7 +601,10 @@ private:
      }

      std::string _resolve_ref(const std::string & ref) {
-         std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
+         auto it = ref.find('#');
+         std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+         static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+         std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
          if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
              _refs_being_resolved.insert(ref);
              json resolved = _refs[ref];
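The old rule name kept only the text after the last '/', so two different JSON-pointer refs ending in the same token (for example an array index) collapsed to the same rule name. The new code derives the name from the whole '#' fragment and sanitizes it. A standalone illustration of the sanitization, using only the standard library (not taken from the package):

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        const std::string ref = "#/definitions/foo.bar/0";
        auto it = ref.find('#');
        std::string fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
        static const std::regex nonalphanumeric(R"([^a-zA-Z0-9-]+)");
        // old naming: ref.substr(ref.find_last_of('/') + 1) -> "0"
        // new naming below                                  -> "ref-definitions-foo-bar-0"
        std::cout << "ref" + std::regex_replace(fragment, nonalphanumeric, "-") << "\n";
        return 0;
    }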
@@ -774,11 +777,24 @@ public:
          std::vector<std::string> tokens = string_split(pointer, "/");
          for (size_t i = 1; i < tokens.size(); ++i) {
              std::string sel = tokens[i];
-             if (target.is_null() || !target.contains(sel)) {
+             if (target.is_object() && target.contains(sel)) {
+                 target = target[sel];
+             } else if (target.is_array()) {
+                 size_t sel_index;
+                 try {
+                     sel_index = std::stoul(sel);
+                 } catch (const std::invalid_argument & e) {
+                     sel_index = target.size();
+                 }
+                 if (sel_index >= target.size()) {
+                     _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                     return;
+                 }
+                 target = target[sel_index];
+             } else {
                  _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                  return;
              }
-             target = target[sel];
          }
          _refs[ref] = target;
      }
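This JSON-pointer walker previously only descended into objects, so a $ref pointing into an array (for example "#/prefixItems/1") produced a resolution error. A standalone sketch of the new behavior (not from the package), assuming nlohmann/json, the JSON library this file already uses:

    #include <iostream>
    #include <string>
    #include <vector>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        json schema = json::parse(R"({
            "prefixItems": [ { "type": "string" }, { "type": "integer" } ]
        })");

        json target = schema;
        std::vector<std::string> tokens = {"prefixItems", "1"}; // i.e. "#/prefixItems/1"
        for (const std::string & sel : tokens) {
            if (target.is_object() && target.contains(sel)) {
                target = target[sel];
            } else if (target.is_array()) {
                target = target[std::stoul(sel)];   // numeric tokens now index arrays
            } else {
                std::cerr << "cannot resolve " << sel << " in " << target.dump() << "\n";
                return 1;
            }
        }
        std::cout << target.dump() << "\n";         // prints {"type":"integer"}
        return 0;
    }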
package/src/llama.cpp/ggml/CMakeLists.txt
@@ -168,7 +168,7 @@ option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
  option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
  option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
- option(GGML_VXE "ggml: enable vxe" ON)
+ option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})

  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
  set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
  set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
       "gmml: OpenCL API version to target")

+ option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
+
  # toolchain for vulkan-shaders-gen
  set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

package/src/llama.cpp/ggml/include/ggml-hexagon.h (new file)
@@ -0,0 +1,19 @@
+ #pragma once
+
+ #include "ggml.h"
+ #include "ggml-backend.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ // backend API
+ GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);
+
+ GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);
+
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);
+
+ #ifdef __cplusplus
+ }
+ #endif
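A minimal sketch of how the three declarations fit together (not from the package; it assumes the build was configured with GGML_HEXAGON=ON and that ggml_backend_hexagon_init() returns nullptr when no Hexagon device is available, which the header itself does not state):

    #include <cstdio>
    #include "ggml-hexagon.h"

    int main() {
        ggml_backend_t backend = ggml_backend_hexagon_init();
        if (backend == nullptr) {
            std::fprintf(stderr, "Hexagon backend unavailable\n");
            return 1;
        }
        if (ggml_backend_is_hexagon(backend)) {
            std::printf("backend: %s\n", ggml_backend_name(backend));
        }
        ggml_backend_free(backend); // regular ggml-backend lifetime management
        return 0;
    }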
package/src/llama.cpp/ggml/include/ggml.h
@@ -242,6 +242,7 @@
  #define GGML_ROPE_TYPE_NEOX 2
  #define GGML_ROPE_TYPE_MROPE 8
  #define GGML_ROPE_TYPE_VISION 24
+ #define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000

  #define GGML_MROPE_SECTIONS 4

@@ -2107,6 +2108,7 @@ extern "C" {
      enum ggml_scale_mode {
          GGML_SCALE_MODE_NEAREST = 0,
          GGML_SCALE_MODE_BILINEAR = 1,
+         GGML_SCALE_MODE_BICUBIC = 2,

          GGML_SCALE_MODE_COUNT
      };
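A tiny standalone check (not from the patch) of how the new RoPE-type value composes: like GGML_ROPE_TYPE_VISION (24 = 8 | 16), the new IMROPE value keeps the M-RoPE bit set and adds a further bit, presumably marking the interleaved M-RoPE layout used by the new qwen3vl model files listed above. The constants are mirrored locally so the snippet is self-contained:

    // values mirrored from the ggml.h hunk above
    constexpr int ROPE_TYPE_MROPE  = 8;
    constexpr int ROPE_TYPE_VISION = 24;
    constexpr int ROPE_TYPE_IMROPE = 40;

    static_assert(ROPE_TYPE_IMROPE == (ROPE_TYPE_MROPE | 32), "40 == 0b101000");
    static_assert((ROPE_TYPE_VISION & ROPE_TYPE_MROPE) != 0, "VISION keeps the M-RoPE bit");
    static_assert((ROPE_TYPE_IMROPE & ROPE_TYPE_MROPE) != 0, "IMROPE keeps the M-RoPE bit");

    int main() { return 0; }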
package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -308,6 +308,10 @@ function(ggml_add_cpu_backend_variant tag_name)
              set(GGML_INTERNAL_${feat} ON)
          endforeach()
      elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+         foreach (feat VXE2 NNPA)
+             set(GGML_INTERNAL_${feat} OFF)
+         endforeach()
+
          foreach (feat ${ARGN})
              set(GGML_INTERNAL_${feat} ON)
          endforeach()
@@ -377,9 +381,8 @@ if (GGML_CPU_ALL_VARIANTS)
          endif()
      elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
          if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-             ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
-             # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
-             # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+             ggml_add_cpu_backend_variant(z15 Z15 VXE2)
+             ggml_add_cpu_backend_variant(z16 Z16 VXE2 NNPA)
          else()
              message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
          endif()
@@ -402,6 +405,7 @@ ggml_add_backend(Vulkan)
  ggml_add_backend(WebGPU)
  ggml_add_backend(zDNN)
  ggml_add_backend(OpenCL)
+ ggml_add_backend(Hexagon)

  foreach (target ggml-base ggml)
      target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
@@ -504,11 +504,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
          endforeach()
      endif()

-     if (GGML_VXE OR GGML_INTERNAL_VXE)
-         message(STATUS "VX/VXE/VXE2 enabled")
+     if (GGML_VXE OR GGML_INTERNAL_VXE2)
+         message(STATUS "VXE2 enabled")
          list(APPEND ARCH_FLAGS -mvx -mzvector)
-         list(APPEND ARCH_DEFINITIONS GGML_VXE)
+         list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
      endif()
+
+     if (GGML_INTERNAL_NNPA)
+         message(STATUS "NNPA enabled")
+         list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
+     endif()
+
+     ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
  elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
      message(STATUS "Wasm detected")
      list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c
@@ -700,7 +700,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
      for (; ib + 1 < nb; ib += 2) {

          // Compute combined scale for the block 0 and 1
-         const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
+         const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+         const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};

          const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);

@@ -714,11 +715,9 @@
          bx_1 = __lsx_vsub_b(bx_1, off);
          const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

-         //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-         //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
          // Compute combined scale for the block 2 and 3
-         const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
+         const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
+         const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};

          const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);

package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c
@@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
          const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
          uint8_t *patmp = atmp;
          int vsums;
-         int tmp;
+         int tmp, t1, t2, t3, t4, t5, t6, t7;
          __asm__ __volatile__(
              "vsetivli zero, 16, e8, m1\n\t"
              "vmv.v.x v8, zero\n\t"
+             "lb zero, 15(%[sc])\n\t"
              "vle8.v v1, (%[sc])\n\t"
+             "vle8.v v2, (%[bsums])\n\t"
+             "addi %[tmp], %[bsums], 16\n\t"
              "vand.vi v0, v1, 0xF\n\t"
              "vsrl.vi v1, v1, 4\n\t"
+             "vle8.v v3, (%[tmp])\n\t"
              "vse8.v v0, (%[scale])\n\t"
              "vsetivli zero, 16, e16, m2\n\t"
-             "vle16.v v2, (%[bsums])\n\t"
              "vzext.vf2 v0, v1\n\t"
              "vwmul.vv v4, v0, v2\n\t"
              "vsetivli zero, 16, e32, m4\n\t"
@@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

          for (int j = 0; j < QK_K/128; ++j) {
              __asm__ __volatile__(
-                 "vsetvli zero, %[vl32], e8, m2\n\t"
+                 "lb zero, 31(%[q2])\n\t"
+                 "addi %[tmp], %[q2], 16\n\t"
+                 "addi %[t1], %[q8], 16\n\t"
+                 "vsetivli zero, 16, e8, m1\n\t"
                  "vle8.v v0, (%[q2])\n\t"
+                 "vle8.v v1, (%[tmp])\n\t"
                  "vsrl.vi v2, v0, 2\n\t"
+                 "vsrl.vi v3, v1, 2\n\t"
                  "vsrl.vi v4, v0, 4\n\t"
+                 "addi %[tmp], %[q8], 32\n\t"
+                 "vle8.v v8, (%[q8])\n\t"
+                 "vle8.v v9, (%[t1])\n\t"
+                 "addi %[t1], %[t1], 32\n\t"
+                 "vsrl.vi v5, v1, 4\n\t"
                  "vsrl.vi v6, v0, 6\n\t"
+                 "vsrl.vi v7, v1, 6\n\t"
+                 "vle8.v v10, (%[tmp])\n\t"
+                 "vle8.v v11, (%[t1])\n\t"
+                 "addi %[tmp], %[tmp], 32\n\t"
+                 "addi %[t1], %[t1], 32\n\t"
                  "vand.vi v0, v0, 0x3\n\t"
+                 "vand.vi v1, v1, 0x3\n\t"
                  "vand.vi v2, v2, 0x3\n\t"
+                 "vle8.v v12, (%[tmp])\n\t"
+                 "vle8.v v13, (%[t1])\n\t"
+                 "addi %[tmp], %[tmp], 32\n\t"
+                 "addi %[t1], %[t1], 32\n\t"
+                 "vand.vi v3, v3, 0x3\n\t"
                  "vand.vi v4, v4, 0x3\n\t"
-                 "vsetvli zero, %[vl128], e8, m8\n\t"
-                 "vle8.v v8, (%[q8])\n\t"
-                 "vsetvli zero, %[vl64], e8, m4\n\t"
+                 "vand.vi v5, v5, 0x3\n\t"
+                 "vle8.v v14, (%[tmp])\n\t"
+                 "vle8.v v15, (%[t1])\n\t"
                  "vwmul.vv v16, v0, v8\n\t"
+                 "vwmul.vv v18, v1, v9\n\t"
+                 "vwmul.vv v20, v2, v10\n\t"
+                 "vwmul.vv v22, v3, v11\n\t"
                  "vwmul.vv v24, v4, v12\n\t"
-                 "vsetivli zero, 16, e16, m2\n\t"
+                 "vwmul.vv v26, v5, v13\n\t"
+                 "vwmul.vv v28, v6, v14\n\t"
+                 "vwmul.vv v30, v7, v15\n\t"
+                 "vsetivli zero, 8, e16, m1\n\t"
                  "vmv.v.x v0, zero\n\t"
-                 "vwredsum.vs v10, v16, v0\n\t"
+                 "lbu %[tmp], 0(%[scale])\n\t"
+                 "vwredsum.vs v8, v16, v0\n\t"
                  "vwredsum.vs v9, v18, v0\n\t"
-                 "vwredsum.vs v8, v20, v0\n\t"
-                 "vwredsum.vs v7, v22, v0\n\t"
-                 "vwredsum.vs v11, v24, v0\n\t"
-                 "vwredsum.vs v12, v26, v0\n\t"
-                 "vwredsum.vs v13, v28, v0\n\t"
-                 "vwredsum.vs v14, v30, v0\n\t"
+                 "lbu %[t1], 1(%[scale])\n\t"
+                 "vwredsum.vs v10, v20, v0\n\t"
+                 "vwredsum.vs v11, v22, v0\n\t"
+                 "lbu %[t2], 2(%[scale])\n\t"
+                 "vwredsum.vs v12, v24, v0\n\t"
+                 "vwredsum.vs v13, v26, v0\n\t"
+                 "lbu %[t3], 3(%[scale])\n\t"
+                 "vwredsum.vs v14, v28, v0\n\t"
+                 "vwredsum.vs v15, v30, v0\n\t"
+                 "lbu %[t4], 4(%[scale])\n\t"
+                 "vwredsum.vs v8, v17, v8\n\t"
+                 "vwredsum.vs v9, v19, v9\n\t"
+                 "lbu %[t5], 5(%[scale])\n\t"
+                 "vwredsum.vs v10, v21, v10\n\t"
+                 "vwredsum.vs v11, v23, v11\n\t"
+                 "lbu %[t6], 6(%[scale])\n\t"
+                 "vwredsum.vs v12, v25, v12\n\t"
+                 "vwredsum.vs v13, v27, v13\n\t"
+                 "lbu %[t7], 7(%[scale])\n\t"
+                 "vwredsum.vs v14, v29, v14\n\t"
+                 "vwredsum.vs v15, v31, v15\n\t"
                  "vsetivli zero, 4, e32, m1\n\t"
-                 "vslideup.vi v10, v9, 1\n\t"
-                 "vslideup.vi v8, v7, 1\n\t"
-                 "vslideup.vi v11, v12, 1\n\t"
-                 "vslideup.vi v13, v14, 1\n\t"
-                 "vslideup.vi v10, v8, 2\n\t"
-                 "vslideup.vi v11, v13, 2\n\t"
-                 "vsetivli zero, 8, e32, m2\n\t"
-                 "vle8.v v15, (%[scale])\n\t"
-                 "vzext.vf4 v12, v15\n\t"
-                 "vmul.vv v10, v10, v12\n\t"
-                 "vredsum.vs v0, v10, v0\n\t"
+                 "vmul.vx v0, v8, %[tmp]\n\t"
+                 "vmul.vx v1, v9, %[t1]\n\t"
+                 "vmacc.vx v0, %[t2], v10\n\t"
+                 "vmacc.vx v1, %[t3], v11\n\t"
+                 "vmacc.vx v0, %[t4], v12\n\t"
+                 "vmacc.vx v1, %[t5], v13\n\t"
+                 "vmacc.vx v0, %[t6], v14\n\t"
+                 "vmacc.vx v1, %[t7], v15\n\t"
                  "vmv.x.s %[tmp], v0\n\t"
-                 "add %[isum], %[isum], %[tmp]"
-                 : [tmp] "=&r" (tmp), [isum] "+&r" (isum)
+                 "vmv.x.s %[t1], v1\n\t"
+                 "add %[isum], %[isum], %[tmp]\n\t"
+                 "add %[isum], %[isum], %[t1]"
+                 : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                 , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                 , [isum] "+&r" (isum)
                  : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
-                 , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
                  : "memory"
                  , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                  , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
@@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
          const int8_t * restrict q8 = y[i].qs;

          int8_t * scale = (int8_t *)utmp;
-         int tmp;
+         int tmp, t1, t2, t3, t4, t5, t6, t7;
          __asm__ __volatile__(
              "vsetivli zero, 12, e8, m1\n\t"
              "vle8.v v0, (%[s6b])\n\t"
@@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
          int isum = 0;
          for (int j = 0; j < QK_K; j += 128) {
              __asm__ __volatile__(
+                 "lb zero, 31(%[q3])\n\t"
                  "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
                  "vle8.v v8, (%[q3])\n\t"
                  "vsrl.vi v10, v8, 2\n\t"
                  "vsrl.vi v12, v8, 4\n\t"
                  "vsrl.vi v14, v8, 6\n\t"
+                 "lb zero, 64(%[q8])\n\t"
                  "vand.vi v8, v8, 3\n\t"
                  "vand.vi v10, v10, 3\n\t"
                  "vand.vi v12, v12, 3\n\t"
                  "vle8.v v2, (%[qh])\n\t"
+                 "lb zero, 127(%[q8])\n\t"
                  "vand.vx v4, v2, %[m]\n\t"
                  "slli %[m], %[m], 1\n\t"
                  "vmseq.vx v0, v4, zero\n\t"
                  "vadd.vi v8, v8, -4, v0.t\n\t"
+                 "lb zero, 0(%[q8])\n\t"
                  "vand.vx v4, v2, %[m]\n\t"
                  "slli %[m], %[m], 1\n\t"
                  "vmseq.vx v0, v4, zero\n\t"
@@ -994,34 +1044,43 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
                  "vadd.vi v14, v14, -4, v0.t\n\t"
                  "vsetvli zero, %[vl128], e8, m8\n\t"
                  "vle8.v v0, (%[q8])\n\t"
+                 "lb %[tmp], 0(%[scale])\n\t"
+                 "lb %[t1], 1(%[scale])\n\t"
+                 "lb %[t2], 2(%[scale])\n\t"
+                 "lb %[t3], 3(%[scale])\n\t"
                  "vsetvli zero, %[vl64], e8, m4\n\t"
                  "vwmul.vv v16, v0, v8\n\t"
                  "vwmul.vv v24, v4, v12\n\t"
                  "vsetivli zero, 16, e16, m2\n\t"
                  "vmv.v.x v0, zero\n\t"
-                 "vwredsum.vs v10, v16, v0\n\t"
+                 "vwredsum.vs v8, v16, v0\n\t"
+                 "lb %[t4], 4(%[scale])\n\t"
+                 "lb %[t5], 5(%[scale])\n\t"
                  "vwredsum.vs v9, v18, v0\n\t"
-                 "vwredsum.vs v8, v20, v0\n\t"
-                 "vwredsum.vs v7, v22, v0\n\t"
-                 "vwredsum.vs v11, v24, v0\n\t"
-                 "vwredsum.vs v12, v26, v0\n\t"
-                 "vwredsum.vs v13, v28, v0\n\t"
-                 "vwredsum.vs v14, v30, v0\n\t"
+                 "vwredsum.vs v10, v20, v0\n\t"
+                 "vwredsum.vs v11, v22, v0\n\t"
+                 "vwredsum.vs v12, v24, v0\n\t"
+                 "lb %[t6], 6(%[scale])\n\t"
+                 "lb %[t7], 7(%[scale])\n\t"
+                 "vwredsum.vs v13, v26, v0\n\t"
+                 "vwredsum.vs v14, v28, v0\n\t"
+                 "vwredsum.vs v15, v30, v0\n\t"
                  "vsetivli zero, 4, e32, m1\n\t"
-                 "vslideup.vi v10, v9, 1\n\t"
-                 "vslideup.vi v8, v7, 1\n\t"
-                 "vslideup.vi v11, v12, 1\n\t"
-                 "vslideup.vi v13, v14, 1\n\t"
-                 "vslideup.vi v10, v8, 2\n\t"
-                 "vslideup.vi v11, v13, 2\n\t"
-                 "vsetivli zero, 8, e32, m2\n\t"
-                 "vle8.v v15, (%[scale])\n\t"
-                 "vsext.vf4 v12, v15\n\t"
-                 "vmul.vv v10, v10, v12\n\t"
-                 "vredsum.vs v0, v10, v0\n\t"
+                 "vmul.vx v0, v8, %[tmp]\n\t"
+                 "vmul.vx v1, v9, %[t1]\n\t"
+                 "vmacc.vx v0, %[t2], v10\n\t"
+                 "vmacc.vx v1, %[t3], v11\n\t"
+                 "vmacc.vx v0, %[t4], v12\n\t"
+                 "vmacc.vx v1, %[t5], v13\n\t"
+                 "vmacc.vx v0, %[t6], v14\n\t"
+                 "vmacc.vx v1, %[t7], v15\n\t"
                  "vmv.x.s %[tmp], v0\n\t"
-                 "add %[isum], %[isum], %[tmp]"
-                 : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
+                 "vmv.x.s %[t1], v1\n\t"
+                 "add %[isum], %[isum], %[tmp]\n\t"
+                 "add %[isum], %[isum], %[t1]"
+                 : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                 , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                 , [m] "+&r" (m), [isum] "+&r" (isum)
                  : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
                  , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
                  : "memory"
package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp (new file)
@@ -0,0 +1,50 @@
+ #include "ggml-backend-impl.h"
+
+ #if defined(__s390x__)
+ #include <sys/auxv.h>
+
+ // find hwcap bits in asm/elf.h
+ #ifndef HWCAP_VXRS_EXT2
+ #define HWCAP_VXRS_EXT2 (1 << 15)
+ #endif
+
+ #ifndef HWCAP_NNPA
+ #define HWCAP_NNPA (1 << 20)
+ #endif
+
+ struct s390x_features {
+     bool has_vxe2 = false;
+     bool has_nnpa = false;
+
+     s390x_features() {
+         uint32_t hwcap = getauxval(AT_HWCAP);
+         // NOTE: use hwcap2 with DFLT for z17 and later
+         // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+         has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
+         has_nnpa = !!(hwcap & HWCAP_NNPA);
+     }
+ };
+
+ static int ggml_backend_cpu_s390x_score() {
+     int score = 1;
+     s390x_features sf;
+
+ // IBM z15 / LinuxONE 3
+ #ifdef GGML_USE_VXE2
+     if (!sf.has_vxe2) { return 0; }
+     score += 1 << 1;
+ #endif
+
+ // IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+ #ifdef GGML_USE_NNPA
+     if (!sf.has_nnpa) { return 0; }
+     score += 1 << 2;
+ #endif
+
+     return score;
+ }
+
+ GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+ #endif // __s390x__
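To make the scoring concrete, a small standalone sketch (not from the package) that mirrors the arithmetic above; it assumes the usual ggml behavior that, when the CPU backend is built as multiple variants (GGML_CPU_ALL_VARIANTS with GGML_BACKEND_DL), the loader picks the variant reporting the highest non-zero score on the running machine:

    #include <cstdio>

    // same arithmetic as ggml_backend_cpu_s390x_score(), with the compile-time
    // feature set passed in explicitly instead of via the GGML_USE_* defines
    static int variant_score(bool built_vxe2, bool built_nnpa, bool has_vxe2, bool has_nnpa) {
        int score = 1;
        if (built_vxe2) { if (!has_vxe2) return 0; score += 1 << 1; }
        if (built_nnpa) { if (!has_nnpa) return 0; score += 1 << 2; }
        return score;
    }

    int main() {
        // on a z16 (VXE2 + NNPA) the z16 build wins; on a z15 it disqualifies itself
        std::printf("z15 variant on z16: %d\n", variant_score(true, false, true, true));  // 3
        std::printf("z16 variant on z16: %d\n", variant_score(true, true,  true, true));  // 7
        std::printf("z16 variant on z15: %d\n", variant_score(true, true,  true, false)); // 0
        return 0;
    }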
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -500,13 +500,15 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {

  #endif

- #if defined(__loongarch_asx)
+ #if defined(__loongarch_sx)
  /* float type data load instructions */
  static __m128 __lsx_vreplfr2vr_s(const float val) {
      v4f32 res = {val, val, val, val};
      return (__m128)res;
  }
+ #endif

+ #if defined(__loongarch_asx)
  static __m256 __lasx_xvreplfr2vr_s(const float val) {
      v8f32 res = {val, val, val, val, val, val, val, val};
      return (__m256)res;
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1613,13 +1613,8 @@ static void ggml_compute_forward_mul_mat_id(
          chunk_size = 64;
      }

- #if defined(__aarch64__)
-     // disable for ARM
-     const bool disable_chunking = true;
- #else
      // disable for NUMA
      const bool disable_chunking = ggml_is_numa();
- #endif // defined(__aarch64__)

      int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
      int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;