@fugood/llama.node 0.3.17 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
@@ -214,7 +214,7 @@ add_library(ggml
  target_link_libraries(ggml PUBLIC ggml-base)

  if (CMAKE_SYSTEM_NAME MATCHES "Linux")
- target_link_libraries(ggml PRIVATE dl stdc++fs)
+ target_link_libraries(ggml PRIVATE dl)
  endif()

  function(ggml_add_backend_library backend)
@@ -816,7 +816,10 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
  static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
  size_t node_size = 0;
  if (!node->data && !node->view_src) {
- GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+ // If we previously had data but don't now then reallocate
+ if (talloc->buffer_id < 0) {
+ return false;
+ }
  node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
  }
  return talloc->size_max >= node_size;
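Note on the ggml-alloc.c hunk above: instead of asserting on a negative buffer_id, ggml_gallocr_node_needs_realloc now reports that the graph needs to be reallocated. A minimal caller sketch in C of the flow this feeds into, assuming the usual gallocr usage; build_graph and ctx are illustrative placeholders, not part of this diff:

    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // ggml_gallocr_alloc_graph() re-reserves its buffers when the graph no longer
    // matches the previous allocation, so a "needs realloc" result now leads to a
    // reallocation rather than the old hard assert.
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    struct ggml_cgraph * graph = build_graph(ctx);   // illustrative helper
    if (!ggml_gallocr_alloc_graph(galloc, graph)) {
        // allocation failed even after re-reserving buffers
    }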
@@ -56,7 +56,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
  return SIZE_MAX;
  }

- size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
  // get_alloc_size is optional, defaults to ggml_nbytes
  if (buft->iface.get_alloc_size) {
  size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -152,7 +152,7 @@ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
  return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
  }

- size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor) {
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
  }

@@ -674,6 +674,8 @@ struct ggml_backend_sched {
  char * context_buffer;
  size_t context_buffer_size;

+ bool op_offload;
+
  int debug;
  };

@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
  if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
  int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
  // check if a backend with higher prio wants to offload the op
- if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
+ if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
  for (int b = 0; b < src_backend_id; b++) {
  if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
  SET_CAUSE(tensor, "1.off");
@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

  const int node_backend_id = tensor_backend_id(node);

- assert(node_backend_id != -1); // all nodes should be assigned by now
+ assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

  // check if we should start a new split based on the sources of the current node
  bool need_new_split = false;
@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
  ggml_backend_buffer_type_t * bufts,
  int n_backends,
  size_t graph_size,
- bool parallel) {
+ bool parallel,
+ bool op_offload) {
  GGML_ASSERT(n_backends > 0);
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
  GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
  }

  sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
+ sched->op_offload = op_offload;

  ggml_backend_sched_reset(sched);
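Note on the ggml-backend.cpp hunks above: ggml_backend_sched_new() now takes a trailing op_offload flag, which gates the weights-based operation offload checked in ggml_backend_sched_backend_id_from_cur(). A rough caller sketch, assuming backends and n_backends are already set up (those names are illustrative):

    // The final argument is the new op_offload flag; the other arguments keep
    // their existing meaning (CPU backend last, default buffer types, etc.).
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends,                   // ggml_backend_t array, CPU backend last
        NULL,                       // use the default buffer type of each backend
        n_backends,
        GGML_DEFAULT_GRAPH_SIZE,
        /*parallel  =*/ false,
        /*op_offload=*/ true);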
 
@@ -385,9 +385,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

  # Fetch KleidiAI sources:
  include(FetchContent)
- set(KLEIDIAI_COMMIT_TAG "v1.5.0")
+ set(KLEIDIAI_COMMIT_TAG "v1.6.0")
  set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
- set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
+ set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")

  if (POLICY CMP0135)
  cmake_policy(SET CMP0135 NEW)
@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  ${KLEIDIAI_SRC}/kai/ukernels/
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)

  set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
  string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)

- set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
+ set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})

- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
+ list(APPEND GGML_KLEIDIAI_SOURCES
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)

  if (NOT DOTPROD_ENABLED MATCHES -1)
- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
+ list(APPEND GGML_KLEIDIAI_SOURCES
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
  endif()

  if (NOT I8MM_ENABLED MATCHES -1)
@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  endif()

  if (NOT SME_ENABLED MATCHES -1)
- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
- list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
- set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
+ list(APPEND GGML_KLEIDIAI_SOURCES
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
+ set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
  endif()

  set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
@@ -72,8 +72,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro

  #if defined(__GNUC__)
  #pragma GCC diagnostic ignored "-Woverlength-strings"
- #elif defined(_MSC_VER)
- #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

  #define UNUSED GGML_UNUSED
@@ -20,12 +20,6 @@
  #define GROUP_MAX_EPS_IQ1_M 1e-7f
  #define GROUP_MAX_EPS_IQ1_S 1e-12f

- #if defined(_MSC_VER)
- // disable "possible loss of data" to avoid warnings for hundreds of casts
- // we should just be careful :)
- #pragma warning(disable: 4244 4267)
- #endif
-
  #define UNUSED GGML_UNUSED

  // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
@@ -6596,7 +6590,118 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  }

  *s = hsum_float_8(acc);
+ #elif defined(__VXE__) || defined(__VXE2__)
+ uint32_t aux[3];
+ uint32_t utmp[4];
+
+ const int32x4_t v_z = vec_splat_s32(0);
+ const uint8x16_t v_3m = vec_splat_u8(0x03);
+
+ const uint8x16_t v_0c = vec_splat_u8(1);
+ const uint8x16_t v_1c = vec_sl(v_0c, 1);
+ const uint8x16_t v_2c = vec_sl(v_0c, 2);
+ const uint8x16_t v_3c = vec_sl(v_0c, 3);
+
+ uint8x16_t q3h[4];
+ uint8x16_t q3b[2];
+ int8x16_t q3bytes[4];
+ int8x16_t q8bytes[4];
+ uint8x16_t qhbits[2];
+
+ float sum = 0;
+
+ for (int i = 0; i < nb; ++i) {
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+
+ const uint8_t * restrict x0l = x[i].qs;
+ const uint8_t * restrict x0h = x[i].hmask;
+ const int8_t * restrict y0 = y[i].qs;
+
+ qhbits[0] = vec_xl(0 , x0h);
+ qhbits[1] = vec_xl(16, x0h);
+
+ int32_t isum = 0;
+
+ memcpy(aux, x[i].scales, 12);
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+ int8_t * scale = (int8_t *)utmp;
+ for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+ for (int j = 0; j < QK_K/128; ++j) {
+ int32x4_t isum0, isum1, isum2, isum3;
+
+ q3b[0] = vec_xl(0 , x0l);
+ q3b[1] = vec_xl(16, x0l);
+ x0l += 32;
+
+ q8bytes[0] = vec_xl(0 , y0);
+ q8bytes[1] = vec_xl(16 , y0);
+ q8bytes[2] = vec_xl(32 , y0);
+ q8bytes[3] = vec_xl(48 , y0);
+ q8bytes[4] = vec_xl(64 , y0);
+ q8bytes[5] = vec_xl(80 , y0);
+ q8bytes[6] = vec_xl(96 , y0);
+ q8bytes[7] = vec_xl(112, y0);
+ y0 += 128;
+
+ q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
+ q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
+ q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
+ q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
+
+ q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
+ q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
+ q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
+ q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
+
+ isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
+ isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
+ isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
+ isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
+
+ isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
+ isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
+ isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
+ isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+
+ scale += 4;
+
+ q3h[0] = vec_andc(v_2c, qhbits[0]);
+ q3h[1] = vec_andc(v_2c, qhbits[1]);
+ q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
+ q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
+
+ q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
+ q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
+ q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
+ q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
+
+ isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
+ isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
+ isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
+ isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
+
+ isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
+ isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
+ isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
+ isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+
+ scale += 4;
+
+ if (j == 0) {
+ qhbits[0] = vec_sr(qhbits[0], 4);
+ qhbits[1] = vec_sr(qhbits[1], 4);
+ }
+ }
+
+ sum += d * isum;
+ }

+ *s = sum;
  #else
  // scalar version
  // This function is written like this so the compiler can manage to vectorize most of it
@@ -8414,7 +8519,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

  void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ #ifdef __ARM_FEATURE_MATMUL_INT8
+ assert((nrc == 2) || (nrc == 1));
+ #else
  assert(nrc == 1);
+ #endif
  UNUSED(nrc);
  UNUSED(bx);
  UNUSED(by);
@@ -8425,6 +8534,197 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

  const int nb = n / QK_K;

+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ if (nrc == 2) {
+ const block_q6_K * GGML_RESTRICT x0 = x;
+ const block_q6_K * GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
+ const block_q8_K * GGML_RESTRICT y0 = y;
+ const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
+
+ float32x4_t vfsum = vdupq_n_f32(0.0f);
+
+ for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
+ const uint8_t * GGML_RESTRICT ql0 = x0->ql;
+ const uint8_t * GGML_RESTRICT ql1 = x1->ql;
+ const uint8_t * GGML_RESTRICT qh0 = x0->qh;
+ const uint8_t * GGML_RESTRICT qh1 = x1->qh;
+ const int8_t * GGML_RESTRICT qy0 = y0->qs;
+ const int8_t * GGML_RESTRICT qy1 = y1->qs;
+
+ const uint8x16_t mone = vdupq_n_u8(0x30);
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
+
+ int32x4_t visum = vdupq_n_s32(0);
+
+ // process 8 blocks per iteration, totally 16 blocks
+ for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) {
+ int8x16_t vx0[8], vx1[8];
+
+ // de-quantize vx0[8]
+ {
+ const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0);
+ const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0);
+
+ uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
+ uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
+ uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
+ uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
+
+ vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
+ vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
+ vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
+ vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
+
+ q6h_0 = vandq_u8(mone, qh_bits.val[0]);
+ q6h_1 = vandq_u8(mone, qh_bits.val[1]);
+ q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
+ q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
+
+ vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
+ vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
+ vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
+ vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
+ }
+
+ // de-quantize vx1[8]
+ {
+ const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1);
+ const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1);
+
+ uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
+ uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
+ uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
+ uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
+
+ vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
+ vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
+ vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
+ vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
+
+ q6h_0 = vandq_u8(mone, qh_bits.val[0]);
+ q6h_1 = vandq_u8(mone, qh_bits.val[1]);
+ q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
+ q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
+
+ vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
+ vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
+ vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
+ vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
+ }
+
+ // process 16 elements (one block with same scale) per iteration
+ // - vx = concat(ql, qh) - 32
+ // - r1,r2,r3,r4 = smmla(vx, vy)
+ for (int k = 0; k < 8; ++k) {
+ const int blk = j * 8 + k;
+
+ const int8x16_t vy0 = vld1q_s8(qy0);
+ const int8x16_t vy1 = vld1q_s8(qy1);
+ qy0 += 16;
+ qy1 += 16;
+
+ const int32x4_t block_scale = {
+ x0->scales[blk],
+ x0->scales[blk],
+ x1->scales[blk],
+ x1->scales[blk],
+ };
+
+ // calculate four results at once with outer product
+ const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
+ const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
+ const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
+ const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
+ int32x4_t vr = vdupq_n_s32(0);
+ vr = vmmlaq_s32(vr, vx_l, vy_l);
+ vr = vmmlaq_s32(vr, vx_h, vy_h);
+
+ // apply block scale, will NOT overflow
+ // block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits
+ visum = vmlaq_s32(visum, vr, block_scale);
+ }
+ }
+
+ // adjust bias, apply superblock scale
+ {
+ int32_t bias[4];
+ #ifdef __ARM_FEATURE_SVE
+ const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
+ const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
+ const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
+ const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
+ const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
+ const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
+ const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
+ const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
+ const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
+ const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
+ const svint64_t zero = svdup_n_s64(0);
+ bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
+ svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
+ bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
+ svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
+ bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
+ svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
+ bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
+ svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
+ #else
+ // NEON doesn't support int16 dot product, fallback to separated mul and add
+ const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
+ const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
+
+ int8x16_t scales_s8 = vld1q_s8(x0->scales);
+ const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
+ scales_s8 = vld1q_s8(x1->scales);
+ const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
+
+ int32x4_t prod;
+ prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])),
+ vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))),
+ vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])),
+ vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1]))));
+ bias[0] = vaddvq_s32(prod);
+ prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])),
+ vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))),
+ vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])),
+ vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1]))));
+ bias[1] = vaddvq_s32(prod);
+ prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])),
+ vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))),
+ vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])),
+ vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1]))));
+ bias[2] = vaddvq_s32(prod);
+ prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])),
+ vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))),
+ vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])),
+ vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
+ bias[3] = vaddvq_s32(prod);
+
+ #endif
+ const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
+
+ const float32x4_t superblock_scale = {
+ GGML_FP16_TO_FP32(x0->d) * y0->d,
+ GGML_FP16_TO_FP32(x0->d) * y1->d,
+ GGML_FP16_TO_FP32(x1->d) * y0->d,
+ GGML_FP16_TO_FP32(x1->d) * y1->d,
+ };
+
+ visum = vsubq_s32(visum, vibias);
+ vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
+ }
+ }
+
+ // vfsum = ABCD -> ACBD
+ // AC -> s, BD -> (s+bs)
+ vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
+ vst1_f32(s, vget_low_f32 (vfsum));
+ vst1_f32(s + bs, vget_high_f32(vfsum));
+
+ return;
+ }
+ #endif
+
  #ifdef __ARM_FEATURE_SVE
  const int vector_length = ggml_cpu_get_sve_cnt()*8;
  float sum = 0;
@@ -50,19 +50,6 @@
  #include "llamafile/sgemm.h"
  #endif

- #if defined(_MSC_VER)
- // disable "possible loss of data" to avoid hundreds of casts
- // we should just be careful :)
- #pragma warning(disable: 4244 4267)
-
- // disable POSIX deprecation warnings
- // these functions are never going away, anyway
- #pragma warning(disable: 4996)
-
- // unreachable code because of multiple instances of code after GGML_ABORT
- #pragma warning(disable: 4702)
- #endif
-
  // Note: once we move threading into a separate C++ file
  // will use std::hardware_destructive_interference_size instead of hardcoding it here
  // and we'll use C++ attribute syntax.
@@ -295,7 +282,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
  .from_float = quantize_row_q6_K,
  .vec_dot = ggml_vec_dot_q6_K_q8_K,
  .vec_dot_type = GGML_TYPE_Q8_K,
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
+ .nrows = 2,
+ #else
  .nrows = 1,
+ #endif
  },
  [GGML_TYPE_IQ2_XXS] = {
  .from_float = NULL,
@@ -11,24 +11,26 @@
  #include <vector>

  #ifdef GGML_USE_CPU_HBM
- #include "ggml-cpu-hbm.h"
+ # include "ggml-cpu-hbm.h"
  #endif

  #ifdef GGML_USE_CPU_KLEIDIAI
- #include "kleidiai/kleidiai.h"
- #endif
-
- #if defined(__APPLE__)
- #include <sys/types.h>
- #include <sys/sysctl.h>
+ # include "kleidiai/kleidiai.h"
  #endif

  #if defined(_WIN32)
- #define WIN32_LEAN_AND_MEAN
- #ifndef NOMINMAX
- #define NOMINMAX
+ # define WIN32_LEAN_AND_MEAN
+ # ifndef NOMINMAX
+ # define NOMINMAX
+ # endif
+ # include <windows.h>
+ #else
+ # include <unistd.h>
  #endif
- #include <windows.h>
+
+ #if defined(__APPLE__)
+ # include <sys/sysctl.h>
+ # include <sys/types.h>
  #endif

  // ggml-backend interface
@@ -70,8 +72,10 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_ty
  }

  static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
- for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
- if (extra && extra == buft) return true;
+ for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
+ if (extra && extra == buft) {
+ return true;
+ }
  }
  return false;
  }
@@ -330,9 +334,18 @@ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t d
  }

  static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
- // TODO
- *free = 0;
- *total = 0;
+ #ifdef _WIN32
+ MEMORYSTATUSEX status;
+ status.dwLength = sizeof(status);
+ GlobalMemoryStatusEx(&status);
+ *total = status.ullTotalPhys;
+ *free = status.ullAvailPhys;
+ #else
+ long pages = sysconf(_SC_PHYS_PAGES);
+ long page_size = sysconf(_SC_PAGE_SIZE);
+ *total = pages * page_size;
+ *free = *total;
+ #endif

  GGML_UNUSED(dev);
  }
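Note on the last hunk: the CPU device now reports actual physical memory (GlobalMemoryStatusEx on Windows, sysconf elsewhere) instead of zeros. A small sketch of how callers reach this path through the public device API; the lookup-by-type pattern is assumed, not taken from this diff:

    #include "ggml-backend.h"

    // Queries the CPU device; this ends up in ggml_backend_cpu_device_get_memory()
    // above, which now fills in real totals instead of 0.
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    ggml_backend_dev_memory(cpu_dev, &free_mem, &total_mem);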