npm - cui-llama.rn - Versions diffs - 1.3.4 → 1.3.6 - Mend

cui-llama.rn 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)

package/android/src/main/CMakeLists.txt +14 -8
package/android/src/main/jni.cpp +38 -37
package/cpp/common.cpp +50 -30
package/cpp/common.h +32 -13
package/cpp/ggml-alloc.c +0 -1
package/cpp/ggml-backend-reg.cpp +79 -49
package/cpp/ggml-backend.cpp +5 -2
package/cpp/ggml-cpp.h +1 -0
package/cpp/ggml-cpu-aarch64.cpp +57 -72
package/cpp/ggml-cpu-quants.c +5 -1
package/cpp/ggml-cpu.c +6 -6
package/cpp/ggml-cpu.cpp +9 -0
package/cpp/ggml-impl.h +11 -0
package/cpp/ggml-metal.m +2 -2
package/cpp/ggml.c +129 -1388
package/cpp/ggml.h +29 -152
package/cpp/gguf.cpp +1325 -0
package/cpp/gguf.h +202 -0
package/cpp/llama-adapter.cpp +346 -0
package/cpp/llama-adapter.h +73 -0
package/cpp/llama-arch.cpp +1434 -0
package/cpp/llama-arch.h +395 -0
package/cpp/llama-batch.cpp +368 -0
package/cpp/llama-batch.h +88 -0
package/cpp/llama-chat.cpp +567 -0
package/cpp/llama-chat.h +51 -0
package/cpp/llama-context.cpp +1771 -0
package/cpp/llama-context.h +128 -0
package/cpp/llama-cparams.cpp +1 -0
package/cpp/llama-cparams.h +37 -0
package/cpp/llama-cpp.h +30 -0
package/cpp/llama-grammar.cpp +16 -15
package/cpp/llama-grammar.h +5 -6
package/cpp/llama-hparams.cpp +71 -0
package/cpp/llama-hparams.h +140 -0
package/cpp/llama-impl.cpp +167 -0
package/cpp/llama-impl.h +16 -136
package/cpp/llama-kv-cache.cpp +718 -0
package/cpp/llama-kv-cache.h +218 -0
package/cpp/llama-mmap.cpp +589 -0
package/cpp/llama-mmap.h +67 -0
package/cpp/llama-model-loader.cpp +1011 -0
package/cpp/llama-model-loader.h +158 -0
package/cpp/llama-model.cpp +2202 -0
package/cpp/llama-model.h +391 -0
package/cpp/llama-sampling.cpp +117 -4
package/cpp/llama-vocab.cpp +26 -29
package/cpp/llama-vocab.h +14 -2
package/cpp/llama.cpp +8839 -19131
package/cpp/llama.cpp.rej +23 -0
package/cpp/llama.h +31 -9
package/cpp/rn-llama.hpp +39 -37
package/cpp/sgemm.cpp +1091 -378
package/cpp/sgemm.h +2 -2
package/cpp/unicode.cpp +6 -0
package/package.json +1 -1

package/cpp/ggml-backend-reg.cpp CHANGED Viewed

@@ -66,6 +66,26 @@
 #include "ggml-kompute.h"
 #endif
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
 #ifdef _WIN32
 using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
     return handle;
 }
-static dl_handle * dl_load_library(const std::string & path) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return dl_load_library(converter.from_bytes(path));
-}
 static void * dl_get_sym(dl_handle * handle, const char * name) {
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -114,8 +129,8 @@ struct dl_handle_deleter {
     }
 };
-static void * dl_load_library(const std::string & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
     return handle;
 }
@@ -202,11 +217,11 @@ struct lm_ggml_backend_registry {
         devices.push_back(device);
     }
-    lm_ggml_backend_reg_t load_backend(const char * path, bool silent) {
+    lm_ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
+                LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -214,7 +229,7 @@ struct lm_ggml_backend_registry {
         auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
+                LM_GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -222,7 +237,7 @@ struct lm_ggml_backend_registry {
         auto backend_init_fn = (lm_ggml_backend_init_t) dl_get_sym(handle.get(), "lm_ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, path);
+                LM_GGML_LOG_ERROR("%s: failed to find lm_ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -231,16 +246,16 @@ struct lm_ggml_backend_registry {
         if (!reg || reg->api_version != LM_GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, path);
+                    LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: lm_ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     LM_GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path, reg->api_version, LM_GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, LM_GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
-        LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), path);
+        LM_GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, lm_ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
         register_backend(reg, std::move(handle));
@@ -376,14 +391,14 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
 // Dynamic loading
 lm_ggml_backend_reg_t lm_ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 void lm_ggml_backend_unload(lm_ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
-static std::string get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -401,13 +416,17 @@ static std::string get_executable_path() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "/";
-#elif defined(__linux__)
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+#    if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
         if (len == -1) {
             break;
         }
@@ -423,57 +442,63 @@ static std::string get_executable_path() {
         path.resize(path.size() * 2);
     }
-    return base_path + "/";
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
-    std::vector<char> path(MAX_PATH);
-    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
-        return "";
+        return {};
     }
-    std::string base_path(path.data(), len);
+    std::wstring base_path(path.data(), len);
     // remove executable name
     auto last_slash = base_path.find_last_of('\\');
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "\\";
+    return base_path + L"\\";
+#else
+    return {};
 #endif
 }
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_prefix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L"ggml-";
 #else
-    return "libggml-";
+    return L"libggml-";
 #endif
 }
-static std::string backend_filename_suffix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return ".dll";
+    return L".dll";
 #else
-    return ".so";
+    return L".so";
+#endif
+}
+static std::wstring path_separator() {
+#ifdef _WIN32
+    return L"\\";
+#else
+    return L"/";
 #endif
 }
 static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
      // TODO: search system paths
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<std::string> search_paths;
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back("./");
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-#if defined(_WIN32)
-        search_paths.push_back(std::string(user_search_path) + "\\");
-#else
-        search_paths.push_back(std::string(user_search_path) + "/");
-#endif
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
     int best_score = 0;
-    std::string best_path;
+    std::wstring best_path;
     namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
@@ -483,27 +508,27 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
         for (const auto & entry : dir_it) {
             if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                     if (!handle && !silent) {
-                        LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                        LM_GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                     }
                     if (handle) {
                         auto score_fn = (lm_ggml_backend_score_t) dl_get_sym(handle.get(), "lm_ggml_backend_score");
                         if (score_fn) {
                             int s = score_fn();
 #ifndef NDEBUG
-                            LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                            LM_GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
 #endif
                             if (s > best_score) {
                                 best_score = s;
-                                best_path = entry.path().string();
+                                best_path = entry.path().wstring();
                             }
                         } else {
                             if (!silent) {
-                                LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                                LM_GGML_LOG_INFO("%s: failed to find lm_ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             }
                         }
                     }
@@ -515,15 +540,15 @@ static lm_ggml_backend_reg_t lm_ggml_backend_load_best(const char * name, bool s
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
             if (fs::exists(path)) {
-                return get_reg().load_backend(path.c_str(), silent);
+                return get_reg().load_backend(path, silent);
             }
         }
         return nullptr;
     }
-    return get_reg().load_backend(best_path.c_str(), silent);
+    return get_reg().load_backend(best_path, silent);
 }
 void lm_ggml_backend_load_all() {
@@ -549,4 +574,9 @@ void lm_ggml_backend_load_all_from_path(const char * dir_path) {
     lm_ggml_backend_load_best("opencl", silent, dir_path);
     lm_ggml_backend_load_best("musa", silent, dir_path);
     lm_ggml_backend_load_best("cpu", silent, dir_path);
+    // check the environment variable LM_GGML_BACKEND_PATH to load an out-of-tree backend
+    const char * backend_path = std::getenv("LM_GGML_BACKEND_PATH");
+    if (backend_path) {
+        lm_ggml_backend_load(backend_path);
+    }
 }

package/cpp/ggml-backend.cpp CHANGED Viewed

@@ -764,7 +764,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
         if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1) {
+            if (src_backend_id == sched->n_backends - 1 && lm_ggml_backend_buffer_is_host(src->buffer)) {
                 for (int b = 0; b < src_backend_id; b++) {
                     if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
+            LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, lm_ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    LM_GGML_LOG_DEBUG(": ");
+                }
                 LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }

package/cpp/ggml-cpp.h CHANGED Viewed

@@ -7,6 +7,7 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"
 #include <memory>
 // Smart pointers for ggml types

package/cpp/ggml-cpu-aarch64.cpp CHANGED Viewed

@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
 }
 static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_avx_epi32(zero, ax, sy);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -564,21 +567,21 @@ static void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
         for (int c = 0; c < nc; c += ncols_interleaved) {
-            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
             float32x4_t acc = vdupq_n_f32(0);
             for (int b = 0; b < nb; b++) {
-                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
-                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
-                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
-                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
-                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
                 int8x16_t a0 = vld1q_s8(a_ptr->qs);
                 int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
-                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
                 int32x4_t ret = vdupq_n_s32(0);
@@ -647,72 +650,52 @@ static void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
-        __asm__ __volatile__(
-            "movi v2.16b, #0x4\n"
-            "movi v1.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x23, %x[a_ptr], #0x2\n"
-            "movi v0.16b, #0x0\n"
-            "mov x22, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q31, [%x[b_ptr], #0x0]\n"
-            "ldr q30, [%x[b_ptr], #0x10]\n"
-            "mov x21, x23\n"
-            "movi v29.4s, #0x0\n"
-            "ldr q28, [%x[b_ptr], #0x20]\n"
-            "ldr q27, [%x[b_ptr], #0x30]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x23, #0x2\n"
-            "ld1r { v25.8h }, [x20]\n"
-            "ldr q24, [%x[b_ptr], #-0x8]\n"
-            "sub x22, x22, #0x1\n"
-            "add x23, x23, #0x22\n"
-            "ld1r { v23.2d }, [x21], #0x8\n"
-            "sshl v22.16b, v31.16b, v2.16b\n"
-            "sshl v16.16b, v30.16b, v2.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "ld1r { v21.2d }, [x21], #0x8\n"
-            "sshl v20.16b, v28.16b, v2.16b\n"
-            "sshl v19.16b, v27.16b, v2.16b\n"
-            "ld1r { v18.2d }, [x21], #0x8\n"
-            "ld1r { v17.2d }, [x21], #0x8\n"
-            "and v31.16b, v31.16b, v1.16b\n"
-            "and v30.16b, v30.16b, v1.16b\n"
-            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
-            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
-            "and v28.16b, v28.16b, v1.16b\n"
-            "and v27.16b, v27.16b, v1.16b\n"
-            "fcvtl v25.4s, v25.4h\n"
-            "fcvtl v16.4s, v24.4h\n"
-            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
-            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                        vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
@@ -4186,6 +4169,8 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_bu
     buffer->buft              = buft;
     buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
     buffer->iface.set_tensor  = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
+    buffer->iface.get_tensor  = nullptr;
+    buffer->iface.cpy_tensor  = nullptr;
     return buffer;
 }

package/cpp/ggml-cpu-quants.c CHANGED Viewed

@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);

package/cpp/ggml-cpu.c CHANGED Viewed

@@ -985,7 +985,7 @@ inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
 #define LM_GGML_F16_STEP 32
 #define LM_GGML_F16_EPR  4
-static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const lm_ggml_fp16_t * x) {
     float tmp[4];
     tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
@@ -996,7 +996,7 @@ static inline __m128 __sse_f16x4_load(lm_ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
-static inline void __sse_f16x4_store(lm_ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
     float arr[4];
     _mm_storeu_ps(arr, y);
@@ -7418,14 +7418,14 @@ static void lm_ggml_compute_forward_mul_mat(
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/lm_ggml_type_size(src0->type),
                                      (const char *)src1->data + i12*nb12 + i13*nb13,
                                      nb11/lm_ggml_type_size(src1->type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/lm_ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      src1->type,
                                      dst->type))
@@ -7470,14 +7470,14 @@ UseGgmlGemm1:;
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/lm_ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/lm_ggml_type_size(src0->type),
                                      (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/lm_ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/lm_ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      vec_dot_type,
                                      dst->type))

package/cpp/ggml-cpu.cpp CHANGED Viewed

@@ -393,8 +393,11 @@ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, co
     switch (op->op) {
         case LM_GGML_OP_CPY:
             return
+                op->type != LM_GGML_TYPE_IQ3_XXS &&
+                op->type != LM_GGML_TYPE_IQ3_S   &&
                 op->type != LM_GGML_TYPE_IQ2_XXS &&
                 op->type != LM_GGML_TYPE_IQ2_XS  &&
+                op->type != LM_GGML_TYPE_IQ2_S   &&
                 op->type != LM_GGML_TYPE_IQ1_S   &&
                 op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case LM_GGML_OP_MUL_MAT:
@@ -518,6 +521,12 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
         if (lm_ggml_cpu_has_sve()) {
             features.push_back({ "SVE", "1" });
         }
+        if (lm_ggml_cpu_has_dotprod()) {
+            features.push_back({ "DOTPROD", "1" });
+        }
+        if (lm_ggml_cpu_has_matmul_int8()) {
+            features.push_back({ "MATMUL_INT8", "1" });
+        }
         if (lm_ggml_cpu_get_sve_cnt() > 0) {
             static std::string sve_cnt = std::to_string(lm_ggml_cpu_get_sve_cnt());
             features.push_back({ "SVE_CNT", sve_cnt.c_str() });

package/cpp/ggml-impl.h CHANGED Viewed

@@ -3,6 +3,8 @@
 // GGML internal header
 #include "ggml.h"
+#include "gguf.h"
 #include <assert.h>
 #include <math.h>
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -554,3 +556,12 @@ static inline lm_ggml_bf16_t lm_ggml_compute_fp32_to_bf16(float s) {
 #ifdef __cplusplus
 }
 #endif
+#ifdef __cplusplus
+#include <vector>
+// expose GGUF internals for test code
+LM_GGML_API size_t lm_gguf_type_size(enum lm_gguf_type type);
+LM_GGML_API struct lm_gguf_context * lm_gguf_init_from_file_impl(FILE * file, struct lm_gguf_init_params params);
+LM_GGML_API void lm_gguf_write_to_buf(const struct lm_gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+#endif // __cplusplus

package/cpp/ggml-metal.m CHANGED Viewed

@@ -2067,8 +2067,8 @@ static void lm_ggml_metal_encode_node(
                 LM_GGML_ASSERT(ne12 % ne02 == 0);
                 LM_GGML_ASSERT(ne13 % ne03 == 0);
-                const uint r2 = ne12/ne02;
-                const uint r3 = ne13/ne03;
+                const uint32_t r2 = ne12/ne02;
+                const uint32_t r3 = ne13/ne03;
                 // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                 // to the matrix-vector kernel