cui-llama.rn 1.7.4 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/chat.h CHANGED
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "common.h"
+#include <functional>
 #include <chrono>
 #include <string>
 #include <vector>
@@ -21,11 +22,19 @@ struct common_chat_tool_call {
     std::string name;
     std::string arguments;
     std::string id;
+
+    bool operator==(const common_chat_tool_call & other) const {
+        return name == other.name && arguments == other.arguments && id == other.id;
+    }
 };
 
 struct common_chat_msg_content_part {
     std::string type;
     std::string text;
+
+    bool operator==(const common_chat_msg_content_part & other) const {
+        return type == other.type && text == other.text;
+    }
 };
 
 struct common_chat_msg {
@@ -36,6 +45,51 @@ struct common_chat_msg {
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
+
+    template <class T> T to_json_oaicompat() const;
+
+    bool empty() const {
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
+    }
+    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+        for (auto i = 0u; i < tool_calls.size(); i++) {
+            if (ids_cache.size() <= i) {
+                auto id = tool_calls[i].id;
+                if (id.empty()) {
+                    id = gen_tool_call_id();
+                }
+                ids_cache.push_back(id);
+            }
+            tool_calls[i].id = ids_cache[i];
+        }
+    }
+    bool operator==(const common_chat_msg & other) const {
+        return role == other.role
+            && content == other.content
+            && content_parts == other.content_parts
+            && tool_calls == other.tool_calls
+            && reasoning_content == other.reasoning_content
+            && tool_name == other.tool_name
+            && tool_call_id == other.tool_call_id;
+    }
+    bool operator!=(const common_chat_msg & other) const {
+        return !(*this == other);
+    }
+};
+
+struct common_chat_msg_diff {
+    std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
+    common_chat_tool_call tool_call_delta;
+
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+
+    bool operator==(const common_chat_msg_diff & other) const {
+        return content_delta == other.content_delta
+            && tool_call_index == other.tool_call_index
+            && tool_call_delta == other.tool_call_delta;
+    }
 };
 
 struct common_chat_tool {
@@ -57,14 +111,11 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -79,7 +130,8 @@ struct common_chat_templates_inputs {
     std::vector<common_chat_tool> tools;
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
-    bool extract_reasoning = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
@@ -88,11 +140,21 @@ struct common_chat_params {
     std::string prompt;
     std::string grammar;
     bool grammar_lazy = false;
+    bool thinking_forced_open = false;
     std::vector<common_grammar_trigger> grammar_triggers;
     std::vector<std::string> preserved_tokens;
     std::vector<std::string> additional_stops;
 };
 
+struct common_chat_syntax {
+    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
+    bool reasoning_in_content = false;
+    bool thinking_forced_open = false;
+    bool parse_tool_calls = true;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
 
@@ -129,8 +191,9 @@ std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
     bool use_jinja);
 
-std::string common_chat_format_name(common_chat_format format);
-common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
+const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
@@ -143,3 +206,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+
+template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
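
Note: the chat.h changes above replace the one-shot common_chat_parse(input, format) with a partial-aware parser configured through common_chat_syntax, and add common_chat_msg_diff::compute_diffs for computing streaming deltas. A minimal sketch of how a caller might drive the new API, assuming a token loop that accumulates the raw model output (the surrounding loop and the format choice are illustrative, not from the diff):

    #include "chat.h"

    void on_new_token(const std::string & accumulated_output, common_chat_msg & last_msg) {
        common_chat_syntax syntax;
        syntax.format           = COMMON_CHAT_FORMAT_DEEPSEEK_R1;  // illustrative choice
        syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

        // is_partial = true: generation is still in flight, so the parser
        // must tolerate truncated tags / truncated JSON
        common_chat_msg msg = common_chat_parse(accumulated_output, /* is_partial */ true, syntax);

        for (const auto & diff : common_chat_msg_diff::compute_diffs(last_msg, msg)) {
            // diff.content_delta / diff.reasoning_content_delta / diff.tool_call_delta
            // map naturally onto OAI-compatible streaming chunks
        }
        last_msg = msg;
    }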
package/cpp/common.cpp CHANGED
@@ -210,6 +210,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
 
     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case LM_GGML_SCHED_PRIO_LOW:    p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
        case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case LM_GGML_SCHED_PRIO_HIGH:   p = HIGH_PRIORITY_CLASS; break;
@@ -235,6 +236,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
 
     int p = 0;
     switch (prio) {
+        case LM_GGML_SCHED_PRIO_LOW:    p = 5; break;
        case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
        case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
        case LM_GGML_SCHED_PRIO_HIGH:   p = -10; break;
@@ -471,7 +473,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
 
 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "\\$0");
+    return std::regex_replace(s, special_chars, "\\$&");
 }
 
 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -711,11 +713,17 @@ bool fs_validate_filename(const std::string & filename) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
     filename_utf32 = converter.from_bytes(filename);
@@ -772,6 +780,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }
 
+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -789,9 +800,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
 
-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
        if (!success) {
            const DWORD error = GetLastError();
 
@@ -805,8 +823,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                return false;
            }
        }
-
-        pos_slash += 1;
    }
 
    return true;
@@ -856,7 +872,7 @@ std::string fs_get_cache_directory() {
    if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
-#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else {
@@ -902,31 +918,6 @@ struct common_init_result common_init_from_params(common_params & params) {
 
    const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
    auto cparams = common_context_params_to_llama(params);
 
    llama_context * lctx = llama_init_from_model(model, cparams);
@@ -936,7 +927,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }
 
-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }
@@ -968,6 +959,35 @@ struct common_init_result common_init_from_params(common_params & params) {
        }
    }
 
+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
    // load and optionally apply lora adapters
    for (auto & la : params.lora_adapters) {
        llama_adapter_lora_ptr lora;
@@ -1043,7 +1063,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
-        llama_kv_self_clear(lctx);
+        llama_memory_clear(llama_get_memory(lctx), true);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);
@@ -1145,11 +1165,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.op_offload = !params.no_op_offload;
    cparams.swa_full = params.swa_full;
 
-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
    cparams.type_k = params.cache_type_k;
    cparams.type_v = params.cache_type_v;
 
@@ -1282,6 +1297,9 @@ std::vector<llama_token> common_tokenize(
    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
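
Note: the regex_escape change above ("\\$0" -> "\\$&") fixes a real bug: in std::regex_replace's ECMAScript format strings, "$&" denotes the whole match, while "$0" has no special meaning and is copied literally. A standalone check (illustrative only, not part of the package):

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
        const std::string s = "a+b";
        std::cout << std::regex_replace(s, special_chars, "\\$0") << '\n'; // "a\$0b" - old behaviour, wrong
        std::cout << std::regex_replace(s, special_chars, "\\$&") << '\n'; // "a\+b"  - new behaviour, correct
    }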
package/cpp/common.h CHANGED
@@ -126,7 +126,7 @@ enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 };
 
 struct common_grammar_trigger {
@@ -210,6 +210,9 @@ struct common_params_speculative {
    float p_split = 0.1f; // speculative decoding split probability
    float p_min = 0.75f;  // minimum speculative decoding probability (greedy)
 
+    lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
+    lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
+
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
 
@@ -226,7 +229,8 @@ struct common_params_vocoder {
 
 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
 struct common_params {
@@ -306,6 +310,7 @@ struct common_params {
    int32_t verbosity = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end = -1;   // layer range for control vector
+    bool offline = false;
 
    int32_t ppl_stride = 0;      // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -368,7 +373,7 @@ struct common_params {
    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out = "";  // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
+    std::string cls_sep = "\t"; // separator of classification sequences
 
    // server params
    int32_t port = 8080; // server listens on this network port
@@ -383,6 +388,7 @@ struct common_params {
    bool use_jinja = false; // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
    std::vector<std::string> api_keys;
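
Note: a hedged illustration of the new common_params fields above; the field names come from the diff, while the `speculative` member and the exact semantics of `offline` / `reasoning_budget` are assumptions based on upstream llama.cpp:

    #include "common.h"

    static common_params make_params() {
        common_params params;
        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; // extract reasoning_content, incl. streaming deltas
        params.reasoning_budget = -1;                               // assumed: -1 = no cap on thinking tokens
        params.offline          = true;                             // assumed: avoid network access (e.g. model downloads)
        params.speculative.cache_type_k = LM_GGML_TYPE_F16;         // KV cache types for the draft model
        params.speculative.cache_type_v = LM_GGML_TYPE_F16;
        return params;
    }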
package/cpp/ggml-backend-reg.cpp CHANGED
@@ -69,6 +69,9 @@
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
 namespace fs = std::filesystem;
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
 #ifdef _WIN32
package/cpp/ggml-backend.cpp CHANGED
@@ -1340,7 +1340,10 @@ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
    // allocate graph
    if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
        // the re-allocation may cause the split inputs to be moved to a different address
-        lm_ggml_backend_sched_synchronize(sched);
+        // synchronize without lm_ggml_backend_sched_synchronize to avoid changing cur_copy
+        for (int i = 0; i < sched->n_backends; i++) {
+            lm_ggml_backend_synchronize(sched->backends[i]);
+        }
 #ifndef NDEBUG
        LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -1564,7 +1567,6 @@ bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_
 
    lm_ggml_backend_sched_split_graph(sched, graph);
 
-
    if (!lm_ggml_backend_sched_alloc_splits(sched)) {
        return false;
    }
@@ -1598,6 +1600,12 @@ void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
    for (int i = 0; i < sched->n_backends; i++) {
        lm_ggml_backend_synchronize(sched->backends[i]);
    }
+    if (!sched->is_alloc) {
+        // if the graph is not already allocated, always use copy 0 after a synchronization
+        // this ensures that during generation the same copy is used every time,
+        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+        sched->cur_copy = 0;
+    }
 }
 
 void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
package/cpp/ggml-common.h CHANGED
@@ -1074,6 +1074,10 @@ LM_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 LM_GGML_TABLE_END()
 
+LM_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
+    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
+LM_GGML_TABLE_END()
+
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
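
Note: kvalues_iq4nl is the 16-entry non-linear codebook used by the IQ4_NL / IQ4_XS quantization formats: each 4-bit index selects an int8_t codebook value, which is then multiplied by the block scale. A simplified sketch of decoding one packed byte (real kernels operate on whole blocks of packed nibbles):

    #include <cstdint>

    static const int8_t kvalues_iq4nl[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
    };

    // dequantize the two 4-bit indices packed in one byte, given block scale d
    inline void dequant_iq4nl_pair(uint8_t packed, float d, float & lo, float & hi) {
        lo = d * kvalues_iq4nl[packed & 0x0F]; // low nibble
        hi = d * kvalues_iq4nl[packed >> 4];   // high nibble
    }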
package/cpp/ggml-cpu/amx/amx.cpp CHANGED
@@ -5,7 +5,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
-#include "ggml-cpu-traits.h"
+#include "traits.h"
 
 #if defined(__gnu_linux__)
 #include <sys/syscall.h>
package/cpp/ggml-cpu/amx/mmq.cpp CHANGED
@@ -8,7 +8,8 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
-#include "ggml-cpu-quants.h"
+#include "simd-mappings.h"
+#include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
 #include <type_traits>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
 
        // Quantize these floats
        const float iscale = 127.f / amax;
-        y[i].d = LM_GGML_FP32_TO_FP16(1 / iscale);
+        y[i].d = LM_GGML_CPU_FP32_TO_FP16(1 / iscale);
        const float id = ( amax != 0.0f ) ? iscale : 0.f;
        const __m512 vscale = _mm512_set1_ps(id);
 
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
        const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(lm_ggml_half))));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[m * lda].d));
-            const __m512 vs1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[m * lda].s));
+            const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vs1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].s));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
                va[k] = _mm512_set1_epi32(a_ptr[k]);
                vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
            }
-            vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
        }
 
        // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
            for (int k = 0; k < 8; ++k) {
                va[k] = _mm512_set1_epi32(a_ptr[k]);
            }
-            vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[0 * KB + i].d));
-            vs1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[0 * KB + i].s));
+            vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+            vs1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
        }
 
        // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
                va[k] = _mm512_set1_epi32(a_ptr[k]);
                va[k] = _mm512_add_epi8(va[k], off);
            }
-            vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
        }
 
        // load b
package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp ADDED
@@ -0,0 +1,94 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM (1 << 13)
+#endif
+
+#if !defined(HWCAP2_SME)
+#define HWCAP2_SME (1 << 23)
+#endif
+
+struct aarch64_features {
+    // has_neon not needed, aarch64 has NEON guaranteed
+    bool has_dotprod = false;
+    bool has_fp16_va = false;
+    bool has_sve = false;
+    bool has_sve2 = false;
+    bool has_i8mm = false;
+    bool has_sme = false;
+
+    aarch64_features() {
+#if defined(__linux__)
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
+        has_fp16_va = !!(hwcap & HWCAP_FPHP);
+        has_sve = !!(hwcap & HWCAP_SVE);
+        has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
+        has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+        has_sme = !!(hwcap2 & HWCAP2_SME);
+#elif defined(__APPLE__)
+        int oldp = 0;
+        size_t size = sizeof(oldp);
+
+        if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
+            has_dotprod = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
+            has_i8mm = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
+            has_sme = static_cast<bool>(oldp);
+        }
+
+        // Apple apparently does not implement SVE yet
+#endif
+    }
+};
+
+static int lm_ggml_backend_cpu_aarch64_score() {
+    int score = 1;
+    aarch64_features af;
+
+#ifdef LM_GGML_USE_DOTPROD
+    if (!af.has_dotprod) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef LM_GGML_USE_FP16_VECTOR_ARITHMETIC
+    if (!af.has_fp16_va) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef LM_GGML_USE_SVE
+    if (!af.has_sve) { return 0; }
+    score += 1<<3;
+#endif
+#ifdef LM_GGML_USE_MATMUL_INT8
+    if (!af.has_i8mm) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef LM_GGML_USE_SVE2
+    if (!af.has_sve2) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef LM_GGML_USE_SME
+    if (!af.has_sme) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+LM_GGML_BACKEND_DL_SCORE_IMPL(lm_ggml_backend_cpu_aarch64_score)
+
+#endif // defined(__aarch64__)
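
Note: the score function above returns 0 when the build requires a feature the running CPU lacks (so that variant is rejected outright), and otherwise sums power-of-two weights, so a build compiled for more matching features always outranks a lesser one. This is presumably how the per-feature Android libraries in the file list (librnllama_v8_2_dotprod.so, librnllama_v8_2_dotprod_i8mm.so, ...) can be ranked at load time. A small standalone example mirroring the weighting (hypothetical helper, not in the package):

    #include <cstdio>

    // score a variant that optionally requires dotprod / i8mm,
    // on a CPU described by (cpu_dotprod, cpu_i8mm)
    static int score_variant(bool needs_dotprod, bool needs_i8mm,
                             bool cpu_dotprod, bool cpu_i8mm) {
        int score = 1;
        if (needs_dotprod) { if (!cpu_dotprod) return 0; score += 1 << 1; }
        if (needs_i8mm)    { if (!cpu_i8mm)    return 0; score += 1 << 4; }
        return score;
    }

    int main() {
        // CPU with dotprod but without i8mm:
        std::printf("%d\n", score_variant(false, false, true, false)); // 1 (baseline build)
        std::printf("%d\n", score_variant(true,  false, true, false)); // 3 (preferred)
        std::printf("%d\n", score_variant(true,  true,  true, false)); // 0 (rejected)
    }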