@fugood/llama.node 0.3.11 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -0
- package/lib/index.js +26 -20
- package/lib/index.ts +32 -28
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +13 -4
- package/src/llama.cpp/.github/workflows/build.yml +35 -3
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +20 -3
- package/src/llama.cpp/common/arg.cpp +180 -3
- package/src/llama.cpp/common/chat-template.hpp +21 -7
- package/src/llama.cpp/common/chat.cpp +220 -101
- package/src/llama.cpp/common/chat.hpp +3 -0
- package/src/llama.cpp/common/common.h +15 -7
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/minja.hpp +24 -9
- package/src/llama.cpp/common/sampling.cpp +52 -46
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/run/run.cpp +5 -12
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +58 -47
- package/src/llama.cpp/examples/server/utils.hpp +7 -5
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
- package/src/llama.cpp/ggml/src/ggml.c +1 -1
- package/src/llama.cpp/include/llama.h +14 -10
- package/src/llama.cpp/src/llama-grammar.cpp +1 -1
- package/src/llama.cpp/src/llama-grammar.h +1 -1
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +131 -57
- package/src/llama.cpp/src/llama.cpp +7 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
- package/src/llama.cpp/tests/test-chat.cpp +237 -69
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/src/llama.cpp/examples/server/server.cpp

@@ -42,7 +42,7 @@ enum stop_type {
     STOP_TYPE_LIMIT,
 };
 
-// state diagram: https://github.com/
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
     SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
@@ -173,6 +173,7 @@ struct slot_params {
             {"grammar_trigger_words", grammar_trigger_words},
             {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"preserved_tokens", sampling.preserved_tokens},
+            {"chat_format", common_chat_format_name(oaicompat_chat_format)},
             {"samplers", samplers},
             {"speculative.n_max", speculative.n_max},
             {"speculative.n_min", speculative.n_min},
@@ -334,24 +335,24 @@ struct server_task {
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());
-
+                SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
                 params.sampling.grammar = json_schema_to_grammar(schema);
-
+                SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
             } catch (const std::exception & e) {
                 throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
             }
         } else {
             params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
-
+            SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
             params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
-
+            SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
         }
 
         {
             auto it = data.find("chat_format");
             if (it != data.end()) {
                 params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
-
+                SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
             } else {
                 params.oaicompat_chat_format = defaults.oaicompat_chat_format;
             }
@@ -367,12 +368,12 @@ struct server_task {
 
                     auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
                     if (ids.size() == 1) {
-
+                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
                         params.sampling.grammar_trigger_tokens.push_back(ids[0]);
                         params.sampling.preserved_tokens.insert(ids[0]);
                         continue;
                     }
-
+                    SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
                     params.sampling.grammar_trigger_words.push_back(trigger);
                 }
             }
@@ -381,11 +382,11 @@ struct server_task {
                 for (const auto & t : *preserved_tokens) {
                     auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
                     if (ids.size() == 1) {
-
+                        SRV_DBG("Preserved token: %d\n", ids[0]);
                         params.sampling.preserved_tokens.insert(ids[0]);
                     } else {
                         // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
-
+                        SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
                     }
                 }
             }
@@ -717,16 +718,26 @@ struct server_task_result_cmpl_final : server_task_result {
         std::string finish_reason = "length";
         common_chat_msg msg;
         if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
-
+            SRV_DBG("Parsing chat message: %s\n", content.c_str());
             msg = common_chat_parse(content, oaicompat_chat_format);
             finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
         } else {
             msg.content = content;
         }
 
-        json
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
         if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
             for (const auto & tc : msg.tool_calls) {
                 tool_calls.push_back({
                     {"type", "function"},
@@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"id", tc.id},
                 });
             }
-
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
+            message["tool_calls"] = tool_calls;
         }
 
         json choice {
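Note: the two hunks above rework how the final chat-completion message is assembled: `reasoning_content` is included when the parser extracted it, `content` becomes JSON null when the model produced only tool calls, and the `tool_plan` field is dropped. Below is a rough sketch of the resulting shape using nlohmann::json; it is illustrative only (the tool call id and sample strings are made up), not the server code.

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    std::string content;                                   // empty: the model emitted only a tool call
    std::string reasoning = "thinking about the request";  // made-up example value
    json tool_calls = json::array({
        {{"type", "function"}, {"function", {{"name", "get_weather"}, {"arguments", "{}"}}}, {"id", "call_0"}},
    });

    json message {
        {"role", "assistant"},
    };
    if (!reasoning.empty()) {
        message["reasoning_content"] = reasoning;
    }
    if (content.empty() && !tool_calls.empty()) {
        message["content"] = json();                       // serialized as null
    } else {
        message["content"] = content;
    }
    if (!tool_calls.empty()) {
        message["tool_calls"] = tool_calls;
    }

    std::cout << message.dump(2) << std::endl;
    return 0;
}
```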
@@ -1600,6 +1603,10 @@ struct server_queue {
 
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_tasks);
+            if (!running) {
+                QUE_DBG("%s", "terminate\n");
+                return;
+            }
             if (queue_tasks.empty()) {
                 lock.unlock();
                 break;
@@ -1620,11 +1627,11 @@ struct server_queue {
             QUE_DBG("%s", "waiting for new tasks\n");
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (!running) {
+                    QUE_DBG("%s", "terminate\n");
+                    return;
+                }
                 if (queue_tasks.empty()) {
-                    if (!running) {
-                        QUE_DBG("%s", "terminate\n");
-                        return;
-                    }
                     condition_tasks.wait(lock, [&]{
                         return (!queue_tasks.empty() || !running);
                     });
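Note: the two `server_queue` hunks above move the `running` check so it happens right after the lock is acquired, rather than only inside the empty-queue branch, so a pending terminate() is honored even while tasks are still queued. A simplified sketch of the pattern follows, with member names borrowed from the server but not the actual implementation.

```cpp
#include <condition_variable>
#include <deque>
#include <mutex>

struct task_queue {
    std::mutex              mutex_tasks;
    std::condition_variable condition_tasks;
    std::deque<int>         queue_tasks;
    bool                    running = true;

    void loop() {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            if (!running) {
                return;                           // honored even if tasks are still queued
            }
            if (queue_tasks.empty()) {
                condition_tasks.wait(lock, [&] {
                    return !queue_tasks.empty() || !running;
                });
            }
            // ... pop and process one task here ...
        }
    }

    void terminate() {
        {
            std::lock_guard<std::mutex> lock(mutex_tasks);
            running = false;
        }
        condition_tasks.notify_all();             // unblocks a loop() waiting on the condition
    }
};

int main() {
    task_queue q;
    q.terminate();   // with no worker thread running this just flips the flag
    q.loop();        // returns immediately because running == false
    return 0;
}
```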
@@ -1885,7 +1892,7 @@ struct server_context {
         }
 
         if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) {
-
+            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
             chat_templates = common_chat_templates_from_model(model, "chatml");
         } else {
             chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
@@ -2069,8 +2076,8 @@ struct server_context {
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
             slot.params.n_predict = slot.n_predict;
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
         }
 
         if (slot.params.ignore_eos && has_eos_token) {
@@ -2275,7 +2282,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
                 result.probs.push_back({
                     cur_p->data[i].id,
-
+                    common_token_to_piece(ctx, cur_p->data[i].id, special),
                     cur_p->data[i].p
                 });
             }
@@ -2297,7 +2304,7 @@ struct server_context {
             for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
                 result.probs.push_back({
                     cur[i].id,
-
+                    common_token_to_piece(ctx, cur[i].id, special),
                     cur[i].p
                 });
             }
@@ -3355,10 +3362,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
 
     // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
 
-
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
 
-
-
+    SRV_DBG("request: %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
 }
 
 std::function<void(int)> shutdown_handler;
@@ -3649,7 +3656,7 @@ int main(int argc, char ** argv) {
             }, {
                     {"name", "n_busy_slots_per_decode"},
                     {"help", "Average number of busy slots per llama_decode() call"},
-                    {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total}
+                    {"value", (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)}
             }}},
             {"gauge", {{
                     {"name", "prompt_tokens_seconds"},
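Note: the metrics change above clamps the denominator so `n_busy_slots_per_decode` stays finite (0 instead of NaN from 0/0) before any llama_decode() call has happened. A minimal illustration of the guard, not the server code:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    float n_busy_slots_total = 0.f;
    float n_decode_total     = 0.f;   // no llama_decode() calls yet
    float avg = n_busy_slots_total / std::max(n_decode_total, 1.f);
    std::printf("n_busy_slots_per_decode = %g\n", avg);   // prints 0 instead of NaN
    return 0;
}
```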
@@ -3860,7 +3867,9 @@ int main(int argc, char ** argv) {
 
         try {
             const auto & prompt = data.at("prompt");
-
+            // TODO: this log can become very long, put it behind a flag or think about a more compact format
+            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+
             std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4054,7 +4063,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4067,7 +4076,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
 
@@ -4376,6 +4385,9 @@ int main(int argc, char ** argv) {
                 res.set_content("Error: gzip is not supported by this browser", "text/plain");
             } else {
                 res.set_header("Content-Encoding", "gzip");
+                // COEP and COOP headers, required by pyodide (python interpreter)
+                res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                res.set_header("Cross-Origin-Opener-Policy", "same-origin");
                 res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
             }
             return false;
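Note: the added Cross-Origin-Embedder-Policy and Cross-Origin-Opener-Policy headers make the served Web UI cross-origin isolated, which pyodide (and anything else built on SharedArrayBuffer) requires. A standalone cpp-httplib sketch of the same idea follows; the route and payload are made up, only the two header values come from the diff.

```cpp
#include "httplib.h"   // cpp-httplib, the same single-header library bundled as examples/server/httplib.h

int main() {
    httplib::Server svr;

    svr.Get("/", [](const httplib::Request &, httplib::Response & res) {
        // both headers must be present for the page to become crossOriginIsolated
        res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
        res.set_header("Cross-Origin-Opener-Policy", "same-origin");
        res.set_content("<html><body>isolated page</body></html>", "text/html; charset=utf-8");
    });

    svr.listen("127.0.0.1", 8080);
    return 0;
}
```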
@@ -4425,6 +4437,7 @@ int main(int argc, char ** argv) {
 
     // clean up function, to be called before exit
     auto clean_up = [&svr]() {
+        SRV_INF("%s: cleaning up before exit...\n", __func__);
         svr->stop();
         llama_backend_free();
     };
@@ -4441,10 +4454,6 @@ int main(int argc, char ** argv) {
     }
 
     if (!was_bound) {
-        //LOG_ERROR("couldn't bind HTTP server socket", {
-        //    {"hostname", params.hostname},
-        //    {"port", params.port},
-        //});
         LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
         clean_up();
         return 1;
@@ -4461,7 +4470,7 @@ int main(int argc, char ** argv) {
 
     if (!ctx_server.load_model(params)) {
         clean_up();
-        t.join();
+        // t.join(); // FIXME: see below
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
         return 1;
     }
@@ -4485,13 +4494,10 @@ int main(int argc, char ** argv) {
     });
 
     shutdown_handler = [&](int) {
+        // this will unblock start_loop()
         ctx_server.queue_tasks.terminate();
     };
 
-    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
-
-    ctx_server.queue_tasks.start_loop();
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;
     sigint_action.sa_handler = signal_handler;
@@ -4506,8 +4512,13 @@ int main(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
+    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+    // this call blocks the main thread until queue_tasks.terminate() is called
+    ctx_server.queue_tasks.start_loop();
+
     clean_up();
-    t.join();
+    // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
 
     return 0;
 }
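Note: taken together, the hunks above reorder startup and shutdown: the SIGINT/CTRL-C handlers are installed before the blocking task loop starts, start_loop() now runs after the "server is listening" log, and clean_up() runs only once terminate() unblocks the loop (t.join() stays commented out with a FIXME). A stripped-down sketch of that control flow, illustrative only and not the server code:

```cpp
#include <atomic>
#include <csignal>
#include <cstdio>

static std::atomic<bool> running{true};

static void signal_handler(int) {
    running = false;                        // stands in for ctx_server.queue_tasks.terminate()
}

int main() {
    std::signal(SIGINT, signal_handler);    // handlers are registered before the loop starts
    std::printf("server is listening - starting the main loop\n");
    while (running) {
        // process queued tasks (stands in for ctx_server.queue_tasks.start_loop())
    }
    std::printf("cleaning up before exit...\n");  // clean_up() only runs after the loop unblocks
    return 0;
}
```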
package/src/llama.cpp/examples/server/utils.hpp

@@ -367,10 +367,10 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
                     }
                 }
             } else {
-                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/
+                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
             }
         } else {
-            throw std::runtime_error("Missing 'content' (ref: https://github.com/
+            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
         }
 
         chat.push_back({role, content, /* tool_calls= */ {}});
@@ -578,6 +578,7 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    common_reasoning_format reasoning_format,
     const common_chat_templates & chat_templates)
 {
     json llama_params;
@@ -633,9 +634,10 @@
             throw std::runtime_error("Cannot use custom grammar constraints with tools.");
         }
         common_chat_inputs inputs;
-        inputs.
-        inputs.
-        inputs.
+        inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
+        inputs.messages = body.at("messages");
+        inputs.tools = tools;
+        inputs.tool_choice = tool_choice;
         inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
         if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
             LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
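Note: the utils.hpp change above threads the new reasoning-format setting into the chat pipeline: any value other than COMMON_REASONING_FORMAT_NONE enables extraction of the model's reasoning into `reasoning_content`. A hedged sketch of that mapping; the DEEPSEEK enumerator is assumed here as a stand-in, check common/common.h for the real definition.

```cpp
// Illustrative only; mirrors "inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;"
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_DEEPSEEK,   // assumed member; verify against common/common.h
};

static bool should_extract_reasoning(common_reasoning_format fmt) {
    return fmt != COMMON_REASONING_FORMAT_NONE;
}

int main() {
    return should_extract_reasoning(COMMON_REASONING_FORMAT_DEEPSEEK) ? 0 : 1;
}
```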
package/src/llama.cpp/ggml/include/ggml-cpu.h

@@ -8,7 +8,7 @@ extern "C" {
 #endif
 
     // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/
+    // since https://github.com/ggml-org/ggml/issues/287
     struct ggml_cplan {
         size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
package/src/llama.cpp/ggml/include/ggml-metal.h

@@ -45,7 +45,7 @@ GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
 GGML_DEPRECATED(
     GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-    "obsoleted by the new device interface - https://github.com/
+    "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
 
 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
package/src/llama.cpp/ggml/include/ggml.h

@@ -198,7 +198,7 @@
 
 #ifndef __GNUC__
 #    define GGML_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
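Note: the ggml.h change above keeps the `gnu_printf` archetype for MinGW GCC while letting clang on MinGW fall through to plain `printf`, which clang understands. A small self-contained sketch of the same guard and of how such an attribute is typically applied; `my_log` is a hypothetical helper, not ggml API.

```cpp
#include <cstdarg>
#include <cstdio>

// mirrors the guard above, just under a different macro name
#ifndef __GNUC__
#    define MY_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__) && !defined(__clang__)
#    define MY_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#    define MY_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

MY_ATTRIBUTE_FORMAT(1, 2)
static void my_log(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);
}

int main() {
    my_log("loaded %d tensors\n", 42);          // OK
    // my_log("loaded %d tensors\n", "oops");   // would now trigger -Wformat on clang/MinGW too
    return 0;
}
```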
package/src/llama.cpp/ggml/src/ggml-common.h

@@ -473,7 +473,6 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
     240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 GGML_TABLE_END()
 
-//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
 GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
     0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
@@ -508,7 +507,6 @@ GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
     0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
     0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
 GGML_TABLE_END()
-//#endif
 
 
 GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h

@@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif
 
 #if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
 /* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-
-    return (__m128)
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    v4f32 res = {val, val, val, val};
+    return (__m128)res;
 }
 
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-
-    return (__m256)
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    v8f32 res = {val, val, val, val, val, val, val, val};
+    return (__m256)res;
 }
 
 #endif
 
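Note: the LoongArch helpers above now build the broadcast vector with a plain vector literal (v4f32 / v8f32) instead of round-tripping the float through the removed ft_union bit-cast. A portable illustration of the same splat-a-scalar idea in standard C++, without LoongArch intrinsics:

```cpp
#include <array>
#include <cstdio>

// stand-in for: v4f32 res = {val, val, val, val};
static std::array<float, 4> splat4(const float val) {
    return {val, val, val, val};
}

int main() {
    const auto v = splat4(1.5f);
    std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);   // 1.5 1.5 1.5 1.5
    return 0;
}
```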