@fugood/llama.node 0.3.9 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +47 -8
  19. package/lib/index.js +21 -1
  20. package/lib/index.ts +31 -1
  21. package/package.json +12 -3
  22. package/src/LlamaCompletionWorker.cpp +33 -6
  23. package/src/LlamaCompletionWorker.h +3 -1
  24. package/src/LlamaContext.cpp +336 -28
  25. package/src/LlamaContext.h +2 -0
  26. package/src/common.hpp +19 -2
  27. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  29. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  31. package/src/llama.cpp/CMakeLists.txt +10 -19
  32. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  33. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  34. package/src/llama.cpp/common/arg.cpp +66 -16
  35. package/src/llama.cpp/common/chat-template.hpp +515 -0
  36. package/src/llama.cpp/common/chat.cpp +966 -0
  37. package/src/llama.cpp/common/chat.hpp +52 -0
  38. package/src/llama.cpp/common/common.cpp +159 -36
  39. package/src/llama.cpp/common/common.h +56 -14
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  41. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  42. package/src/llama.cpp/common/llguidance.cpp +270 -0
  43. package/src/llama.cpp/common/log.cpp +1 -10
  44. package/src/llama.cpp/common/log.h +10 -0
  45. package/src/llama.cpp/common/minja.hpp +2868 -0
  46. package/src/llama.cpp/common/sampling.cpp +22 -1
  47. package/src/llama.cpp/common/sampling.h +3 -0
  48. package/src/llama.cpp/docs/build.md +54 -9
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  50. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  51. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  52. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  54. package/src/llama.cpp/examples/llava/clip.h +2 -0
  55. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  56. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  57. package/src/llama.cpp/examples/main/main.cpp +26 -25
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  59. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  60. package/src/llama.cpp/examples/run/run.cpp +224 -69
  61. package/src/llama.cpp/examples/server/server.cpp +252 -81
  62. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  63. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  64. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  65. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  66. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  67. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  68. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  71. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  73. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  74. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  75. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  77. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  82. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  83. package/src/llama.cpp/include/llama.h +14 -1
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  85. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  86. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  87. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  88. package/src/llama.cpp/src/llama-arch.h +3 -1
  89. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  90. package/src/llama.cpp/src/llama-chat.h +1 -0
  91. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  92. package/src/llama.cpp/src/llama-grammar.h +22 -1
  93. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  95. package/src/llama.cpp/src/llama-model.cpp +76 -6
  96. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  97. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  98. package/src/llama.cpp/src/llama.cpp +181 -123
  99. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  100. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  101. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  102. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  103. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  104. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  105. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  106. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
package/src/llama.cpp/examples/server/utils.hpp
@@ -5,10 +5,6 @@
 #include "llama.h"
 #include "common/base64.hpp"
 
-#ifndef NDEBUG
-// crash the server in debug mode, otherwise send an http 500 error
-#define CPPHTTPLIB_NO_EXCEPTIONS 1
-#endif
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
@@ -16,6 +12,9 @@
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
+#include "minja.hpp"
+#include "chat.hpp"
+#include "chat-template.hpp"
 
 #include <random>
 #include <sstream>
@@ -349,7 +348,7 @@ static llama_tokens format_infill(
 }
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
+inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
     std::vector<common_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
@@ -374,10 +373,10 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
             throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
         }
 
-        chat.push_back({role, content});
+        chat.push_back({role, content, /* tool_calls= */ {}});
     }
 
-    const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
+    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
     LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
 
     return formatted_chat;
@@ -576,14 +575,32 @@ static json oaicompat_completion_params_parse(const json & body) {
     return llama_params;
 }
 
-static json oaicompat_chat_completion_params_parse(
-    const struct llama_model * model,
-    const json & body, /* openai api json semantics */
-    const std::string & chat_template) {
+static json oaicompat_completion_params_parse(
+    const json & body, /* openai api json semantics */
+    bool use_jinja,
+    const common_chat_templates & chat_templates)
+{
     json llama_params;
+    const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use
+        ? *chat_templates.template_tool_use
+        : *chat_templates.template_default;
 
-    // Apply chat template to the list of messages
-    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
+    auto tools = json_value(body, "tools", json());
+    auto stream = json_value(body, "stream", false);
+
+    if (tools.is_array() && !tools.empty()) {
+        if (stream) {
+            throw std::runtime_error("Cannot use tools with stream");
+        }
+        if (!use_jinja) {
+            throw std::runtime_error("tools param requires --jinja flag");
+        }
+    }
+    if (!use_jinja) {
+        if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
+            throw std::runtime_error("Unsupported param: tool_choice");
+        }
+    }
 
     // Handle "stop" field
     if (body.contains("stop") && body.at("stop").is_string()) {
@@ -606,6 +623,49 @@ static json oaicompat_chat_completion_params_parse(
         }
     }
 
+    // Apply chat template to the list of messages
+    if (use_jinja) {
+        auto tool_choice = json_value(body, "tool_choice", std::string("auto"));
+        if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") {
+            throw std::runtime_error("Invalid tool_choice: " + tool_choice);
+        }
+        if (tool_choice != "none" && llama_params.contains("grammar")) {
+            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+        }
+        common_chat_inputs inputs;
+        inputs.messages = body.at("messages");
+        inputs.tools = tools;
+        inputs.tool_choice = tool_choice;
+        inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+        if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) {
+            LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n");
+            inputs.parallel_tool_calls = false;
+        }
+        inputs.stream = stream;
+        // TODO: support mixing schema w/ tools beyond generic format.
+        inputs.json_schema = json_value(llama_params, "json_schema", json());
+        auto chat_params = common_chat_params_init(tmpl, inputs);
+
+        llama_params["chat_format"] = static_cast<int>(chat_params.format);
+        llama_params["prompt"] = chat_params.prompt;
+        llama_params["grammar"] = chat_params.grammar;
+        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+        auto grammar_triggers = json::array();
+        for (const auto & trigger : chat_params.grammar_triggers) {
+            grammar_triggers.push_back({
+                {"word", trigger.word},
+                {"at_start", trigger.at_start},
+            });
+        }
+        llama_params["grammar_triggers"] = grammar_triggers;
+        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+        for (const auto & stop : chat_params.additional_stops) {
+            llama_params["stop"].push_back(stop);
+        }
+    } else {
+        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    }
+
     // Handle "n" field
     int n_choices = json_value(body, "n", 1);
     if (n_choices != 1) {
@@ -620,14 +680,6 @@ static json oaicompat_chat_completion_params_parse(
         throw std::runtime_error("top_logprobs requires logprobs to be set to true");
     }
 
-    // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
-    for (const auto & param : unsupported_params) {
-        if (body.contains(param)) {
-            throw std::runtime_error("Unsupported param: " + param);
-        }
-    }
-
     // Copy remaining properties to llama_params
     // This allows user to use llama.cpp-specific params like "mirostat", ... via OAI endpoint.
     // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp

package/src/llama.cpp/examples/simple-chat/simple-chat.cpp
@@ -95,13 +95,15 @@ int main(int argc, char ** argv) {
     llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
 
     // helper function to evaluate a prompt and generate a response
-    auto generate = [&](const std::string & prompt, bool is_first) {
+    auto generate = [&](const std::string & prompt) {
         std::string response;
 
+        const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
+
         // tokenize the prompt
        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
         std::vector<llama_token> prompt_tokens(n_prompt_tokens);
-        if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
+        if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0) {
             GGML_ABORT("failed to tokenize the prompt\n");
         }
 
@@ -161,7 +163,7 @@ int main(int argc, char ** argv) {
             break;
         }
 
-        const char * tmpl = llama_model_chat_template(model);
+        const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
 
         // add the user input to the message list and format it
         messages.push_back({"user", strdup(user.c_str())});
@@ -180,7 +182,7 @@ int main(int argc, char ** argv) {
 
         // generate a response
         printf("\033[33m");
-        std::string response = generate(prompt, prev_len == 0);
+        std::string response = generate(prompt);
         printf("\n\033[0m");
 
         // add the response to the messages
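The simple-chat change above derives is_first from the KV cache inside generate() instead of threading it through the call site. A condensed sketch of the same idiom, assuming ctx and vocab were created as in simple-chat.cpp (the helper name is ours):

    #include <string>
    #include <vector>
    #include "llama.h"

    std::vector<llama_token> tokenize_prompt(llama_context * ctx, const llama_vocab * vocab,
                                             const std::string & prompt) {
        // add BOS/special prefix tokens only when the KV cache is still empty
        const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;

        // a NULL buffer makes llama_tokenize return the negated token count
        const int n_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(),
                                             NULL, 0, is_first, true);
        std::vector<llama_token> tokens(n_tokens);
        if (llama_tokenize(vocab, prompt.c_str(), prompt.size(),
                           tokens.data(), tokens.size(), is_first, true) < 0) {
            return {};  // tokenization failed
        }
        return tokens;
    }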

package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt
@@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.12)
+project(llama-simple-cmake-pkg)
+
+set(TARGET llama-simple-cmake-pkg)
+
+find_package(Llama REQUIRED)
+
+add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../simple/simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ggml::all ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/ggml/CMakeLists.txt
@@ -58,7 +58,8 @@ else()
     set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()
 
-if (CMAKE_CROSSCOMPILING)
+if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
+    message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
     set(GGML_NATIVE_DEFAULT OFF)
 else()
     set(GGML_NATIVE_DEFAULT ON)
@@ -153,6 +154,8 @@ option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashA
 option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
 
 option(GGML_HIP "ggml: use HIP" OFF)
+option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
+option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
@@ -264,3 +267,77 @@ if (GGML_STANDALONE)
     install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
             DESTINATION share/pkgconfig)
 endif()
+
+#
+# Create CMake package
+#
+
+# Generate version info based on git commit.
+
+if(NOT DEFINED GGML_BUILD_NUMBER)
+    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
+    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_NUMBER
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    if(GGML_BUILD_NUMBER EQUAL 1)
+        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
+    endif()
+
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+endif()
+
+
+# Capture variables prefixed with GGML_.
+
+set(variable_set_statements
+"
+####### Expanded from @GGML_VARIABLES_EXPANED@ by configure_package_config_file() #######
+####### Any changes to this file will be overwritten by the next CMake run        #######
+
+")
+
+set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})
+
+get_cmake_property(all_variables VARIABLES)
+foreach(variable_name IN LISTS all_variables)
+    if(variable_name MATCHES "^GGML_")
+        string(REPLACE ";" "\\;"
+               variable_value "${${variable_name}}")
+
+        set(variable_set_statements
+            "${variable_set_statements}set(${variable_name} \"${variable_value}\")\n")
+    endif()
+endforeach()
+
+set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
+
+# Create the CMake package and set install location.
+
+set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
+set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
+set(GGML_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
+set(GGML_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary files")
+
+configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml
+    PATH_VARS GGML_INCLUDE_INSTALL_DIR
+              GGML_LIB_INSTALL_DIR
+              GGML_BIN_INSTALL_DIR)
+
+write_basic_package_version_file(
+        ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
+    VERSION ${GGML_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)

package/src/llama.cpp/ggml/include/ggml.h
@@ -1775,7 +1775,7 @@ extern "C" {
             struct ggml_tensor  * a,
             int                   k);
 
-#define GGML_KQ_MASK_PAD 32
+#define GGML_KQ_MASK_PAD 64
 
     // q:    [n_embd, n_batch,     n_head,    1]
     // k:    [n_embd, n_kv,        n_head_kv, 1]
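GGML_KQ_MASK_PAD doubles from 32 to 64, so code that rounds the flash-attention mask's batch dimension up with GGML_PAD now pads to a larger multiple. The macro below mirrors the one already defined in ggml.h; the numbers are only an illustration.

    // GGML_PAD as defined in ggml.h: 70 rows pad to 96 with the old value, 128 with the new one.
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    static_assert(GGML_PAD(70, 32) ==  96, "old padding multiple");
    static_assert(GGML_PAD(70, 64) == 128, "new padding multiple");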

package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -93,12 +93,18 @@ endif()
 
 if (GGML_CCACHE)
     find_program(GGML_CCACHE_FOUND ccache)
+    find_program(GGML_SCCACHE_FOUND sccache)
 
-    if (GGML_CCACHE_FOUND)
+    if (GGML_CCACHE_FOUND OR GGML_SCCACHE_FOUND)
+        if(GGML_CCACHE_FOUND)
+            set(GGML_CCACHE_VARIANT ccache)
+        else()
+            set(GGML_CCACHE_VARIANT sccache)
+        endif()
         # TODO: should not be set globally
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
         set(ENV{CCACHE_SLOPPINESS} time_macros)
-        message(STATUS "ccache found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
+        message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
     else()
         message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF")
     endif ()
@@ -250,6 +256,17 @@ function(ggml_add_backend_library backend)
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
         target_compile_definitions(${backend} PUBLIC  GGML_BACKEND_SHARED)
     endif()
+
+    if(NOT GGML_AVAILABLE_BACKENDS)
+        set(GGML_AVAILABLE_BACKENDS "${backend}"
+            CACHE INTERNAL "List of backends for cmake package")
+    else()
+        list(FIND GGML_AVAILABLE_BACKENDS "${backend}" has_backend)
+        if(has_backend EQUAL -1)
+            set(GGML_AVAILABLE_BACKENDS "${GGML_AVAILABLE_BACKENDS};${backend}"
+                CACHE INTERNAL "List of backends for cmake package")
+        endif()
+    endif()
 endfunction()
 
 function(ggml_add_backend backend)
@@ -297,7 +314,7 @@ if (GGML_CPU_ALL_VARIANTS)
             # MSVC doesn't support AMX
             ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
         endif()
-    else ()
+    elseif (GGML_CPU)
         ggml_add_cpu_backend_variant_impl("")
     endif()
 

package/src/llama.cpp/ggml/src/ggml-alloc.c
@@ -989,19 +989,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         }
 
-        if (this_size > max_size) {
-            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
-
-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
            // allocate tensors in the current buffer
            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                return NULL;
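With the added cur_buf_size > 0 guard, a tensor larger than the backend's maximum buffer size is no longer rejected up front; it simply ends up in a buffer of its own, which the backend may still fail to allocate. A standalone sketch of the same partitioning rule (not ggml code):

    #include <cstdint>
    #include <vector>

    // Split tensor sizes into buffers of at most max_size. An entry larger than
    // max_size gets a buffer of its own instead of aborting the whole allocation.
    std::vector<std::vector<uint64_t>> partition_sizes(const std::vector<uint64_t> & sizes,
                                                       uint64_t max_size) {
        std::vector<std::vector<uint64_t>> buffers;
        std::vector<uint64_t> cur_tensors;
        uint64_t cur_buf_size = 0;
        for (uint64_t sz : sizes) {
            if (cur_buf_size > 0 && cur_buf_size + sz > max_size) { // flush only non-empty ranges
                buffers.push_back(cur_tensors);
                cur_tensors.clear();
                cur_buf_size = 0;
            }
            cur_tensors.push_back(sz);
            cur_buf_size += sz;
        }
        if (!cur_tensors.empty()) {
            buffers.push_back(cur_tensors);
        }
        return buffers;
    }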

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -297,6 +297,90 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
 #endif
 
+#if defined(__loongarch_sx)
+
+static __m128i lsx_packs_w(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_w(a, 15);
+    tmp1 = __lsx_vsat_w(b, 15);
+    return __lsx_vpickev_h(tmp1, tmp);
+}
+
+static __m128i lsx_packs_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_h(a, 7);
+    tmp1 = __lsx_vsat_h(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_packus_h(__m128i a, __m128i b) {
+    __m128i tmp, tmp1;
+    tmp = __lsx_vsat_hu(a, 7);
+    tmp1 = __lsx_vsat_hu(b, 7);
+    return __lsx_vpickev_b(tmp1, tmp);
+}
+
+static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_h_b(a, b);
+    tmp2 = __lsx_vmulwod_h_b(a, b);
+    return __lsx_vsadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_madd_h(__m128i a, __m128i b) {
+    __m128i tmp1, tmp2;
+    tmp1 = __lsx_vmulwev_w_h(a, b);
+    tmp2 = __lsx_vmulwod_w_h(a, b);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
+    v4i32 __ret = {d, c, b, a};
+    return (__m128i)__ret;
+}
+
+static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
+    __m128i mask_f, zero, tmp0, tmp2, mask;
+    int f = 0x8f;
+    mask_f = __lsx_vreplgr2vr_b(f);
+    zero = __lsx_vldi(0);
+    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
+    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
+    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
+    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
+    return __lsx_vshuf_b(a, zero, tmp2);
+}
+
+static __m128i lsx_hadd_h(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_h(b, a);
+    __m128i tmp2 = __lsx_vpickod_h(b, a);
+    return __lsx_vadd_h(tmp1, tmp2);
+}
+
+static __m128i lsx_hadd_w(__m128i a, __m128i b) {
+    __m128i tmp1 = __lsx_vpickev_w(b, a);
+    __m128i tmp2 = __lsx_vpickod_w(b, a);
+    return __lsx_vadd_w(tmp1, tmp2);
+}
+
+static __m128 lsx_hadd_s(__m128 a, __m128 b) {
+    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
+    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
+
+    return __lsx_vfadd_s(tmp1, tmp2);
+}
+
+static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
+    __m128 res_0 =lsx_hadd_s(a, b);
+    __m128 res_1 =lsx_hadd_s(c, d);
+    __m128 res =lsx_hadd_s(res_0, res_1);
+    res =lsx_hadd_s(res, res);
+    res =lsx_hadd_s(res, res);
+
+    return ((v4f32)res)[0];
+}
+#endif
+
 #if defined(__loongarch_asx)
 
 #ifdef __clang__
@@ -395,11 +479,6 @@ static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1
     return (__m256i)__ret;
 }
 
-static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
-    v4i32 __ret = {d, c, b, a};
-    return (__m128i)__ret;
-}
-
 static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
     v4i64 __ret = {d, c, b, a};
     return (__m256i)__ret;
@@ -409,18 +488,6 @@ static __m256i lasx_insertf128( __m128i x, __m128i y) {
     return lasx_set_q(x, y);
 }
 
-static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
-    __m128i mask_f, zero, tmp0, tmp2, mask;
-    int f = 0x8f;
-    mask_f = __lsx_vreplgr2vr_b(f);
-    zero = __lsx_vldi(0);
-    tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
-    tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
-    mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
-    tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
-    return __lsx_vshuf_b(a, zero, tmp2);
-}
-
 static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
     __m256i mask_f, zero, tmp0, tmp2, mask;
     int f = 0x8f;
@@ -482,25 +549,6 @@ static __m128 lasx_extractf128( __m256 a, int pos) {
     return ret;
 }
 
-static __m128i lsx_hadd_h(__m128i a, __m128i b) {
-    __m128i tmp1 = __lsx_vpickev_h(b, a);
-    __m128i tmp2 = __lsx_vpickod_h(b, a);
-    return __lsx_vadd_h(tmp1, tmp2);
-}
-
-static __m128i lsx_hadd_w(__m128i a, __m128i b) {
-    __m128i tmp1 = __lsx_vpickev_w(b, a);
-    __m128i tmp2 = __lsx_vpickod_w(b, a);
-    return __lsx_vadd_w(tmp1, tmp2);
-}
-
-static __m128 lsx_hadd_s(__m128 a, __m128 b) {
-    __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
-    __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
-
-    return __lsx_vfadd_s(tmp1, tmp2);
-}
-
 static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
     __m256i tmp1, tmp2;
     tmp1 = __lasx_xvmulwev_h_b(a, b);
@@ -529,42 +577,6 @@ static __m256i lasx_packs_h(__m256i a, __m256i b) {
     return __lasx_xvpickev_b(tmp1, tmp);
 }
 
-static __m128i lsx_packs_w(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_w(a, 15);
-    tmp1 = __lsx_vsat_w(b, 15);
-    return __lsx_vpickev_h(tmp1, tmp);
-}
-
-static __m128i lsx_packs_h(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_h(a, 7);
-    tmp1 = __lsx_vsat_h(b, 7);
-    return __lsx_vpickev_b(tmp1, tmp);
-}
-
-static __m128i lsx_packus_h(__m128i a, __m128i b) {
-    __m128i tmp, tmp1;
-    tmp = __lsx_vsat_hu(a, 7);
-    tmp1 = __lsx_vsat_hu(b, 7);
-    return __lsx_vpickev_b(tmp1, tmp);
-}
-
-
-static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
-    __m128i tmp1, tmp2;
-    tmp1 = __lsx_vmulwev_h_b(a, b);
-    tmp2 = __lsx_vmulwod_h_b(a, b);
-    return __lsx_vsadd_h(tmp1, tmp2);
-}
-
-static __m128i lsx_madd_h(__m128i a, __m128i b) {
-    __m128i tmp1, tmp2;
-    tmp1 = __lsx_vmulwev_w_h(a, b);
-    tmp2 = __lsx_vmulwod_w_h(a, b);
-    return __lsx_vadd_w(tmp1, tmp2);
-}
-
 // multiply int8_t, add results pairwise twice
 static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
     // Get absolute values of x vectors
@@ -2232,21 +2244,22 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_8(acc);
+
 #elif defined(__loongarch_sx)
     // set constants
     const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
     const __m128i off = __lsx_vreplgr2vr_b(8);
 
     // Initialize accumulator with zeros
-    __m128 acc_0 = __lsx_vldi(0);
-    __m128 acc_1 = __lsx_vldi(0);
-    __m128 acc_2 = __lsx_vldi(0);
-    __m128 acc_3 = __lsx_vldi(0);
+    __m128 acc_0 = (__m128)__lsx_vldi(0);
+    __m128 acc_1 = (__m128)__lsx_vldi(0);
+    __m128 acc_2 = (__m128)__lsx_vldi(0);
+    __m128 acc_3 = (__m128)__lsx_vldi(0);
 
     for (; ib + 1 < nb; ib += 2) {
 
         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
 
         const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
 
@@ -2264,7 +2277,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
 
         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = __lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
 
         const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
 

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1302,7 +1302,7 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;  // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
-    atomic_bool abort; // Used for aborting processing of a graph
+    atomic_int  abort; // Used for aborting processing of a graph
 
     struct ggml_compute_state * workers; // per thread state
     int n_threads_max; // number of threads in the pool
@@ -7883,7 +7883,7 @@ static void ggml_compute_forward_out_prod_f32(
 
             float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
             float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-            float * d  = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
+            float * d  = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
             ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
         }
@@ -7892,7 +7892,7 @@ static void ggml_compute_forward_out_prod_f32(
 
         float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
         float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-        float * d  = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
+        float * d  = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
         ggml_vec_mad_f32(ne0, d, s0, *s1);
     }
@@ -13851,14 +13851,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         /*.threadpool=*/ tp,
     };
 
-    for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
+    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
         ggml_compute_forward(&params, node);
 
        if (state->ith == 0 && cplan->abort_callback &&
                cplan->abort_callback(cplan->abort_callback_data)) {
-            tp->abort = true;
+            atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
             tp->ec    = GGML_STATUS_ABORTED;
         }
 
@@ -14031,7 +14031,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->current_chunk    = 0;
     threadpool->stop             = false;
     threadpool->pause            = tpp->paused;
-    threadpool->abort            = false;
+    threadpool->abort            = -1;
     threadpool->workers          = NULL;
     threadpool->n_threads_max    = tpp->n_threads;
     threadpool->n_threads_cur    = tpp->n_threads;
@@ -14110,7 +14110,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         threadpool->cgraph           = cgraph;
         threadpool->cplan            = cplan;
         threadpool->current_chunk    = 0;
-        threadpool->abort            = false;
+        threadpool->abort            = -1;
         threadpool->ec               = GGML_STATUS_SUCCESS;
     }
 
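The threadpool abort flag becomes an index instead of a bool: -1 means keep going, and on abort the index of the next node is stored, so the `!= node_n` loop condition stops every worker at the same graph node. A minimal illustration of that scheme using std::atomic (the real code above uses C11 atomics):

    #include <atomic>

    std::atomic<int> abort_at{-1};  // -1: no abort requested

    void compute_graph(int n_nodes) {
        for (int node_n = 0;
             node_n < n_nodes && abort_at.load(std::memory_order_relaxed) != node_n;
             node_n++) {
            // ... compute node node_n ...
            bool abort_requested = false;   // stand-in for cplan->abort_callback(...)
            if (abort_requested) {
                // store the index of the next node: each worker's loop condition
                // fails once it reaches that node, so all stop at the same boundary
                abort_at.store(node_n + 1, std::memory_order_relaxed);
            }
        }
    }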

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -416,7 +416,8 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         case GGML_OP_IM2COL_BACK:
             return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
         case GGML_OP_OUT_PROD:
-            return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
+            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
+                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
         default:
             return true;
     }

package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt
@@ -28,7 +28,7 @@ if (CUDAToolkit_FOUND)
     list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
 
     file(GLOB   GGML_SOURCES_CUDA "*.cu")
-    file(GLOB   SRCS "template-instances/fattn-wmma*.cu")
+    file(GLOB   SRCS "template-instances/fattn-mma*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
     file(GLOB   SRCS "template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})