@fugood/llama.node 1.1.11 → 1.2.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +18 -1
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +166 -396
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +50 -30
  27. package/src/llama.cpp/common/chat.cpp +111 -1
  28. package/src/llama.cpp/common/chat.h +3 -0
  29. package/src/llama.cpp/common/common.h +1 -1
  30. package/src/llama.cpp/common/log.cpp +53 -2
  31. package/src/llama.cpp/common/log.h +10 -4
  32. package/src/llama.cpp/common/sampling.cpp +23 -2
  33. package/src/llama.cpp/common/sampling.h +3 -1
  34. package/src/llama.cpp/common/speculative.cpp +1 -1
  35. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  36. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  37. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  38. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  39. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +14 -13
  40. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  41. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +0 -6
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  44. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -9
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +218 -4
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  47. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
  48. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
  49. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
  50. package/src/llama.cpp/include/llama.h +5 -6
  51. package/src/llama.cpp/src/llama-adapter.cpp +33 -0
  52. package/src/llama.cpp/src/llama-adapter.h +3 -0
  53. package/src/llama.cpp/src/llama-arch.cpp +27 -4
  54. package/src/llama.cpp/src/llama-arch.h +2 -0
  55. package/src/llama.cpp/src/llama-context.cpp +62 -56
  56. package/src/llama.cpp/src/llama-context.h +1 -1
  57. package/src/llama.cpp/src/llama-graph.cpp +54 -9
  58. package/src/llama.cpp/src/llama-graph.h +8 -0
  59. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  60. package/src/llama.cpp/src/llama-hparams.h +9 -3
  61. package/src/llama.cpp/src/llama-kv-cache.cpp +1 -23
  62. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  63. package/src/llama.cpp/src/llama-model.cpp +159 -1
  64. package/src/llama.cpp/src/llama-model.h +0 -1
  65. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  66. package/src/anyascii.c +0 -22223
  67. package/src/anyascii.h +0 -42
  68. package/src/tts_utils.cpp +0 -371
  69. package/src/tts_utils.h +0 -103
package/src/llama.cpp/common/chat.cpp

@@ -150,6 +150,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
     throw std::runtime_error("Invalid tool_choice: " + tool_choice);
 }
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+    common_chat_templates_inputs dummy_inputs;
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test";
+    dummy_inputs.messages = {msg};
+    dummy_inputs.enable_thinking = false;
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true;
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
 template <>
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
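The new helper renders the same one-message dummy conversation twice, once with enable_thinking = false and once with true, and reports whether the rendered prompts differ, i.e. whether the template actually reacts to the flag. A minimal caller-side sketch (illustrative only; it assumes a loaded llama_model * model and uses common_chat_templates_init() from chat.h):

    // Probe whether this model's chat template supports toggling reasoning.
    common_chat_templates_ptr tmpls = common_chat_templates_init(model, /* chat_template_override */ "");
    if (common_chat_templates_support_enable_thinking(tmpls.get())) {
        // the template changes its output when enable_thinking is toggled,
        // so an "enable thinking" option is meaningful for this model
    }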
@@ -610,6 +623,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -1170,6 +1184,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     return data;
 }
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "name",
+                            {
+                                { "type", "string" },
+                                { "const", function.at("name") },
+                            } },
+                          { "arguments", function.at("parameters") },
+                      } },
+                    { "required", json::array({ "name", "arguments" }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(</think>\\s*)" :
+                "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+            "(<TOOLCALL>)[\\s\\S]*" });
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
@@ -1816,7 +1891,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
             // If thinking_forced_open, then we capture the </think> tag in the grammar,
             // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
             std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
-                "(\\s*"
+                "\\s*("
                 "(?:<tool_call>"
                 "|<function"
                 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
@@ -2046,6 +2121,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            builder.add_tool_calls(tool_calls_data.json);
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2279,6 +2381,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_seed_oss(tmpl, params, inputs);
     }
 
+    // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+        return common_chat_params_init_nemotron_v2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2440,6 +2547,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SEED_OSS:
            common_chat_parse_seed_oss(builder);
            break;
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+            common_chat_parse_nemotron_v2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
package/src/llama.cpp/common/chat.h

@@ -123,6 +123,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
     COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -209,6 +210,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
package/src/llama.cpp/common/common.h

@@ -445,7 +445,7 @@ struct common_params {
 
     // "advanced" endpoints are disabled by default for better security
     bool webui            = true;
-    bool endpoint_slots   = false;
+    bool endpoint_slots   = true;
     bool endpoint_props   = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
package/src/llama.cpp/common/log.cpp

@@ -4,17 +4,52 @@
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <mutex>
 #include <sstream>
 #include <thread>
 #include <vector>
 
+#if defined(_WIN32)
+#    include <io.h>
+#    include <windows.h>
+#    define isatty _isatty
+#    define fileno _fileno
+#else
+#    include <unistd.h>
+#endif // defined(_WIN32)
+
 int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
 void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
+// Auto-detect if colors should be enabled based on terminal and environment
+static bool common_log_should_use_colors_auto() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
+
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
 
 struct common_log * common_log_main() {
     static struct common_log log;
+    static std::once_flag init_flag;
+    std::call_once(init_flag, [&]() {
+        // Set default to auto-detect colors
+        log.set_colors(common_log_should_use_colors_auto());
+    });
 
     return &log;
 }
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file)
     log->set_file(file);
 }
 
-void common_log_set_colors(struct common_log * log, bool colors) {
-    log->set_colors(colors);
+void common_log_set_colors(struct common_log * log, log_colors colors) {
+    if (colors == LOG_COLORS_AUTO) {
+        log->set_colors(common_log_should_use_colors_auto());
+        return;
+    }
+
+    if (colors == LOG_COLORS_DISABLED) {
+        log->set_colors(false);
+        return;
+    }
+
+    GGML_ASSERT(colors == LOG_COLORS_ENABLED);
+    log->set_colors(true);
 }
 
 void common_log_set_prefix(struct common_log * log, bool prefix) {
package/src/llama.cpp/common/log.h

@@ -24,6 +24,12 @@
 #define LOG_DEFAULT_DEBUG 1
 #define LOG_DEFAULT_LLAMA 0
 
+enum log_colors {
+    LOG_COLORS_AUTO = -1,
+    LOG_COLORS_DISABLED = 0,
+    LOG_COLORS_ENABLED = 1,
+};
+
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
 // set via common_log_set_verbosity()
 extern int common_log_verbosity_thold;
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
-void common_log_set_colors    (struct common_log * log, bool         colors);     // not thread-safe
-void common_log_set_prefix    (struct common_log * log, bool         prefix);     // whether to output prefix to each log
-void common_log_set_timestamps(struct common_log * log, bool         timestamps); // whether to output timestamps in the prefix
+void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
+void common_log_set_colors    (struct common_log * log, log_colors   colors);     // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool         prefix);     // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool         timestamps); // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
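With the parameter type changed from bool to log_colors, call sites choose between forcing colors on or off and the new auto-detection path (the NO_COLOR / TERM=dumb / isatty checks added in log.cpp above). A short usage sketch:

    // Respect the environment and terminal (also the default picked by common_log_main() now)
    common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);

    // Force colors off, e.g. when redirecting the log to a file
    common_log_set_file(common_log_main(), "llama.log");
    common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);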
package/src/llama.cpp/common/sampling.cpp

@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {
package/src/llama.cpp/common/sampling.h

@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
package/src/llama.cpp/common/speculative.cpp

@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
 
     common_sampler_sample(smpl, ctx_dft, 0, true);
 
-    const auto * cur_p = common_sampler_get_candidates(smpl);
+    const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
     for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
         LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -129,10 +129,11 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_RVV "ggml: enable rvv" ON)
-option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
+option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
+option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
+option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
-option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
package/src/llama.cpp/ggml/include/ggml-backend.h

@@ -307,6 +307,9 @@ extern "C" {
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
+    // Split graph without allocating it
+    GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
package/src/llama.cpp/ggml/include/ggml-cpu.h

@@ -101,7 +101,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe       (void);
-    GGML_BACKEND_API int ggml_cpu_has_nnpa      (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
 
package/src/llama.cpp/ggml/include/ggml.h

@@ -511,6 +511,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
+        GGML_OP_IM2COL_3D,
         GGML_OP_CONV_2D,
         GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
@@ -1870,6 +1871,41 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_im2col_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2, // dilation depth
+            enum ggml_type        dst_type);
+
+    // a: [OC*IC, KD, KH, KW]
+    // b: [N*IC, ID, IH, IW]
+    // result: [N*OC, OD, OH, OW]
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2  // dilation depth
+        );
+
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
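A rough graph-construction sketch for the new 3D convolution entry point (illustrative only: tensor sizes are invented, the dimension packing follows the header comments above with IC folded into the outermost dimension, and only graph construction is shown, not computation):

    #include "ggml.h"

    struct ggml_init_params ip = {
        /* .mem_size   = */ 64u * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    const int64_t IC = 4, OC = 8, N = 1;
    // kernel: ne = {KW, KH, KD, OC*IC} -- written [OC*IC, KD, KH, KW] in the header comment
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, OC * IC);
    // input:  ne = {IW, IH, ID, N*IC}
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 16, N * IC);

    // stride 1, padding 1, dilation 1 in all three spatial dimensions
    struct ggml_tensor * out = ggml_conv_3d(ctx, kernel, input, IC,
                                            1, 1, 1,   // s0, s1, s2
                                            1, 1, 1,   // p0, p1, p2
                                            1, 1, 1);  // d0, d1, d2
    // out: ne = {OW, OH, OD, N*OC}

    ggml_free(ctx);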
@@ -1941,7 +1977,7 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
-    GGML_API struct ggml_tensor * ggml_conv_3d(
+    GGML_API struct ggml_tensor * ggml_conv_3d_direct(
             struct ggml_context * ctx,
             struct ggml_tensor  * a, // kernel [KW, KH, KD, IC * OC]
             struct ggml_tensor  * b, // input [W, H, D, C * N]
@@ -2048,6 +2084,19 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    GGML_API struct ggml_tensor * ggml_pad_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   lp0,
+            int                   rp0,
+            int                   lp1,
+            int                   rp1,
+            int                   lp2,
+            int                   rp2,
+            int                   lp3,
+            int                   rp3
+        );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
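ggml_pad_ext generalizes ggml_pad with separate leading (lp*) and trailing (rp*) padding per dimension; the existing ggml_pad corresponds to all lp* being zero. A small sketch, reusing a ggml_context * ctx as in the previous example (the left/right meaning of lp/rp is inferred from the parameter names, so treat it as an assumption):

    // t: ne = {8, 4, 1, 1}
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);

    // pad dim 0 with 2 elements in front and 1 behind, dim 1 with 1 behind:
    // result ne = {2 + 8 + 1, 0 + 4 + 1, 1, 1} = {11, 5, 1, 1}
    struct ggml_tensor * padded = ggml_pad_ext(ctx, t,
                                               2, 1,   // lp0, rp0
                                               0, 1,   // lp1, rp1
                                               0, 0,   // lp2, rp2
                                               0, 0);  // lp3, rp3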
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt

@@ -433,15 +433,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             ggml-cpu/arch/riscv/quants.c
             ggml-cpu/arch/riscv/repack.cpp
             )
-        if (GGML_RVV)
-            if (GGML_XTHEADVECTOR)
-                list(APPEND ARCH_FLAGS -march=rv64gc_zfhmin_xtheadvector -mabi=lp64d)
-            elseif (GGML_RV_ZFH)
-                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
-            else()
-                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+        set(MARCH_STR "rv64gc")
+        if (GGML_RV_ZFH)
+            string(APPEND MARCH_STR "_zfh")
+        endif()
+        if (GGML_XTHEADVECTOR)
+            string(APPEND MARCH_STR "_xtheadvector")
+        elseif (GGML_RVV)
+            string(APPEND MARCH_STR "_v")
+            if (GGML_RV_ZVFH)
+                string(APPEND MARCH_STR "_zvfh")
             endif()
         endif()
+        if (GGML_RV_ZICBOP)
+            string(APPEND MARCH_STR "_zicbop")
+        endif()
+        list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         message(STATUS "s390x detected")
         list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
@@ -450,7 +457,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
-            set(GGML_NNPA OFF)
             message(STATUS "z15 target")
             list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
@@ -472,11 +478,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS -mvx -mzvector)
             list(APPEND ARCH_DEFINITIONS GGML_VXE)
         endif()
-
-        if (GGML_NNPA)
-            message(STATUS "NNPA enabled")
-            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
-        endif()
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
         list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)