@fugood/llama.node 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1727,10 +1727,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
+    const std::optional<json> tools_override = json();
+    const std::optional<json> additional_context = json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
-    });
+    };
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         data.grammar = build_grammar([&](const common_grammar_builder & builder) {
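Note on the hunk above: apply() takes std::optional<json> parameters, and the old call packed a braced json temporary plus inline comments into one long line. A minimal sketch of the naming idiom (apply_demo is a hypothetical stand-in, not the real apply() signature):

#include <optional>
#include <string>

// Hypothetical stand-in for apply(): two same-typed optional parameters.
static void apply_demo(const std::optional<std::string> & tools_override,
                       const std::optional<std::string> & additional_context) {
    (void) tools_override; (void) additional_context;
}

int main() {
    // Before: positional temporaries, labeled only by comments.
    apply_demo(/* tools_override= */ std::string("{}"), std::string("{...}"));
    // After: named locals document intent at the call site.
    const std::optional<std::string> tools_override      = std::string("{}");
    const std::optional<std::string> additional_context  = std::string("{...}");
    apply_demo(tools_override, additional_context);
}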
@@ -2216,15 +2218,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
 
 static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     // Parse thinking tags
+    static const common_regex start_think_regex(regex_escape("<think>"));
+    static const common_regex end_think_regex(regex_escape("</think>"));
+    // Granite models output partial tokens such as "<" and "<think".
+    // By leveraging try_consume_regex()/try_find_regex() throwing
+    // common_chat_msg_partial_exception for these partial tokens,
+    // processing is interrupted and the tokens are not passed to add_content().
+    if (auto res = builder.try_consume_regex(start_think_regex)) {
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+        builder.try_find_regex(end_think_regex, std::string::npos, false);
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+    }
     builder.try_parse_reasoning("<think>", "</think>");
 
-    // Parse response tags using regex
-    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
-    if (auto res = builder.try_find_regex(response_regex)) {
-        // Extract the content between the tags (capture group 1)
-        auto content = builder.str(res->groups[1]);
-        builder.add_content(content);
-        builder.move_to(res->groups[0].end);
+    // Parse response tags
+    static const common_regex start_response_regex(regex_escape("<response>"));
+    static const common_regex end_response_regex(regex_escape("</response>"));
+    // Granite models output partial tokens such as "<" and "<response".
+    // Same hack as reasoning parsing.
+    if (builder.try_consume_regex(start_response_regex)) {
+        builder.try_find_regex(end_response_regex);
     }
 
     if (!builder.syntax().parse_tool_calls) {
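The comments in this hunk capture the streaming constraint: the tail of the buffer may hold an incomplete tag such as "<" or "<thin", which must not be flushed as content yet. A self-contained sketch of the underlying prefix test (hypothetical helper, not part of llama.cpp):

#include <string>

// True if `buffer` ends with a proper, non-empty prefix of `tag`,
// i.e. a tag that may still be completed by future tokens.
static bool ends_with_partial_tag(const std::string & buffer, const std::string & tag) {
    if (tag.empty()) {
        return false;
    }
    for (size_t len = tag.size() - 1; len > 0; --len) {
        if (buffer.size() >= len &&
            buffer.compare(buffer.size() - len, len, tag, 0, len) == 0) {
            return true;
        }
    }
    return false;
}

// ends_with_partial_tag("Let me <thin", "<think>") == true, so a streaming
// parser should wait for more tokens rather than call add_content(); in the
// hunk above, try_consume_regex()/try_find_regex() signal the same condition
// by throwing common_chat_msg_partial_exception.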
@@ -2238,13 +2253,10 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
         builder.move_to(res->groups[0].end);
 
         // Expect JSON array of tool calls
-        auto tool_calls_data = builder.consume_json();
-        if (tool_calls_data.json.is_array()) {
-            if (!builder.add_tool_calls(tool_calls_data.json)) {
-                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
+            if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
             }
-        } else {
-            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
         }
     } else {
         builder.add_content(builder.consume_rest());
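One detail worth noting in the replacement: the {{{"arguments"}}} argument names a path whose value should be re-dumped as a string. OpenAI-style tool calls carry "arguments" as a JSON-encoded string, not a nested object. A sketch of that normalization with nlohmann::json (presumably the `json` alias used in this file); dump_args is an invented name:

#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json;

// Ensure the "arguments" field is a JSON-encoded string.
static json dump_args(json call) {
    if (call.contains("arguments") && !call["arguments"].is_string()) {
        call["arguments"] = call["arguments"].dump();
    }
    return call;
}

// dump_args({{"name", "get_time"}, {"arguments", {{"tz", "UTC"}}}})
//   -> {"name":"get_time","arguments":"{\"tz\":\"UTC\"}"}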
@@ -193,10 +193,11 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path    = ""; // model local path           // NOLINT
-    std::string url     = ""; // model url to download      // NOLINT
-    std::string hf_repo = ""; // HF repo                    // NOLINT
-    std::string hf_file = ""; // HF file                    // NOLINT
+    std::string path        = ""; // model local path       // NOLINT
+    std::string url         = ""; // model url to download  // NOLINT
+    std::string hf_repo     = ""; // HF repo                // NOLINT
+    std::string hf_file     = ""; // HF file                // NOLINT
+    std::string docker_repo = ""; // Docker repo            // NOLINT
 };
 
 struct common_params_speculative {
@@ -288,9 +289,9 @@ struct common_params {
     float rope_freq_base   = 0.0f;  // RoPE base frequency
     float rope_freq_scale  = 0.0f;  // RoPE frequency scaling factor
     float yarn_ext_factor  = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor = 1.0f;  // YaRN magnitude scaling factor
-    float yarn_beta_fast   = 32.0f; // YaRN low correction dim
-    float yarn_beta_slow   = 1.0f;  // YaRN high correction dim
+    float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast   = -1.0f; // YaRN low correction dim
+    float yarn_beta_slow   = -1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;     // YaRN original context length
 
     // offload params
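The three YaRN defaults move from concrete values to -1.0f, matching yarn_ext_factor above; in this codebase a negative value conventionally means "unset", letting the loader fall back to the hyperparameters stored in the model file. A sketch of that resolution (hypothetical helper, not the actual loader code):

#include <cstdio>

// Prefer an explicit user override; otherwise use the model's own value.
static float resolve_yarn_param(float user_value, float model_default) {
    return user_value < 0.0f ? model_default : user_value;
}

int main() {
    // resolve_yarn_param(params.yarn_beta_fast, 32.0f) reproduces the old
    // default when neither the user nor the model specifies anything.
    std::printf("%.1f\n", resolve_yarn_param(-1.0f, 32.0f)); // 32.0
}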
@@ -453,7 +454,7 @@ struct common_params {
 
     std::string slot_save_path;
 
-    float slot_prompt_similarity = 0.5f;
+    float slot_prompt_similarity = 0.1f;
 
     // batched-bench params
     bool is_pp_shared = false;
@@ -734,6 +735,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
 
+//
+// MoE utils
+//
+
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+
+static std::string llm_ffn_exps_block_regex(int idx) {
+    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
+}
+
+static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
+    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
+}
+
 //
 // training utils
 //
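These helpers exist to keep MoE expert weights in host memory: a llama_model_tensor_buft_override pairs a tensor-name regex with a buffer type, and expert FFN tensors are named blk.<i>.ffn_{up,gate,down}_exps. A sketch of how they compose, assuming the helpers above are in scope (usage is illustrative, not taken from this diff):

#include <vector>

// Pin all expert tensors to CPU buffers, leaving the rest of the model
// eligible for GPU offload (the idea behind llama.cpp's --cpu-moe option).
std::vector<llama_model_tensor_buft_override> buft_overrides = {
    llm_ffn_exps_cpu_override(),  // regex "\.ffn_(up|down|gate)_exps"
};

// Per-layer variant: llm_ffn_exps_block_regex(3) yields
// "blk\.3\.ffn_(up|down|gate)_exps", targeting only block 3.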
@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };
 
 static bool is_reserved_name(const std::string & name) {
-    static std::unordered_set<std::string> RESERVED_NAMES;
-    if (RESERVED_NAMES.empty()) {
-        RESERVED_NAMES.insert("root");
-        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
-    }
+    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
+        std::unordered_set<std::string> s;
+        s.insert("root");
+        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+        return s;
+    }();
     return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
 
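The replacement here is more than a style change: since C++11, initialization of a function-local static is guaranteed thread-safe, while the old "declare empty, fill on first call" pattern mutates the set outside any guard and can race if two threads call is_reserved_name() concurrently. The immediately-invoked-lambda idiom in isolation:

#include <string>
#include <unordered_set>

static bool is_keyword(const std::string & name) {
    // The lambda runs exactly once, under the compiler's init guard;
    // the resulting set is immutable afterwards.
    static const std::unordered_set<std::string> KEYWORDS = [] {
        std::unordered_set<std::string> s;
        for (const char * k : { "if", "else", "for", "while" }) {
            s.insert(k);
        }
        return s;
    }();
    return KEYWORDS.count(name) > 0;
}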
@@ -1,5 +1,41 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("ggml" C CXX ASM)
+
+### GGML Version
+set(GGML_VERSION_MAJOR 0)
+set(GGML_VERSION_MINOR 9)
+set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
+set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
+
+find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
+if(GIT_EXE)
+    # Get current git commit hash
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+
+    # Check if the working directory is dirty (i.e., has uncommitted changes)
+    execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        RESULT_VARIABLE GGML_GIT_DIRTY
+        ERROR_QUIET
+    )
+endif()
+
+# Build the version string with optional -dev suffix and dirty flag
+set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
+if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
+    set(GGML_VERSION "${GGML_VERSION}-dirty")
+endif()
+
+if(NOT GGML_BUILD_COMMIT)
+    set(GGML_BUILD_COMMIT "unknown")
+endif()
+
 include(CheckIncludeFileCXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -190,7 +226,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
 option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
-option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
 option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
 option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@@ -301,26 +336,6 @@ endif()
 # Create CMake package
 #
 
-# Generate version info based on git commit.
-
-if(NOT DEFINED GGML_BUILD_NUMBER)
-    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
-    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_NUMBER
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-
-    if(GGML_BUILD_NUMBER EQUAL 1)
-        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
-    endif()
-
-    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_COMMIT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-endif()
 
 
 # Capture variables prefixed with GGML_.
@@ -349,7 +364,7 @@ set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
 
 # Create the CMake package and set install location.
 
-set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
+set(GGML_INSTALL_VERSION ${GGML_VERSION})
 set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
 set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
 set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
@@ -39,6 +39,7 @@ extern "C" {
 // user-code should use only these functions
 //
 
+// TODO: remove in the future
 GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
@@ -7,8 +7,6 @@
 extern "C" {
 #endif
 
-GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
-
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
 
 #ifdef __cplusplus
@@ -284,19 +284,19 @@ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexc
 //     GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
 //
 #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
+    const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
     GGML_UNUSED(prefix##0);
 #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
+    const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
     GGML_UNUSED(prefix##1);
 #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
+    const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
     GGML_UNUSED(prefix##2);
 #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
     GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
+    const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
     GGML_UNUSED(prefix##3);
 
 #define GGML_TENSOR_UNARY_OP_LOCALS \
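Each of the four macros now checks the pointer before indexing, so the locals degrade to zeros for a null tensor instead of dereferencing it. For example, in a kernel whose second source is optional:

// GGML_TENSOR_LOCALS_1(int64_t, ne1, src1, ne) now expands to:
const int64_t ne10 = (src1) ? (src1)->ne[0] : 0;
GGML_UNUSED(ne10);
// so the locals block can be emitted unconditionally even when
// src1 == NULL, rather than special-casing optional inputs.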
@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
 
 if (NOT MSVC)
     if (GGML_STATIC)
+        if (UNIX AND NOT APPLE)
+            set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
+        endif()
         add_link_options(-static)
         if (MINGW)
             add_link_options(-static-libgcc -static-libstdc++)
@@ -7,7 +7,7 @@
 #include "ggml-cpu.h"
 #include "traits.h"
 
-#if defined(__gnu_linux__)
+#if defined(__linux__)
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
@@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
 #define XFEATURE_XTILEDATA 18
 
 static bool ggml_amx_init() {
-#if defined(__gnu_linux__)
+#if defined(__linux__)
     if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
         fprintf(stderr, "AMX is not ready to be used!\n");
         return false;
@@ -194,6 +194,8 @@ static bool ggml_amx_init() {
     return true;
 #elif defined(_WIN32)
     return true;
+#else
+    return false;
 #endif
 }
 
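Two related fixes in this file: __gnu_linux__ is predefined only when targeting glibc, so the syscall path was silently skipped on musl-based systems such as Alpine, whereas __linux__ is set by every Linux-targeting compiler; and the new #else gives ggml_amx_init() a return value on platforms matching neither branch, where control previously fell off the end of a non-void function (undefined behavior). The resulting structure, as an abbreviated sketch:

static bool ggml_amx_init_shape() {  // sketch of the fixed control flow
#if defined(__linux__)
    // request XTILEDATA permission via arch_prctl, then:
    return true;
#elif defined(_WIN32)
    return true;
#else
    return false;                    // previously: no return on this path
#endif
}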
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
     return GGML_BF16_TO_FP32(x);
 }
 
+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
 static inline float f32_to_f32(float x) {
     return x;
 }
@@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
     static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
 };
 
+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
 static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
     const int64_t ith = params->ith;
     const int64_t nth = params->nth;
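With the specialization in place, any kernel written generically against type_conversion_table<T> can handle I32 tensors by routing elements through f32. A self-contained sketch of such a consumer (illustrative; the real kernels also deal with strides and thread partitioning):

#include <cstdint>

// Minimal reproduction of the trait table from the hunk above.
static inline float   i32_to_f32(int32_t x) { return x; }
static inline int32_t f32_to_i32(float x)   { return x; }
static inline float   f32_to_f32(float x)   { return x; }

template <typename T> struct type_conversion_table;

template <> struct type_conversion_table<float> {
    static constexpr float (*to_f32)(float)   = f32_to_f32;
    static constexpr float (*from_f32)(float) = f32_to_f32;
};

template <> struct type_conversion_table<int32_t> {
    static constexpr float   (*to_f32)(int32_t) = i32_to_f32;
    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
};

// Generic elementwise convert: src_t -> f32 -> dst_t via the trait table.
template <typename src_t, typename dst_t>
static void convert_row(const src_t * src, dst_t * dst, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] = type_conversion_table<dst_t>::from_f32(
                     type_conversion_table<src_t>::to_f32(src[i]));
    }
}

// convert_row<int32_t, float>(...) now compiles thanks to the new
// type_conversion_table<int32_t> specialization.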
@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
-    /* .optimize_graph = */ NULL,
+    /* .graph_optimize = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {