@fugood/llama.node 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/common/arg.cpp +359 -310
  3. package/src/llama.cpp/common/chat.cpp +27 -15
  4. package/src/llama.cpp/common/common.cpp +1 -0
  5. package/src/llama.cpp/common/sampling.cpp +1 -0
  6. package/src/llama.cpp/ggml/CMakeLists.txt +37 -21
  7. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -1
  8. package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
  9. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  10. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
  12. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +17 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +93 -862
  16. package/src/llama.cpp/include/llama.h +15 -11
  17. package/src/llama.cpp/src/llama-context.cpp +151 -0
  18. package/src/llama.cpp/src/llama-context.h +10 -0
  19. package/src/llama.cpp/src/llama-cparams.h +1 -1
  20. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +8 -0
  21. package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  22. package/src/llama.cpp/src/llama-kv-cache.cpp +8 -0
  23. package/src/llama.cpp/src/llama-kv-cache.h +2 -0
  24. package/src/llama.cpp/src/llama-memory-hybrid.cpp +8 -0
  25. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
  26. package/src/llama.cpp/src/llama-memory-recurrent.cpp +8 -0
  27. package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
  28. package/src/llama.cpp/src/llama-memory.h +3 -0
  29. package/src/llama.cpp/src/llama-model.cpp +14 -4
  30. package/src/llama.cpp/src/llama-model.h +5 -1
@@ -1727,10 +1727,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
1727
1727
  static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
1728
1728
  LOG_DBG("%s\n", __func__);
1729
1729
  common_chat_params data;
1730
- data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
1730
+ const std::optional<json> tools_override = json();
1731
+ const std::optional<json> additional_context = json {
1731
1732
  {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
1732
1733
  {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
1733
- });
1734
+ };
1735
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
1734
1736
  if (inputs.tools.is_array() && !inputs.tools.empty()) {
1735
1737
  data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1736
1738
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2216,15 +2218,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
2216
2218
 
2217
2219
  static void common_chat_parse_granite(common_chat_msg_parser & builder) {
2218
2220
  // Parse thinking tags
2221
+ static const common_regex start_think_regex(regex_escape("<think>"));
2222
+ static const common_regex end_think_regex(regex_escape("</think>"));
2223
+ // Granite models output partial tokens such as "<" and "<think".
2224
+ // By leveraging try_consume_regex()/try_find_regex() throwing
2225
+ // common_chat_msg_partial_exception for these partial tokens,
2226
+ // processing is interrupted and the tokens are not passed to add_content().
2227
+ if (auto res = builder.try_consume_regex(start_think_regex)) {
2228
+ // Restore position for try_parse_reasoning()
2229
+ builder.move_to(res->groups[0].begin);
2230
+ builder.try_find_regex(end_think_regex, std::string::npos, false);
2231
+ // Restore position for try_parse_reasoning()
2232
+ builder.move_to(res->groups[0].begin);
2233
+ }
2219
2234
  builder.try_parse_reasoning("<think>", "</think>");
2220
2235
 
2221
- // Parse response tags using regex
2222
- static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
2223
- if (auto res = builder.try_find_regex(response_regex)) {
2224
- // Extract the content between the tags (capture group 1)
2225
- auto content = builder.str(res->groups[1]);
2226
- builder.add_content(content);
2227
- builder.move_to(res->groups[0].end);
2236
+ // Parse response tags
2237
+ static const common_regex start_response_regex(regex_escape("<response>"));
2238
+ static const common_regex end_response_regex(regex_escape("</response>"));
2239
+ // Granite models output partial tokens such as "<" and "<response".
2240
+ // Same hack as reasoning parsing.
2241
+ if (builder.try_consume_regex(start_response_regex)) {
2242
+ builder.try_find_regex(end_response_regex);
2228
2243
  }
2229
2244
 
2230
2245
  if (!builder.syntax().parse_tool_calls) {
@@ -2238,13 +2253,10 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
2238
2253
  builder.move_to(res->groups[0].end);
2239
2254
 
2240
2255
  // Expect JSON array of tool calls
2241
- auto tool_calls_data = builder.consume_json();
2242
- if (tool_calls_data.json.is_array()) {
2243
- if (!builder.add_tool_calls(tool_calls_data.json)) {
2244
- builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
2256
+ if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
2257
+ if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
2258
+ throw common_chat_msg_partial_exception("incomplete tool call");
2245
2259
  }
2246
- } else {
2247
- builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
2248
2260
  }
2249
2261
  } else {
2250
2262
  builder.add_content(builder.consume_rest());
@@ -14,6 +14,7 @@
14
14
  #include <climits>
15
15
  #include <cmath>
16
16
  #include <codecvt>
17
+ #include <chrono>
17
18
  #include <cstdarg>
18
19
  #include <cstring>
19
20
  #include <ctime>
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
332
332
  }
333
333
  if (ctx) {
334
334
  llama_perf_context_print(ctx);
335
+ llama_memory_breakdown_print(ctx);
335
336
  }
336
337
  }
337
338
 
@@ -1,5 +1,41 @@
1
1
  cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
2
2
  project("ggml" C CXX ASM)
3
+
4
+ ### GGML Version
5
+ set(GGML_VERSION_MAJOR 0)
6
+ set(GGML_VERSION_MINOR 9)
7
+ set(GGML_VERSION_PATCH 0)
8
+ set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
9
+ set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
10
+
11
+ find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
12
+ if(GIT_EXE)
13
+ # Get current git commit hash
14
+ execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
15
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
16
+ OUTPUT_VARIABLE GGML_BUILD_COMMIT
17
+ OUTPUT_STRIP_TRAILING_WHITESPACE
18
+ ERROR_QUIET
19
+ )
20
+
21
+ # Check if the working directory is dirty (i.e., has uncommitted changes)
22
+ execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
23
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
24
+ RESULT_VARIABLE GGML_GIT_DIRTY
25
+ ERROR_QUIET
26
+ )
27
+ endif()
28
+
29
+ # Build the version string with optional -dev suffix and dirty flag
30
+ set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
31
+ if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
32
+ set(GGML_VERSION "${GGML_VERSION}-dirty")
33
+ endif()
34
+
35
+ if(NOT GGML_BUILD_COMMIT)
36
+ set(GGML_BUILD_COMMIT "unknown")
37
+ endif()
38
+
3
39
  include(CheckIncludeFileCXX)
4
40
 
5
41
  set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -300,26 +336,6 @@ endif()
300
336
  # Create CMake package
301
337
  #
302
338
 
303
- # Generate version info based on git commit.
304
-
305
- if(NOT DEFINED GGML_BUILD_NUMBER)
306
- find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
307
- execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
308
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
309
- OUTPUT_VARIABLE GGML_BUILD_NUMBER
310
- OUTPUT_STRIP_TRAILING_WHITESPACE
311
- )
312
-
313
- if(GGML_BUILD_NUMBER EQUAL 1)
314
- message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
315
- endif()
316
-
317
- execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
318
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
319
- OUTPUT_VARIABLE GGML_BUILD_COMMIT
320
- OUTPUT_STRIP_TRAILING_WHITESPACE
321
- )
322
- endif()
323
339
 
324
340
 
325
341
  # Capture variables prefixed with GGML_.
@@ -348,7 +364,7 @@ set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
348
364
 
349
365
  # Create the CMake package and set install location.
350
366
 
351
- set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
367
+ set(GGML_INSTALL_VERSION ${GGML_VERSION})
352
368
  set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
353
369
  set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
354
370
  set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
@@ -314,7 +314,8 @@ extern "C" {
314
314
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
315
315
  GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
316
316
 
317
- GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
317
+ GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
318
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
318
319
 
319
320
  GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
320
321
  GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
@@ -7,6 +7,9 @@
7
7
  extern "C" {
8
8
  #endif
9
9
 
10
+ // device buffer
11
+ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
12
+
10
13
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
11
14
 
12
15
  #ifdef __cplusplus
@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
114
114
 
115
115
  if (NOT MSVC)
116
116
  if (GGML_STATIC)
117
+ if (UNIX AND NOT APPLE)
118
+ set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
119
+ endif()
117
120
  add_link_options(-static)
118
121
  if (MINGW)
119
122
  add_link_options(-static-libgcc -static-libstdc++)
@@ -7,7 +7,7 @@
7
7
  #include "ggml-cpu.h"
8
8
  #include "traits.h"
9
9
 
10
- #if defined(__gnu_linux__)
10
+ #if defined(__linux__)
11
11
  #include <sys/syscall.h>
12
12
  #include <unistd.h>
13
13
  #endif
@@ -186,7 +186,7 @@ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_ty
186
186
  #define XFEATURE_XTILEDATA 18
187
187
 
188
188
  static bool ggml_amx_init() {
189
- #if defined(__gnu_linux__)
189
+ #if defined(__linux__)
190
190
  if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
191
191
  fprintf(stderr, "AMX is not ready to be used!\n");
192
192
  return false;
@@ -194,6 +194,8 @@ static bool ggml_amx_init() {
194
194
  return true;
195
195
  #elif defined(_WIN32)
196
196
  return true;
197
+ #else
198
+ return false;
197
199
  #endif
198
200
  }
199
201
 
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
878
878
  const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
879
879
  const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
880
880
 
881
- // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
881
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
882
882
  const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
883
883
  const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
884
884
  const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
1231
1231
  const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
1232
1232
  const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
1233
1233
 
1234
- // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
1234
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
1235
1235
  const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
1236
1236
  const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
1237
1237
  const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
28
28
  return GGML_BF16_TO_FP32(x);
29
29
  }
30
30
 
31
+ static inline float i32_to_f32(int32_t x) {
32
+ return x;
33
+ }
34
+
35
+ static inline int32_t f32_to_i32(float x) {
36
+ return x;
37
+ }
38
+
31
39
  static inline float f32_to_f32(float x) {
32
40
  return x;
33
41
  }
@@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
54
62
  static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
55
63
  };
56
64
 
65
+ template <>
66
+ struct type_conversion_table<int32_t> {
67
+ static constexpr float (*to_f32)(int32_t) = i32_to_f32;
68
+ static constexpr int32_t (*from_f32)(float) = f32_to_i32;
69
+ };
70
+
57
71
  static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
58
72
  const int64_t ith = params->ith;
59
73
  const int64_t nth = params->nth;
@@ -473,10 +473,10 @@ struct ggml_threadpool {
473
473
  struct ggml_compute_state {
474
474
  #ifndef GGML_USE_OPENMP
475
475
  ggml_thread_t thrd;
476
- bool cpumask[GGML_MAX_N_THREADS];
477
476
  int last_graph;
478
477
  bool pending;
479
478
  #endif
479
+ bool cpumask[GGML_MAX_N_THREADS];
480
480
  struct ggml_threadpool * threadpool;
481
481
  int ith;
482
482
  };
@@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
3081
3081
 
3082
3082
  threadpool->workers = workers;
3083
3083
 
3084
- #ifndef GGML_USE_OPENMP
3084
+ #ifdef GGML_USE_OPENMP
3085
+ int32_t cpumask_iter = 0;
3086
+
3087
+ // Compute CPU masks for each thread
3088
+ for (int j = 0; j < tpp->n_threads; j++) {
3089
+ ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
3090
+ }
3091
+ #else // GGML_USE_OPENMP
3085
3092
  ggml_mutex_init(&threadpool->mutex);
3086
3093
  ggml_cond_init(&threadpool->cond);
3087
3094
 
@@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
3154
3161
  atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
3155
3162
  }
3156
3163
 
3157
- ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
3164
+ // Apply thread CPU mask and priority
3165
+ int ith = omp_get_thread_num();
3166
+
3167
+ ggml_thread_apply_priority(threadpool->prio);
3168
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
3169
+ ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
3170
+ }
3171
+ ggml_graph_compute_thread(&threadpool->workers[ith]);
3158
3172
  }
3159
3173
  } else {
3160
3174
  atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
190
190
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
191
191
  /* .event_record = */ NULL,
192
192
  /* .event_wait = */ NULL,
193
- /* .optimize_graph = */ NULL,
193
+ /* .graph_optimize = */ NULL,
194
194
  };
195
195
 
196
196
  static ggml_guid_t ggml_backend_cpu_guid(void) {