@fugood/llama.node 1.0.0-beta.5 → 1.0.0-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. package/lib/binding.ts +3 -1
  2. package/lib/index.js +2 -0
  3. package/lib/index.ts +3 -1
  4. package/package.json +14 -14
  5. package/scripts/llama.cpp.patch +27 -26
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +28 -7
  8. package/src/LlamaCompletionWorker.h +4 -0
  9. package/src/LlamaContext.cpp +14 -17
  10. package/src/common.hpp +7 -6
  11. package/src/llama.cpp/CMakeLists.txt +15 -4
  12. package/src/llama.cpp/common/CMakeLists.txt +15 -24
  13. package/src/llama.cpp/common/arg.cpp +172 -110
  14. package/src/llama.cpp/common/chat-parser.cpp +385 -0
  15. package/src/llama.cpp/common/chat-parser.h +120 -0
  16. package/src/llama.cpp/common/chat.cpp +726 -596
  17. package/src/llama.cpp/common/chat.h +74 -8
  18. package/src/llama.cpp/common/common.cpp +56 -38
  19. package/src/llama.cpp/common/common.h +9 -3
  20. package/src/llama.cpp/common/json-partial.cpp +256 -0
  21. package/src/llama.cpp/common/json-partial.h +38 -0
  22. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  23. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -4
  24. package/src/llama.cpp/common/sampling.cpp +7 -8
  25. package/src/llama.cpp/common/speculative.cpp +6 -4
  26. package/src/llama.cpp/ggml/CMakeLists.txt +48 -3
  27. package/src/llama.cpp/ggml/include/ggml.h +22 -3
  28. package/src/llama.cpp/ggml/src/CMakeLists.txt +81 -22
  29. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +131 -49
  30. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  31. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2162 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  43. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  44. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  45. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +12 -13
  47. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +64 -88
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  49. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  50. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  51. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  52. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +282 -100
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  55. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  56. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  57. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1570 -0
  58. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  59. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +119 -5
  60. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  61. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  62. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +204 -49
  63. package/src/llama.cpp/include/llama.h +145 -40
  64. package/src/llama.cpp/src/CMakeLists.txt +5 -1
  65. package/src/llama.cpp/src/llama-arch.cpp +99 -3
  66. package/src/llama.cpp/src/llama-arch.h +10 -1
  67. package/src/llama.cpp/src/llama-batch.cpp +728 -272
  68. package/src/llama.cpp/src/llama-batch.h +112 -54
  69. package/src/llama.cpp/src/llama-chat.cpp +19 -2
  70. package/src/llama.cpp/src/llama-chat.h +1 -0
  71. package/src/llama.cpp/src/llama-context.cpp +525 -339
  72. package/src/llama.cpp/src/llama-context.h +38 -17
  73. package/src/llama.cpp/src/llama-cparams.cpp +4 -0
  74. package/src/llama.cpp/src/llama-cparams.h +2 -0
  75. package/src/llama.cpp/src/llama-grammar.cpp +12 -2
  76. package/src/llama.cpp/src/llama-graph.cpp +413 -353
  77. package/src/llama.cpp/src/llama-graph.h +112 -56
  78. package/src/llama.cpp/src/llama-hparams.cpp +10 -2
  79. package/src/llama.cpp/src/llama-hparams.h +13 -2
  80. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +279 -0
  81. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +128 -0
  82. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +1815 -0
  83. package/src/llama.cpp/src/llama-kv-cache-unified.h +303 -0
  84. package/src/llama.cpp/src/llama-kv-cells.h +415 -0
  85. package/src/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  86. package/src/llama.cpp/src/llama-memory-hybrid.h +138 -0
  87. package/src/llama.cpp/src/llama-memory-recurrent.cpp +1112 -0
  88. package/src/llama.cpp/src/llama-memory-recurrent.h +183 -0
  89. package/src/llama.cpp/src/llama-memory.cpp +41 -0
  90. package/src/llama.cpp/src/llama-memory.h +86 -5
  91. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  92. package/src/llama.cpp/src/llama-model-loader.cpp +42 -17
  93. package/src/llama.cpp/src/llama-model-saver.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model.cpp +1137 -528
  95. package/src/llama.cpp/src/llama-model.h +4 -0
  96. package/src/llama.cpp/src/llama-quant.cpp +2 -1
  97. package/src/llama.cpp/src/llama-sampling.cpp +2 -2
  98. package/src/llama.cpp/src/llama-vocab.cpp +69 -32
  99. package/src/llama.cpp/src/llama-vocab.h +1 -0
  100. package/src/llama.cpp/src/llama.cpp +11 -7
  101. package/src/llama.cpp/src/unicode.cpp +5 -0
  102. package/src/tts_utils.h +1 -1
  103. package/src/llama.cpp/common/json.hpp +0 -24766
  104. package/src/llama.cpp/common/minja/chat-template.hpp +0 -541
  105. package/src/llama.cpp/common/minja/minja.hpp +0 -2974
  106. package/src/llama.cpp/common/stb_image.h +0 -7988
  107. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  108. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13326
  109. package/src/llama.cpp/src/llama-kv-cache.cpp +0 -2827
  110. package/src/llama.cpp/src/llama-kv-cache.h +0 -515
  111. /package/src/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  112. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  113. /package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/lib/binding.ts CHANGED
@@ -22,7 +22,6 @@ export type ChatMessage = {
  export type LlamaModelOptions = {
  model: string
  chat_template?: string
- reasoning_format?: string
  embedding?: boolean
  embd_normalize?: number
  pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
@@ -74,11 +73,14 @@ export type CompletionResponseFormat = {
  export type LlamaCompletionOptions = {
  messages?: ChatMessage[]
  jinja?: boolean
+ reasoning_format?: string
  chat_template?: string
  response_format?: CompletionResponseFormat
  tools?: object
  parallel_tool_calls?: boolean
  tool_choice?: string
+ enable_thinking?: boolean
+ thinking_forced_open?: boolean
  prompt?: string
  temperature?: number
  top_k?: number
package/lib/index.js CHANGED
@@ -131,6 +131,7 @@ class LlamaContextWrapper {
  };
  }
  getFormattedChat(messages, template, params) {
+ var _a;
  const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
  const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
  let tmpl;
@@ -143,6 +144,7 @@ class LlamaContextWrapper {
  tools: params === null || params === void 0 ? void 0 : params.tools,
  parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
  tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
+ enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
  });
  if (!useJinja) {
  return {
package/lib/index.ts CHANGED
@@ -158,7 +158,8 @@ class LlamaContextWrapper {
  response_format?: CompletionResponseFormat
  tools?: object
  parallel_tool_calls?: object
- tool_choice?: string
+ tool_choice?: string,
+ enable_thinking?: boolean,
  },
  ): FormattedChatResult {
  const {
@@ -178,6 +179,7 @@
  tools: params?.tools,
  parallel_tool_calls: params?.parallel_tool_calls,
  tool_choice: params?.tool_choice,
+ enable_thinking: params?.enable_thinking ?? true,
  })

  if (!useJinja) {
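Note on the typings above: reasoning_format is no longer a model-load option; it now travels with each completion request, together with the new enable_thinking and thinking_forced_open flags. The sketch below is a hypothetical TypeScript usage example, assuming an already-initialized context wrapper with a promise-returning completion() method; it is not taken from the package documentation.

// Hypothetical usage sketch: `ctx` is assumed to be an initialized
// @fugood/llama.node context wrapper; only the option names below come from
// the LlamaCompletionOptions typings changed in this release.
async function ask(ctx: any) {
  const result = await ctx.completion({
    messages: [{ role: 'user', content: 'Why is the sky blue?' }],
    jinja: true,
    enable_thinking: true,        // let the chat template open a reasoning block
    reasoning_format: 'deepseek', // split reasoning from the visible answer
  })
  // reasoning_content is filled in by the native completion worker when the
  // model emitted a reasoning block (see LlamaCompletionWorker.cpp below).
  console.log(result.reasoning_content)
  console.log(result.content)
}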
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.0.0-beta.5",
+ "version": "1.0.0-beta.7",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -70,19 +70,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.0.0-beta.5",
- "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.5",
- "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.5",
- "@fugood/node-llama-linux-arm64": "1.0.0-beta.5",
- "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.5",
- "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.5",
- "@fugood/node-llama-win32-x64": "1.0.0-beta.5",
- "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.5",
- "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.5",
- "@fugood/node-llama-win32-arm64": "1.0.0-beta.5",
- "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.5",
- "@fugood/node-llama-darwin-x64": "1.0.0-beta.5",
- "@fugood/node-llama-darwin-arm64": "1.0.0-beta.5"
+ "@fugood/node-llama-linux-x64": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-arm64": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-x64": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-arm64": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.7",
+ "@fugood/node-llama-darwin-x64": "1.0.0-beta.7",
+ "@fugood/node-llama-darwin-arm64": "1.0.0-beta.7"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,18 +1,19 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index f138c7bc..e177fe92 100644
+ index 7d9aaeb1..a7b68d4a 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
- @@ -1,8 +1,6 @@
- #include "chat.h"
- #include "json-schema-to-grammar.h"
+ @@ -6,9 +6,6 @@
  #include "log.h"
- -#include "minja/chat-template.hpp"
- -#include "minja/minja.hpp"
+ #include "regex-partial.h"

- #include <optional>
-
- @@ -15,14 +13,6 @@ static std::string format_time(const std::chrono::system_clock::time_point & now
- return res;
+ -#include <minja/chat-template.hpp>
+ -#include <minja/minja.hpp>
+ -
+ #include <cstdio>
+ #include <exception>
+ #include <iostream>
+ @@ -121,14 +118,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+ return diffs;
  }

  -typedef minja::chat_template common_chat_template;
@@ -27,17 +28,17 @@ index f138c7bc..e177fe92 100644
  json messages;
  json tools;
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index d26a09c2..cb92721a 100644
+ index 9f59e6b0..9b7fe724 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
- @@ -6,8 +6,16 @@
- #include <chrono>
+ @@ -8,7 +8,16 @@
  #include <string>
  #include <vector>
- +#include "minja/chat-template.hpp"
- +#include "minja/minja.hpp"

  -struct common_chat_templates;
+ +#include <minja/chat-template.hpp>
+ +#include <minja/minja.hpp>
+ +
  +typedef minja::chat_template common_chat_template;
  +
  +struct common_chat_templates {
@@ -49,10 +50,10 @@ index d26a09c2..cb92721a 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 94f545f8..a55df8aa 100644
+ index e4e71ad1..091ddda4 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1062,6 +1062,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1101,6 +1101,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -61,10 +62,10 @@ index 94f545f8..a55df8aa 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 0a9dc059..996afcd8 100644
+ index e08a59ea..d120b67d 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -217,6 +217,7 @@ enum common_reasoning_format {
+ @@ -223,6 +223,7 @@ enum common_reasoning_format {
  };

  struct common_params {
@@ -73,11 +74,11 @@ index 0a9dc059..996afcd8 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index 9a3085be..8218cc16 100644
+ index 71b1d67b..093cd6f9 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- @@ -90,7 +90,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
- message(STATUS "ARM detected")
+ @@ -104,7 +104,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
+ )

  if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
  - message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
@@ -86,10 +87,10 @@ index 9a3085be..8218cc16 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
  diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
- index 662f1377..f9f99698 100644
+ index 39f022f3..7ae9047e 100644
  --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
- @@ -122,7 +122,7 @@ if (Vulkan_FOUND)
+ @@ -110,7 +110,7 @@ if (Vulkan_FOUND)
  endif()

  # Set up toolchain for host compilation whether cross-compiling or not
@@ -98,10 +99,10 @@ index 662f1377..f9f99698 100644
  if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
  set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
  else()
- @@ -144,7 +144,7 @@ if (Vulkan_FOUND)
+ @@ -130,7 +130,7 @@ if (Vulkan_FOUND)
+
  include(ExternalProject)

- # Add toolchain file if cross-compiling
  - if (CMAKE_CROSSCOMPILING)
  + if (CMAKE_CROSSCOMPILING OR NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL CMAKE_SYSTEM_PROCESSOR)
  list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE})
package/src/EmbeddingWorker.cpp CHANGED
@@ -8,7 +8,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
  _params(params) {}

  void EmbeddingWorker::Execute() {
- llama_kv_self_clear(_sess->context());
+ llama_memory_clear(llama_get_memory(_sess->context()), true);
  auto tokens = ::common_tokenize(_sess->context(), _text, true);
  // add SEP if not present
  auto vocab = llama_model_get_vocab(_sess->model());
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -25,12 +25,18 @@ size_t findStoppingStrings(const std::string &text,

  LlamaCompletionWorker::LlamaCompletionWorker(
  const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
- Napi::Function callback, common_params params,
- std::vector<std::string> stop_words, int32_t chat_format,
+ Napi::Function callback,
+ common_params params,
+ std::vector<std::string> stop_words,
+ int32_t chat_format,
+ bool thinking_forced_open,
+ std::string reasoning_format,
  const std::vector<std::string> &media_paths,
  const std::vector<llama_token> &guide_tokens)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
+ _thinking_forced_open(thinking_forced_open),
+ _reasoning_format(reasoning_format),
  _media_paths(media_paths), _guide_tokens(guide_tokens) {
  if (!callback.IsEmpty()) {
  _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
@@ -65,7 +71,7 @@ void LlamaCompletionWorker::Execute() {

  // Process media if any are provided
  if (!_media_paths.empty()) {
- const auto *mtmd_ctx = _sess->get_mtmd_ctx();
+ auto *mtmd_ctx = _sess->get_mtmd_ctx();

  if (mtmd_ctx != nullptr) {
  // Process the media and get the tokens
@@ -109,7 +115,7 @@ void LlamaCompletionWorker::Execute() {
  --n_cur;
  }
  n_input -= n_cur;
- llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
+ llama_memory_seq_rm(llama_get_memory(ctx), 0, n_cur, -1);
  }
  // Set the tokens
  _sess->set_tokens(std::move(prompt_tokens));
@@ -131,8 +137,9 @@ void LlamaCompletionWorker::Execute() {
  const int n_left = n_cur - n_keep - 1;
  const int n_discard = n_left / 2;

- llama_kv_self_seq_rm(ctx, 0, n_keep + 1, n_keep + n_discard + 1);
- llama_kv_self_seq_add(ctx, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
+ auto mem = llama_get_memory(ctx);
+ llama_memory_seq_rm(mem, 0, n_keep + 1, n_keep + n_discard + 1);
+ llama_memory_seq_add(mem, 0, n_keep + 1 + n_discard, n_cur, -n_discard);

  // shift the tokens
  embd->insert(embd->begin() + n_keep + 1,
@@ -234,8 +241,22 @@ void LlamaCompletionWorker::OnOK() {
  std::string content;
  if (!_stop) {
  try {
+ common_chat_syntax chat_syntax;
+ chat_syntax.format = static_cast<common_chat_format>(_chat_format);
+ chat_syntax.thinking_forced_open = _thinking_forced_open;
+
+ if (_reasoning_format == "deepseek") {
+ chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+ } else if (_reasoning_format == "deepseek-legacy") {
+ chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+ } else {
+ chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+ }
  common_chat_msg message = common_chat_parse(
- _result.text, static_cast<common_chat_format>(_chat_format));
+ _result.text,
+ false,
+ chat_syntax
+ );
  if (!message.reasoning_content.empty()) {
  reasoning_content = message.reasoning_content;
  }
package/src/LlamaCompletionWorker.h CHANGED
@@ -20,6 +20,8 @@ public:
  Napi::Function callback, common_params params,
  std::vector<std::string> stop_words,
  int32_t chat_format,
+ bool thinking_forced_open,
+ std::string reasoning_format,
  const std::vector<std::string> &media_paths = {},
  const std::vector<llama_token> &guide_tokens = {});

@@ -41,6 +43,8 @@ private:
  common_params _params;
  std::vector<std::string> _stop_words;
  int32_t _chat_format;
+ bool _thinking_forced_open;
+ std::string _reasoning_format;
  std::vector<std::string> _media_paths;
  std::vector<llama_token> _guide_tokens;
  std::function<void()> _onComplete;
package/src/LlamaContext.cpp CHANGED
@@ -10,7 +10,7 @@
  #include "ggml.h"
  #include "gguf.h"
  #include "json-schema-to-grammar.h"
- #include "json.hpp"
+ #include <nlohmann/json.hpp>
  #include "llama-impl.h"

  #include <atomic>
@@ -223,14 +223,6 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)

  params.chat_template = get_option<std::string>(options, "chat_template", "");

- std::string reasoning_format =
- get_option<std::string>(options, "reasoning_format", "none");
- if (reasoning_format == "deepseek") {
- params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
- } else {
- params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
- }
-
  params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
  params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
  params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
@@ -507,7 +499,9 @@ common_chat_params getFormattedChatWithJinja(
  const common_chat_templates_ptr &templates, const std::string &messages,
  const std::string &chat_template, const std::string &json_schema,
  const std::string &tools, const bool &parallel_tool_calls,
- const std::string &tool_choice) {
+ const std::string &tool_choice,
+ const bool &enable_thinking
+ ) {
  common_chat_templates_inputs inputs;
  inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
  auto useTools = !tools.empty();
@@ -521,8 +515,7 @@ common_chat_params getFormattedChatWithJinja(
  if (!json_schema.empty()) {
  inputs.json_schema = json::parse(json_schema);
  }
- inputs.extract_reasoning =
- sess->params().reasoning_format != COMMON_REASONING_FORMAT_NONE;
+ inputs.enable_thinking = enable_thinking;

  // If chat_template is provided, create new one and use it (probably slow)
  if (!chat_template.empty()) {
@@ -596,12 +589,11 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  auto parallel_tool_calls =
  get_option<bool>(params, "parallel_tool_calls", false);
  auto tool_choice = get_option<std::string>(params, "tool_choice", "");
+ auto enable_thinking = get_option<bool>(params, "enable_thinking", false);

  auto chatParams = getFormattedChatWithJinja(
  _sess, _templates, messages, chat_template, json_schema_str, tools_str,
- parallel_tool_calls, tool_choice);
-
- console_log(env, std::string("format: ") + std::to_string(chatParams.format));
+ parallel_tool_calls, tool_choice, enable_thinking);

  Napi::Object result = Napi::Object::New(env);
  result.Set("prompt", chatParams.prompt);
@@ -622,6 +614,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  grammar_triggers.Set(i, triggerObj);
  }
  result.Set("grammar_triggers", grammar_triggers);
+ result.Set("thinking_forced_open", chatParams.thinking_forced_open);
  // preserved_tokens: string[]
  Napi::Array preserved_tokens = Napi::Array::New(env);
  for (size_t i = 0; i < chatParams.preserved_tokens.size(); i++) {
@@ -695,6 +688,8 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  }

  int32_t chat_format = get_option<int32_t>(options, "chat_format", 0);
+ bool thinking_forced_open = get_option<bool>(options, "thinking_forced_open", false);
+ std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");

  common_params params = _sess->params();
  auto grammar_from_params = get_option<std::string>(options, "grammar", "");
@@ -802,14 +797,16 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  get_option<bool>(options, "parallel_tool_calls", false);
  auto tool_choice =
  get_option<std::string>(options, "tool_choice", "none");
+ auto enable_thinking = get_option<bool>(options, "enable_thinking", true);

  auto chatParams = getFormattedChatWithJinja(
  _sess, _templates, json_stringify(messages), chat_template,
- json_schema_str, tools_str, parallel_tool_calls, tool_choice);
+ json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);

  params.prompt = chatParams.prompt;

  chat_format = chatParams.format;
+ thinking_forced_open = chatParams.thinking_forced_open;

  for (const auto &token : chatParams.preserved_tokens) {
  auto ids =
@@ -904,7 +901,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {

  auto *worker =
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
- chat_format, media_paths, guide_tokens);
+ chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens);
  worker->Queue();
  _wip = worker;
  worker->OnComplete([this]() { _wip = nullptr; });
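Taken together, GetFormattedChat now reports thinking_forced_open back to JavaScript and Completion accepts thinking_forced_open plus reasoning_format, so the generated text can be parsed with the matching common_chat_syntax. A rough TypeScript sketch of that round trip follows; the wrapper method names come from lib/index.ts above, while the chat_format key on the formatted result and the promise-based completion() call are assumptions.

// Rough sketch of the format-then-complete round trip enabled by these
// changes; `ctx` is an assumed context wrapper and `messages` an
// OpenAI-style message array.
async function completeWithReasoning(ctx: any, messages: object[]) {
  const formatted = ctx.getFormattedChat(messages, undefined, {
    jinja: true,
    enable_thinking: true,
  })
  return ctx.completion({
    prompt: formatted.prompt,
    chat_format: formatted.chat_format,                   // assumed key name
    thinking_forced_open: formatted.thinking_forced_open, // set by GetFormattedChat above
    reasoning_format: 'deepseek',                         // or 'deepseek-legacy' / 'none'
  })
}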
package/src/common.hpp CHANGED
@@ -6,6 +6,7 @@
  #include "llama.h"
  #include "tools/mtmd/clip.h"
  #include "tools/mtmd/mtmd.h"
+ #include "tools/mtmd/mtmd-helper.h"
  #include <memory>
  #include <mutex>
  #include <napi.h>
@@ -97,7 +98,7 @@ public:
  inline std::mutex &get_mutex() { return mutex; }

  // Getter for the multimodal context
- inline const mtmd_context *get_mtmd_ctx() const { return _mtmd_ctx; }
+ inline mtmd_context *get_mtmd_ctx() { return _mtmd_ctx; }

  // Setter for the multimodal context
  inline void set_mtmd_ctx(mtmd_context *ctx) { _mtmd_ctx = ctx; }
@@ -219,7 +220,7 @@ struct TokenizeResult {
  };

  static TokenizeResult
- tokenizeWithMedia(const mtmd_context *mtmd_ctx, const std::string &prompt,
+ tokenizeWithMedia(mtmd_context *mtmd_ctx, const std::string &prompt,
  const std::vector<std::string> &media_paths) {
  if (mtmd_ctx == nullptr) {
  throw std::runtime_error("Multimodal context is not initialized");
@@ -263,7 +264,7 @@ tokenizeWithMedia(const mtmd_context *mtmd_ctx, const std::string &prompt,
  std::vector<uint8_t> media_data = base64_decode(base64_data);

  // Load bitmap from memory buffer using direct initialization
- mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(media_data.data(),
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mtmd_ctx, media_data.data(),
  media_data.size()));
  if (!bmp.ptr) {
  bitmaps.entries.clear();
@@ -300,7 +301,7 @@ tokenizeWithMedia(const mtmd_context *mtmd_ctx, const std::string &prompt,
  fclose(file);

  // Create bitmap directly
- mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(media_path.c_str()));
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mtmd_ctx, media_path.c_str()));
  if (!bmp.ptr) {
  bitmaps.entries.clear();
  throw std::runtime_error("Failed to load media");
@@ -388,7 +389,7 @@ tokenizeWithMedia(const mtmd_context *mtmd_ctx, const std::string &prompt,

  // Process media and add them to the tokenized input
  static llama_pos
- processMediaPrompt(llama_context *ctx, const mtmd_context *mtmd_ctx,
+ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
  LlamaSessionPtr sess, const common_params &params,
  const std::vector<std::string> &media_paths) {
  if (mtmd_ctx == nullptr) {
@@ -460,7 +461,7 @@ processMediaPrompt(llama_context *ctx, const mtmd_context *mtmd_ctx,
  }

  // Clear all KV cache entries after position n_past
- llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+ llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);

  size_t num_chunks = mtmd_input_chunks_size(chunks);
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)

+ if (NOT DEFINED LLAMA_BUILD_NUMBER)
+ set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
+ endif()
+ if (NOT DEFINED LLAMA_BUILD_COMMIT)
+ set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+ endif()
+ set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
  # override ggml options
  set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
  set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
@@ -155,10 +163,17 @@ if (LLAMA_USE_SYSTEM_GGML)
  endif()

  if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
+ set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
+ set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
  add_subdirectory(ggml)
  # ... otherwise assume ggml is added by a parent CMakeLists.txt
  endif()

+ if (MINGW)
+ # Target Windows 8 for PrefetchVirtualMemory
+ add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+ endif()
+
  #
  # build the library
  #
@@ -199,10 +214,6 @@ endif()
  include(GNUInstallDirs)
  include(CMakePackageConfigHelpers)

- set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
- set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
- set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
-
  set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
  set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
  set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
package/src/llama.cpp/common/CMakeLists.txt CHANGED
@@ -7,8 +7,8 @@ llama_add_compile_flags()
  # Build info header
  #

- if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
- set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")

  # Is git submodule
  if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
  if (SLASH_POS EQUAL 0)
  set(GIT_DIR "${REAL_GIT_DIR}")
  else()
- set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
  endif()
  endif()

  if(EXISTS "${GIT_DIR}/index")
- set(GIT_INDEX "${GIT_DIR}/index")
+ # For build-info.cpp below
+ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
  else()
  message(WARNING "Git index not found in git repository.")
- set(GIT_INDEX "")
  endif()
  else()
  message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
- set(GIT_INDEX "")
  endif()

- # Add a custom command to rebuild build-info.cpp when .git/index changes
- add_custom_command(
- OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
- COMMENT "Generating build details from Git"
- COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
- -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
- -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
- -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
- -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
- WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
- DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
- VERBATIM
- )
+ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
+ set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
+ configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+
  set(TARGET build_info)
- add_library(${TARGET} OBJECT build-info.cpp)
+ add_library(${TARGET} OBJECT ${OUTPUT_FILE})
  if (BUILD_SHARED_LIBS)
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
  endif()
@@ -58,19 +48,20 @@ add_library(${TARGET} STATIC
  arg.cpp
  arg.h
  base64.hpp
+ chat-parser.cpp
+ chat-parser.h
  chat.cpp
  chat.h
  common.cpp
  common.h
  console.cpp
  console.h
+ json-partial.cpp
+ json-partial.h
  json-schema-to-grammar.cpp
- json.hpp
  llguidance.cpp
  log.cpp
  log.h
- minja/chat-template.hpp
- minja/minja.hpp
  ngram-cache.cpp
  ngram-cache.h
  regex-partial.cpp
@@ -143,7 +134,7 @@ if (LLAMA_LLGUIDANCE)
  set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
  endif ()

- target_include_directories(${TARGET} PUBLIC .)
+ target_include_directories(${TARGET} PUBLIC . ../vendor)
  target_compile_features (${TARGET} PUBLIC cxx_std_17)
  target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)