@fugood/llama.node 1.4.12 → 1.4.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +11 -1
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +2 -0
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -9
  7. package/src/LlamaContext.cpp +5 -2
  8. package/src/llama.cpp/common/arg.cpp +249 -101
  9. package/src/llama.cpp/common/arg.h +0 -8
  10. package/src/llama.cpp/common/chat.cpp +4 -4
  11. package/src/llama.cpp/common/common.cpp +21 -1
  12. package/src/llama.cpp/common/common.h +20 -7
  13. package/src/llama.cpp/common/download.cpp +104 -55
  14. package/src/llama.cpp/common/download.h +26 -5
  15. package/src/llama.cpp/common/llguidance.cpp +10 -6
  16. package/src/llama.cpp/common/preset.cpp +76 -1
  17. package/src/llama.cpp/common/preset.h +10 -1
  18. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  19. package/src/llama.cpp/common/sampling.cpp +58 -14
  20. package/src/llama.cpp/common/sampling.h +3 -1
  21. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  22. package/src/llama.cpp/include/llama.h +92 -10
  23. package/src/llama.cpp/src/llama-arch.cpp +2 -0
  24. package/src/llama.cpp/src/llama-arch.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +615 -28
  26. package/src/llama.cpp/src/llama-context.h +43 -1
  27. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  28. package/src/llama.cpp/src/llama-grammar.h +2 -0
  29. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  30. package/src/llama.cpp/src/llama-graph.h +71 -6
  31. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  32. package/src/llama.cpp/src/llama-hparams.h +8 -2
  33. package/src/llama.cpp/src/llama-mmap.cpp +70 -37
  34. package/src/llama.cpp/src/llama-mmap.h +5 -4
  35. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  36. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  37. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  38. package/src/llama.cpp/src/llama-model.cpp +66 -16
  39. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  40. package/src/llama.cpp/src/llama-sampling.cpp +1233 -171
  41. package/src/llama.cpp/src/llama-sampling.h +16 -7
  42. package/src/llama.cpp/src/llama.cpp +101 -57
  43. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  44. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  45. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  46. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  47. package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
  48. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  49. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
package/lib/binding.ts CHANGED
@@ -112,7 +112,7 @@ export type CompletionResponseFormat = {
 export type LlamaCompletionOptions = {
   messages?: ChatMessage[]
   jinja?: boolean
-  reasoning_format?: string
+  reasoning_format?: 'none' | 'auto' | 'deepseek'
   chat_template?: string
   response_format?: CompletionResponseFormat
   tools?: Tool[]
@@ -200,6 +200,13 @@ export type LlamaParallelCompletionOptions = LlamaCompletionOptions & {
    */
   save_state_path?: string
 
+  /**
+   * File path to save prompt-only state to after prompt processing.
+   * Useful for fast prompt reuse (especially for recurrent/hybrid models).
+   * Example: `'/path/to/prompt_state.bin'` or `'file:///path/to/prompt_state.bin'`
+   */
+  save_prompt_state_path?: string
+
   /**
    * Number of tokens to load when loading state.
    * If not specified or <= 0, all tokens from the state file will be loaded.
@@ -363,6 +370,8 @@ export type ModelInfo = {
   nEmbd: number
   nParams: number
   size: number
+  is_recurrent: boolean
+  is_hybrid: boolean
   chatTemplates: {
     llamaChat: boolean
     minja: {
@@ -475,6 +484,7 @@ export interface LlamaContext {
     parallel_tool_calls?: boolean
     tool_choice?: string
     enable_thinking?: boolean
+    reasoning_format?: 'none' | 'auto' | 'deepseek'
     add_generation_prompt?: boolean
     now?: string | number
     chat_template_kwargs?: Record<string, string>
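The net effect of the binding.ts changes: `reasoning_format` narrows from `string` to the literal union `'none' | 'auto' | 'deepseek'`, `LlamaParallelCompletionOptions` gains `save_prompt_state_path`, and `ModelInfo` reports `is_recurrent` / `is_hybrid`. A minimal sketch of the new option shapes; the relative `./binding` import mirrors the package's own internal imports, and the `{ role, content }` message shape and the file path are assumptions for illustration:

```ts
import type {
  LlamaCompletionOptions,
  LlamaParallelCompletionOptions,
} from './binding'

// The narrowed union now rejects arbitrary strings at compile time.
const base: LlamaCompletionOptions = {
  messages: [{ role: 'user', content: 'Hello' }], // assumed message shape
  jinja: true,
  reasoning_format: 'deepseek', // 'none' | 'auto' | 'deepseek'
}

// Parallel-only extras, including the new prompt-state snapshot path.
const options: LlamaParallelCompletionOptions = {
  ...base,
  save_prompt_state_path: '/tmp/prompt_state.bin', // hypothetical path
}
```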
package/lib/index.js CHANGED
@@ -87,7 +87,7 @@ class LlamaContextWrapper {
         return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
     }
     getFormattedChat(messages, template, params) {
-        var _a, _b;
+        var _a, _b, _c;
         const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
         const useJinja = this.isJinjaSupported() && ((_a = params === null || params === void 0 ? void 0 : params.jinja) !== null && _a !== void 0 ? _a : true);
         let tmpl;
@@ -100,6 +100,7 @@ class LlamaContextWrapper {
             parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
             tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
             enable_thinking: (_b = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _b !== void 0 ? _b : true,
+            reasoning_format: (_c = params === null || params === void 0 ? void 0 : params.reasoning_format) !== null && _c !== void 0 ? _c : 'none',
             add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
             now: params === null || params === void 0 ? void 0 : params.now,
             chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
package/lib/index.ts CHANGED
@@ -118,6 +118,7 @@ class LlamaContextWrapper {
       parallel_tool_calls?: boolean
       tool_choice?: string
       enable_thinking?: boolean
+      reasoning_format?: 'none' | 'auto' | 'deepseek'
       add_generation_prompt?: boolean
       now?: string | number
       chat_template_kwargs?: Record<string, string>
@@ -136,6 +137,7 @@ class LlamaContextWrapper {
       parallel_tool_calls: params?.parallel_tool_calls,
       tool_choice: params?.tool_choice,
       enable_thinking: params?.enable_thinking ?? true,
+      reasoning_format: params?.reasoning_format ?? 'none',
       add_generation_prompt: params?.add_generation_prompt,
       now: params?.now,
       chat_template_kwargs: params?.chat_template_kwargs
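Both the compiled wrapper (index.js) and its TypeScript source now accept `reasoning_format` in the `getFormattedChat` params and default it to `'none'` when omitted. A sketch of a call, with the wrapper instance left abstract since constructing it is outside this diff; the structural type below is a stand-in, not the package's exported API:

```ts
// Structural stand-in for the LlamaContextWrapper surface touched by this diff;
// the real wrapper comes from the package's model-loading API (not shown here).
interface FormattedChatParams {
  jinja?: boolean
  enable_thinking?: boolean
  reasoning_format?: 'none' | 'auto' | 'deepseek'
}

declare const wrapper: {
  getFormattedChat(messages: unknown[], template?: string, params?: FormattedChatParams): unknown
}

// Omitting reasoning_format falls back to 'none' via `params?.reasoning_format ?? 'none'`.
const formatted = wrapper.getFormattedChat(
  [{ role: 'user', content: 'Hello' }], // assumed message shape
  undefined,
  { jinja: true, reasoning_format: 'auto' },
)
console.log(formatted)
```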
package/lib/parallel.ts CHANGED
@@ -1,10 +1,10 @@
 // Parallel decoding API implementation for llama.node
 import type {
   LlamaContext,
-  LlamaCompletionOptions,
   LlamaCompletionToken,
   RerankParams,
   ParallelStatus,
+  LlamaParallelCompletionOptions,
 } from './binding'
 import { formatMediaChat } from './utils'
 
@@ -68,7 +68,7 @@ export class LlamaParallelAPI {
    * @returns Object with requestId, promise for result, and stop function
    */
   async completion(
-    options: LlamaCompletionOptions,
+    options: LlamaParallelCompletionOptions,
     onToken?: (requestId: number, data: LlamaCompletionToken) => void,
   ): Promise<{
     requestId: number
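With `completion()` now typed against `LlamaParallelCompletionOptions`, parallel-only fields such as `save_prompt_state_path` type-check at the call site. A sketch under the assumption that a `LlamaParallelAPI` instance already exists (its construction is not part of this hunk), using only the `requestId` field visible in the return type here:

```ts
import type { LlamaCompletionToken } from './binding'
import type { LlamaParallelAPI } from './parallel'

async function startRequest(parallel: LlamaParallelAPI) {
  const handle = await parallel.completion(
    {
      messages: [{ role: 'user', content: 'Summarize the release.' }], // assumed message shape
      reasoning_format: 'auto',
      save_prompt_state_path: '/tmp/prompt_state.bin', // hypothetical path
    },
    (requestId: number, token: LlamaCompletionToken) => {
      // Per-token streaming callback, one stream per request id.
      console.log(requestId, token)
    },
  )
  // The JSDoc above also documents a result promise and a stop function on the
  // returned object, but only requestId is visible in this hunk.
  console.log('started request', handle.requestId)
}
```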
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.12",
+  "version": "1.4.14",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.12",
-    "@fugood/node-llama-darwin-x64": "1.4.12",
-    "@fugood/node-llama-linux-arm64": "1.4.12",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.12",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.12",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.12",
-    "@fugood/node-llama-linux-x64": "1.4.12",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.12",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.12",
-    "@fugood/node-llama-win32-arm64": "1.4.12",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.12",
-    "@fugood/node-llama-win32-x64": "1.4.12",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.12",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.12"
+    "@fugood/node-llama-darwin-arm64": "1.4.14",
+    "@fugood/node-llama-darwin-x64": "1.4.14",
+    "@fugood/node-llama-linux-arm64": "1.4.14",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.14",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.14",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.14",
+    "@fugood/node-llama-linux-x64": "1.4.14",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.14",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.14",
+    "@fugood/node-llama-win32-arm64": "1.4.14",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.14",
+    "@fugood/node-llama-win32-x64": "1.4.14",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.14",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.14"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -32,7 +32,7 @@ index 1bcba9cd8..b7cd68734 100644
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
      int count = 0;
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index b98ab21ce..2f782837a 100644
+index 22e527bab..c3d0affca 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -7,9 +7,6 @@
@@ -96,10 +96,10 @@ index 8bd4a325f..333b3301f 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 79c475612..cf189f8bc 100644
+index 744f0b4ee..04fcebb9e 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -1342,6 +1342,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+@@ -1361,6 +1361,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      mparams.devices = params.devices.data();
  }
 
@@ -108,10 +108,10 @@ index 79c475612..cf189f8bc 100644
      mparams.main_gpu = params.main_gpu;
      mparams.split_mode = params.split_mode;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index f8bc686b6..555ba044a 100644
+index 7794c0268..5b77ae0c3 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -307,6 +307,7 @@ struct lr_opt {
+@@ -310,6 +310,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
  struct common_params {
@@ -133,10 +133,10 @@ index 7622d0bf4..d2edcfddb 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-index 13b96d61f..5fa163442 100644
+index 365a24b49..83bf4ee62 100644
 --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
 +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-@@ -2680,9 +2680,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
+@@ -2798,9 +2798,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
  GGML_UNUSED(dev);
  }
 
@@ -163,7 +163,7 @@ index 13b96d61f..5fa163442 100644
  *total = *free;
 
  GGML_UNUSED(dev);
-@@ -2879,10 +2894,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+@@ -3010,10 +3025,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  }
  }
 
@@ -185,7 +185,7 @@ index 13b96d61f..5fa163442 100644
 
  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
 
-@@ -2895,6 +2917,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+@@ -3026,6 +3048,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  } catch (const std::exception & exc) {
  GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
  devices[i].context = nullptr;
package/src/LlamaContext.cpp CHANGED
@@ -595,6 +595,8 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
   details.Set("nEmbd", llama_model_n_embd(model));
   details.Set("nParams", llama_model_n_params(model));
   details.Set("size", llama_model_size(model));
+  details.Set("is_recurrent", llama_model_is_recurrent(model));
+  details.Set("is_hybrid", llama_model_is_hybrid(model));
 
   Napi::Object chatTemplates = Napi::Object::New(info.Env());
   chatTemplates.Set("llamaChat", _rn_ctx->validateModelChatTemplate(false, nullptr));
@@ -703,6 +705,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
       get_option<bool>(params, "parallel_tool_calls", false);
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");
   auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
+  auto reasoning_format = get_option<std::string>(params, "reasoning_format", "none");
   auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
   auto now_str = get_option<std::string>(params, "now", "");
 
@@ -721,7 +724,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   try {
     chatParams = _rn_ctx->getFormattedChatWithJinja(
         messages, chat_template, json_schema_str, tools_str,
-        parallel_tool_calls, tool_choice, enable_thinking,
+        parallel_tool_calls, tool_choice, enable_thinking, reasoning_format,
         add_generation_prompt, now_str, chat_template_kwargs);
   } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
@@ -962,7 +965,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   try {
     chatParams = _rn_ctx->getFormattedChatWithJinja(
         json_stringify(messages), chat_template,
-        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking,
+        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking, reasoning_format,
         add_generation_prompt, now_str, chat_template_kwargs);
   } catch (const std::exception &e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
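On the native side, `GetModelInfo` now forwards `llama_model_is_recurrent` / `llama_model_is_hybrid` into the JS `ModelInfo` object, and `reasoning_format` is threaded through to `getFormattedChatWithJinja` in both `GetFormattedChat` and `Completion`. A small sketch of how a caller might branch on the new flags, tying back to the `save_prompt_state_path` JSDoc above, which notes prompt-state reuse is most useful for recurrent/hybrid models (the helper below is illustrative, not part of the package):

```ts
import type { ModelInfo } from './binding'

// Heuristic sketch: save a prompt-only state snapshot when the model is
// recurrent or hybrid, per the save_prompt_state_path documentation above.
function shouldSavePromptState(info: ModelInfo): boolean {
  return info.is_recurrent || info.is_hybrid
}
```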