@fugood/llama.node 1.4.12 → 1.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +11 -1
- package/lib/index.js +2 -1
- package/lib/index.ts +2 -0
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaContext.cpp +5 -2
- package/src/llama.cpp/common/arg.cpp +249 -101
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +21 -1
- package/src/llama.cpp/common/common.h +20 -7
- package/src/llama.cpp/common/download.cpp +104 -55
- package/src/llama.cpp/common/download.h +26 -5
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/preset.cpp +76 -1
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +92 -10
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-mmap.cpp +70 -37
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +66 -16
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1233 -171
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +101 -57
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
package/lib/binding.ts
CHANGED
@@ -112,7 +112,7 @@ export type CompletionResponseFormat = {
 export type LlamaCompletionOptions = {
   messages?: ChatMessage[]
   jinja?: boolean
-  reasoning_format?:
+  reasoning_format?: 'none' | 'auto' | 'deepseek'
   chat_template?: string
   response_format?: CompletionResponseFormat
   tools?: Tool[]
@@ -200,6 +200,13 @@ export type LlamaParallelCompletionOptions = LlamaCompletionOptions & {
    */
   save_state_path?: string
 
+  /**
+   * File path to save prompt-only state to after prompt processing.
+   * Useful for fast prompt reuse (especially for recurrent/hybrid models).
+   * Example: `'/path/to/prompt_state.bin'` or `'file:///path/to/prompt_state.bin'`
+   */
+  save_prompt_state_path?: string
+
   /**
    * Number of tokens to load when loading state.
    * If not specified or <= 0, all tokens from the state file will be loaded.
@@ -363,6 +370,8 @@ export type ModelInfo = {
   nEmbd: number
   nParams: number
   size: number
+  is_recurrent: boolean
+  is_hybrid: boolean
   chatTemplates: {
     llamaChat: boolean
     minja: {
@@ -475,6 +484,7 @@ export interface LlamaContext {
     parallel_tool_calls?: boolean
     tool_choice?: string
     enable_thinking?: boolean
+    reasoning_format?: 'none' | 'auto' | 'deepseek'
     add_generation_prompt?: boolean
     now?: string | number
     chat_template_kwargs?: Record<string, string>
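
A minimal type-level sketch of how these additions surface to TypeScript callers. The import path and the `ChatMessage` object shape shown here are illustrative assumptions, not part of this diff:

```ts
import type {
  LlamaCompletionOptions,
  LlamaParallelCompletionOptions,
  ModelInfo,
} from '@fugood/llama.node/lib/binding' // illustrative path; use the package's actual export

// `reasoning_format` is now a typed union: 'none' | 'auto' | 'deepseek'.
const completionOptions: LlamaCompletionOptions = {
  messages: [{ role: 'user', content: 'Why is the sky blue?' }], // ChatMessage shape assumed
  jinja: true,
  reasoning_format: 'deepseek',
}

// Parallel requests can additionally ask for a prompt-only state snapshot.
const parallelOptions: LlamaParallelCompletionOptions = {
  ...completionOptions,
  save_prompt_state_path: '/tmp/prompt_state.bin',
}

// ModelInfo now reports recurrent/hybrid architecture flags.
function describeModel(info: ModelInfo): string {
  return `recurrent=${info.is_recurrent} hybrid=${info.is_hybrid}`
}
```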
package/lib/index.js
CHANGED
@@ -87,7 +87,7 @@ class LlamaContextWrapper {
         return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
     }
     getFormattedChat(messages, template, params) {
-        var _a, _b;
+        var _a, _b, _c;
         const { messages: chat, has_media, media_paths } = (0, utils_1.formatMediaChat)(messages);
         const useJinja = this.isJinjaSupported() && ((_a = params === null || params === void 0 ? void 0 : params.jinja) !== null && _a !== void 0 ? _a : true);
         let tmpl;
@@ -100,6 +100,7 @@ class LlamaContextWrapper {
             parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
             tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
             enable_thinking: (_b = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _b !== void 0 ? _b : true,
+            reasoning_format: (_c = params === null || params === void 0 ? void 0 : params.reasoning_format) !== null && _c !== void 0 ? _c : 'none',
            add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
            now: params === null || params === void 0 ? void 0 : params.now,
            chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs)
package/lib/index.ts
CHANGED
@@ -118,6 +118,7 @@ class LlamaContextWrapper {
       parallel_tool_calls?: boolean
       tool_choice?: string
       enable_thinking?: boolean
+      reasoning_format?: 'none' | 'auto' | 'deepseek'
       add_generation_prompt?: boolean
       now?: string | number
       chat_template_kwargs?: Record<string, string>
@@ -136,6 +137,7 @@ class LlamaContextWrapper {
       parallel_tool_calls: params?.parallel_tool_calls,
       tool_choice: params?.tool_choice,
       enable_thinking: params?.enable_thinking ?? true,
+      reasoning_format: params?.reasoning_format ?? 'none',
      add_generation_prompt: params?.add_generation_prompt,
      now: params?.now,
      chat_template_kwargs: params?.chat_template_kwargs
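
The wrapper change is small but caller-visible: `reasoning_format` is forwarded to the native formatter and falls back to `'none'` when omitted. A usage sketch, assuming `ctx` is an already-constructed `LlamaContextWrapper` and that passing `undefined` for the template (to use the model's built-in chat template) and the message shape are assumptions:

```ts
// `ctx` is assumed to be an existing LlamaContextWrapper instance.
const formatted = ctx.getFormattedChat(
  [{ role: 'user', content: 'Summarize this changelog.' }],
  undefined, // no explicit template: fall back to the model's chat template
  {
    jinja: true,
    enable_thinking: true,
    reasoning_format: 'auto', // leave out to get the default 'none'
  },
)
```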
package/lib/parallel.ts
CHANGED
@@ -1,10 +1,10 @@
 // Parallel decoding API implementation for llama.node
 import type {
   LlamaContext,
-  LlamaCompletionOptions,
   LlamaCompletionToken,
   RerankParams,
   ParallelStatus,
+  LlamaParallelCompletionOptions,
 } from './binding'
 import { formatMediaChat } from './utils'
 
@@ -68,7 +68,7 @@ export class LlamaParallelAPI {
    * @returns Object with requestId, promise for result, and stop function
    */
   async completion(
-    options:
+    options: LlamaParallelCompletionOptions,
     onToken?: (requestId: number, data: LlamaCompletionToken) => void,
   ): Promise<{
     requestId: number
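
With `completion()` now typed against `LlamaParallelCompletionOptions`, the parallel API accepts `save_prompt_state_path` directly. A sketch under the assumption that `parallel` is an already-initialized `LlamaParallelAPI` instance and the message and path values are placeholders:

```ts
// `parallel` is assumed to be an initialized LlamaParallelAPI instance.
const { requestId, promise, stop } = await parallel.completion(
  {
    messages: [{ role: 'user', content: 'Hello!' }],
    // New in 1.4.14: persist prompt-only state after prompt processing,
    // so later requests with the same prompt can reuse it.
    save_prompt_state_path: '/tmp/prompt_state.bin',
  },
  (id, data) => {
    // `data` is a LlamaCompletionToken; stream or log it as needed.
    console.log(id, data)
  },
)
// Per the JSDoc above: `stop()` cancels the request early if needed.
const result = await promise
console.log(requestId, result)
```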
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.
+  "version": "1.4.14",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.
-    "@fugood/node-llama-darwin-x64": "1.4.
-    "@fugood/node-llama-linux-arm64": "1.4.
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.
-    "@fugood/node-llama-linux-x64": "1.4.
-    "@fugood/node-llama-linux-x64-cuda": "1.4.
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.
-    "@fugood/node-llama-win32-arm64": "1.4.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.
-    "@fugood/node-llama-win32-x64": "1.4.
-    "@fugood/node-llama-win32-x64-cuda": "1.4.
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.
+    "@fugood/node-llama-darwin-arm64": "1.4.14",
+    "@fugood/node-llama-darwin-x64": "1.4.14",
+    "@fugood/node-llama-linux-arm64": "1.4.14",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.14",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.14",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.14",
+    "@fugood/node-llama-linux-x64": "1.4.14",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.14",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.14",
+    "@fugood/node-llama-win32-arm64": "1.4.14",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.14",
+    "@fugood/node-llama-win32-x64": "1.4.14",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.14",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.14"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -32,7 +32,7 @@ index 1bcba9cd8..b7cd68734 100644
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
      int count = 0;
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 22e527bab..c3d0affca 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -7,9 +7,6 @@
@@ -96,10 +96,10 @@ index 8bd4a325f..333b3301f 100644
  struct common_chat_tool_call {
      std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index 744f0b4ee..04fcebb9e 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1361,6 +1361,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
      mparams.devices = params.devices.data();
  }
 
@@ -108,10 +108,10 @@ index 79c475612..cf189f8bc 100644
      mparams.main_gpu = params.main_gpu;
      mparams.split_mode = params.split_mode;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 7794c0268..5b77ae0c3 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -310,6 +310,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
  struct common_params {
@@ -133,10 +133,10 @@ index 7622d0bf4..d2edcfddb 100644
      check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
      if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-index
+index 365a24b49..83bf4ee62 100644
 --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
 +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-@@ -
+@@ -2798,9 +2798,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
      GGML_UNUSED(dev);
  }
 
@@ -163,7 +163,7 @@ index 13b96d61f..5fa163442 100644
      *total = *free;
 
      GGML_UNUSED(dev);
-@@ -
+@@ -3010,10 +3025,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  }
  }
 
@@ -185,7 +185,7 @@ index 13b96d61f..5fa163442 100644
 
      GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
 
-@@ -
+@@ -3026,6 +3048,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  } catch (const std::exception & exc) {
      GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
      devices[i].context = nullptr;
package/src/LlamaContext.cpp
CHANGED
@@ -595,6 +595,8 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
   details.Set("nEmbd", llama_model_n_embd(model));
   details.Set("nParams", llama_model_n_params(model));
   details.Set("size", llama_model_size(model));
+  details.Set("is_recurrent", llama_model_is_recurrent(model));
+  details.Set("is_hybrid", llama_model_is_hybrid(model));
 
   Napi::Object chatTemplates = Napi::Object::New(info.Env());
   chatTemplates.Set("llamaChat", _rn_ctx->validateModelChatTemplate(false, nullptr));
@@ -703,6 +705,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
       get_option<bool>(params, "parallel_tool_calls", false);
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");
   auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
+  auto reasoning_format = get_option<std::string>(params, "reasoning_format", "none");
   auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
   auto now_str = get_option<std::string>(params, "now", "");
 
@@ -721,7 +724,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   try {
     chatParams = _rn_ctx->getFormattedChatWithJinja(
         messages, chat_template, json_schema_str, tools_str,
-        parallel_tool_calls, tool_choice, enable_thinking,
+        parallel_tool_calls, tool_choice, enable_thinking, reasoning_format,
         add_generation_prompt, now_str, chat_template_kwargs);
   } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
@@ -962,7 +965,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   try {
     chatParams = _rn_ctx->getFormattedChatWithJinja(
         json_stringify(messages), chat_template,
-        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking,
+        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking, reasoning_format,
         add_generation_prompt, now_str, chat_template_kwargs);
   } catch (const std::exception &e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
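
On the JS side, the two `GetModelInfo` additions pair naturally with the new prompt-state option: the JSDoc on `save_prompt_state_path` notes it is most useful for recurrent/hybrid models. A hedged sketch, assuming `ctx` exposes `getModelInfo()` returning `ModelInfo` and `parallel` is the initialized parallel API from the earlier example:

```ts
// Only persist prompt-only state when the model reports a recurrent or
// hybrid architecture, where re-processing the prompt is comparatively costly.
const { is_recurrent, is_hybrid } = ctx.getModelInfo()
const savePromptState = is_recurrent || is_hybrid

await parallel.completion({
  messages: [{ role: 'user', content: 'Continue the story.' }],
  ...(savePromptState ? { save_prompt_state_path: '/tmp/prompt_state.bin' } : {}),
})
```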