@fugood/llama.node 1.1.6 → 1.1.8
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaCompletionWorker.cpp +73 -20
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/LlamaContext.cpp +9 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +132 -41
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +311 -9
- package/src/llama.cpp/common/chat.h +4 -1
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +46 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +28 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/include/llama.h +25 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +2 -4
- package/src/llama.cpp/src/llama-context.cpp +29 -22
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +81 -70
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
package/lib/binding.ts
CHANGED
@@ -167,6 +167,10 @@ export type LlamaCompletionResult = {
 
 export type LlamaCompletionToken = {
   token: string
+  content?: string
+  reasoning_content?: string
+  tool_calls?: ToolCall[]
+  accumulated_text?: string
 }
 
 export type TokenizeResult = {
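The new optional fields on LlamaCompletionToken surface the partially parsed chat output on every streamed token, so a callback no longer has to re-parse accumulated_text itself. A minimal sketch of a token handler in TypeScript (assuming LlamaCompletionToken and ToolCall are re-exported from the package root, which lib/index.ts does via export * from './binding'):

import type { LlamaCompletionToken } from '@fugood/llama.node'

// Called for every streamed token; the optional fields carry the partially
// parsed chat output and may be absent while the parse is still ambiguous.
const onToken = (data: LlamaCompletionToken) => {
  process.stdout.write(data.token) // raw token text, unchanged behavior
  if (data.reasoning_content) {
    console.error('[reasoning so far]', data.reasoning_content)
  }
  if (data.content) {
    console.error('[content so far]', data.content)
  }
  for (const call of data.tool_calls ?? []) {
    // each entry mirrors { type: 'function', function: { name, arguments }, id? }
    console.error('[tool call]', call.function?.name, call.function?.arguments)
  }
  // data.accumulated_text holds the full raw generation so far
}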
package/lib/index.js
CHANGED
@@ -23,9 +23,10 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
+exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
+const version_1 = require("./version");
 __exportStar(require("./binding"), exports);
 exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
@@ -259,3 +260,7 @@ const loadLlamaModelInfo = (path) => __awaiter(void 0, void 0, void 0, function*
 return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip);
 });
 exports.loadLlamaModelInfo = loadLlamaModelInfo;
+exports.BuildInfo = {
+    number: version_1.BUILD_NUMBER,
+    commit: version_1.BUILD_COMMIT,
+};
package/lib/index.ts
CHANGED
@@ -17,6 +17,7 @@ import type {
   Tool,
   GGUFModelInfo,
 } from './binding'
+import { BUILD_NUMBER, BUILD_COMMIT } from './version'
 
 export * from './binding'
 
@@ -353,3 +354,8 @@ export const loadLlamaModelInfo = async (path: string): Promise<GGUFModelInfo> =
   refreshNativeLogSetup()
   return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip)
 }
+
+export const BuildInfo = {
+  number: BUILD_NUMBER,
+  commit: BUILD_COMMIT,
+}
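The diff above adds a BuildInfo export alongside the existing functions; it exposes the llama.cpp build number and commit recorded in the generated lib/version module. A small usage sketch:

import { BuildInfo } from '@fugood/llama.node'

// Report which llama.cpp build the installed binding was produced from.
console.log(`llama.cpp build ${BuildInfo.number} (commit ${BuildInfo.commit})`)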
package/lib/version.js
ADDED
package/lib/version.ts
ADDED
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.6",
+  "version": "1.1.8",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.
-    "@fugood/node-llama-linux-x64-cuda": "1.1.
-    "@fugood/node-llama-linux-arm64": "1.1.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.
-    "@fugood/node-llama-win32-x64": "1.1.
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.
-    "@fugood/node-llama-win32-x64-cuda": "1.1.
-    "@fugood/node-llama-win32-arm64": "1.1.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.
-    "@fugood/node-llama-darwin-x64": "1.1.
-    "@fugood/node-llama-darwin-arm64": "1.1.
+    "@fugood/node-llama-linux-x64": "1.1.8",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.8",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.8",
+    "@fugood/node-llama-linux-arm64": "1.1.8",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.8",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.8",
+    "@fugood/node-llama-win32-x64": "1.1.8",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.8",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.8",
+    "@fugood/node-llama-win32-arm64": "1.1.8",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.8",
+    "@fugood/node-llama-darwin-x64": "1.1.8",
+    "@fugood/node-llama-darwin-arm64": "1.1.8"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -1,5 +1,5 @@
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index 
+index 23d3828f9..ca48af00c 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -30,7 +30,7 @@ index 60805ab3..71b4236a 100644
 json messages;
 json tools;
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
-index 
+index d1e480c91..437e64e29 100644
 --- a/src/llama.cpp/common/chat.h
 +++ b/src/llama.cpp/common/chat.h
 @@ -9,7 +9,18 @@
@@ -54,10 +54,10 @@ index b014f9f0..3a868797 100644
 struct common_chat_tool_call {
 std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 
+index 67dd5404f..909a97c66 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1117,6 +1117,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }
 
@@ -66,11 +66,11 @@ index c6962d1d..ba5a4786 100644
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 
+index 75596e6b3..0e04694c8 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
-
+@@ -267,6 +267,7 @@ struct lr_opt {
+struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
 struct common_params {
 + bool vocab_only = false;
@@ -78,7 +78,7 @@ index 6c1c7ee2..c3eb0552 100644
 int32_t n_ctx = 4096; // context size
 int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index 
+index ce0a3e128..df9300224 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -91,7 +91,7 @@ index f188d163..0c33acad 100644
 check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
 if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
-index 
+index b97e7bf99..c3eb9519f 100644
 --- a/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt
 @@ -111,7 +111,7 @@ if (Vulkan_FOUND)
package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -55,6 +55,32 @@ LlamaCompletionWorker::~LlamaCompletionWorker() {
 }
 }
 
+LlamaCompletionWorker::PartialOutput LlamaCompletionWorker::getPartialOutput(const std::string &generated_text) {
+  PartialOutput result;
+
+  try {
+    common_chat_syntax chat_syntax;
+    chat_syntax.format = static_cast<common_chat_format>(_chat_format);
+    chat_syntax.thinking_forced_open = _thinking_forced_open;
+
+    // Set reasoning format using the common function
+    chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
+
+    chat_syntax.parse_tool_calls = true;
+
+    // Use is_partial=true for streaming partial output
+    common_chat_msg parsed_msg = common_chat_parse(generated_text, true, chat_syntax);
+
+    result.content = parsed_msg.content;
+    result.reasoning_content = parsed_msg.reasoning_content;
+    result.tool_calls = parsed_msg.tool_calls;
+  } catch (const std::exception &e) {
+    // If parsing fails, leave content empty - this is expected for partial content
+  }
+
+  return result;
+}
+
 void LlamaCompletionWorker::Execute() {
 _sess->get_mutex().lock();
 const auto t_main_start = ggml_time_us();
@@ -222,6 +248,13 @@ void LlamaCompletionWorker::Execute() {
 
 // sample the next token
 llama_token new_token_id = common_sampler_sample(sampling.get(), ctx, -1);
+
+    // is it an end of generation?
+    if (llama_vocab_is_eog(vocab, new_token_id)) {
+      _result.stopped_eos = true;
+      break;
+    }
+
 if (_next_token_uses_guide_token && !_guide_tokens.empty() &&
 !llama_vocab_is_control(vocab, new_token_id) &&
 !llama_vocab_is_eog(vocab, new_token_id)) {
@@ -250,21 +283,49 @@ void LlamaCompletionWorker::Execute() {
 if (_has_callback) {
 // TODO: When we got possible stop words (startsWith)
 // we should avoid calling the callback, wait for the next token
-
-
-
+      struct TokenData {
+        std::string token;
+        std::string content;
+        std::string reasoning_content;
+        std::vector<common_chat_tool_call> tool_calls;
+        std::string accumulated_text;
+      };
+
+      auto partial = getPartialOutput(_result.text);
+      TokenData *token_data = new TokenData{token, partial.content, partial.reasoning_content, partial.tool_calls, _result.text};
+
+      _tsfn.BlockingCall(token_data, [](Napi::Env env, Napi::Function jsCallback,
+                                        TokenData *data) {
 auto obj = Napi::Object::New(env);
-obj.Set("token", Napi::String::New(env,
-
+        obj.Set("token", Napi::String::New(env, data->token));
+        if (!data->content.empty()) {
+          obj.Set("content", Napi::String::New(env, data->content));
+        }
+        if (!data->reasoning_content.empty()) {
+          obj.Set("reasoning_content", Napi::String::New(env, data->reasoning_content));
+        }
+        if (!data->tool_calls.empty()) {
+          Napi::Array tool_calls = Napi::Array::New(env);
+          for (size_t i = 0; i < data->tool_calls.size(); i++) {
+            const auto &tc = data->tool_calls[i];
+            Napi::Object tool_call = Napi::Object::New(env);
+            tool_call.Set("type", "function");
+            Napi::Object function = Napi::Object::New(env);
+            function.Set("name", tc.name);
+            function.Set("arguments", tc.arguments);
+            tool_call.Set("function", function);
+            if (!tc.id.empty()) {
+              tool_call.Set("id", tc.id);
+            }
+            tool_calls.Set(i, tool_call);
+          }
+          obj.Set("tool_calls", tool_calls);
+        }
+        obj.Set("accumulated_text", Napi::String::New(env, data->accumulated_text));
+        delete data;
 jsCallback.Call({obj});
 });
 }
-    // is it an end of generation?
-    if (llama_vocab_is_eog(vocab, new_token_id)) {
-      _result.stopped_eos = true;
-      // TODO: EOS token should be cut
-      break;
-    }
 // check for stop words
 if (!_stop_words.empty()) {
 const size_t stop_pos =
@@ -316,15 +377,7 @@ void LlamaCompletionWorker::OnOK() {
 
 chat_syntax.thinking_forced_open = _thinking_forced_open;
 
-
-    chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
-  } else if (_reasoning_format == "deepseek-legacy") {
-    chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
-  } else if (_reasoning_format == "auto") {
-    chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
-  } else {
-    chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE;
-  }
+  chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
 common_chat_msg message = common_chat_parse(
 _result.text,
 false,
package/src/LlamaCompletionWorker.h
CHANGED

@@ -42,6 +42,14 @@ protected:
 void OnError(const Napi::Error &err) override;
 
 private:
+  struct PartialOutput {
+    std::string content = "";
+    std::string reasoning_content = "";
+    std::vector<common_chat_tool_call> tool_calls;
+  };
+
+  PartialOutput getPartialOutput(const std::string &generated_text);
+
 LlamaSessionPtr _sess;
 common_params _params;
 std::vector<std::string> _stop_words;
package/src/LlamaContext.cpp
CHANGED
@@ -636,6 +636,15 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
 _sess, _templates, messages, chat_template, json_schema_str, tools_str,
 parallel_tool_calls, tool_choice, enable_thinking,
 add_generation_prompt, now_str, chat_template_kwargs);
+  } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  } catch (const std::invalid_argument& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  } catch (const std::runtime_error& e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
 } catch (const std::exception &e) {
 Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
 return env.Undefined();
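GetFormattedChat now catches JSON parse errors, invalid arguments, and runtime errors separately and rethrows each as a JavaScript exception, so malformed messages or tools strings fail gracefully instead of escaping the native layer. A sketch of the calling pattern on the JS side; the method name and argument shape here are assumptions inferred from the native binding, not confirmed API:

// Sketch only: `ctx` is a loaded context and getFormattedChat is the assumed
// JS-side name of the native LlamaContext::GetFormattedChat method.
const formatChatSafely = (ctx: any, messagesJson: string): string | undefined => {
  try {
    return ctx.getFormattedChat(messagesJson)
  } catch (err) {
    // Bad message/tool JSON or an invalid template argument now surfaces here
    // as a regular JS error instead of an unhandled native exception.
    console.error('chat formatting failed:', (err as Error).message)
    return undefined
  }
}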
package/src/common.hpp
CHANGED
@@ -461,7 +461,14 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
 }
 
 // Clear all KV cache entries after position n_past
-
+  auto * kv = llama_get_memory(ctx);
+  bool clear_result = llama_memory_seq_rm(kv, 0, n_past, -1);
+  if (!clear_result) {
+    fprintf(stdout, "[DEBUG] llama_memory_seq_rm failed (likely using a non-Transformer model)! Trying full clear...");
+    llama_memory_clear(kv, false);
+    n_past = 0;
+    new_n_past = n_past;
+  }
 
 size_t num_chunks = mtmd_input_chunks_size(chunks);
 
package/src/llama.cpp/CMakeLists.txt
CHANGED

@@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
+
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
 
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -749,6 +749,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 // utils
 //
 
+// Helper function to parse tensor buffer override strings
+static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
+    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto * buft = ggml_backend_dev_buffer_type(dev);
+        if (buft) {
+            buft_list[ggml_backend_buft_name(buft)] = buft;
+        }
+    }
+
+    for (const auto & override : string_split<std::string>(value, ',')) {
+        std::string::size_type pos = override.find('=');
+        if (pos == std::string::npos) {
+            throw std::invalid_argument("invalid value");
+        }
+        std::string tensor_name = override.substr(0, pos);
+        std::string buffer_type = override.substr(pos + 1);
+
+        if (buft_list.find(buffer_type) == buft_list.end()) {
+            printf("Available buffer types:\n");
+            for (const auto & it : buft_list) {
+                printf("  %s\n", ggml_backend_buft_name(it.second));
+            }
+            throw std::invalid_argument("unknown buffer type");
+        }
+        // keep strings alive and avoid leaking memory by storing them in a static vector
+        static std::list<std::string> buft_overrides;
+        buft_overrides.push_back(tensor_name);
+        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
+    }
+}
+
 struct handle_model_result {
 bool found_mmproj = false;
 common_params_model mmproj;
@@ -993,6 +1026,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 params.tensor_buft_overrides.push_back({nullptr, nullptr});
 }
 
+    if (!params.speculative.tensor_buft_overrides.empty()) {
+        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
 if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
 throw std::runtime_error(string_format(
 "error: the supplied chat template is not supported: %s%s\n",
@@ -1201,6 +1238,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 common_params_print_completion(ctx_arg);
 exit(0);
 }
+        params.lr.init();
 } catch (const std::invalid_argument & ex) {
 fprintf(stderr, "%s\n", ex.what());
 ctx_arg.params = params_org;
@@ -1469,6 +1507,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.swa_full = true;
 }
 ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--swa-checkpoints"}, "N",
+        string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
+        [](common_params & params, int value) {
+            params.n_swa_checkpoints = value;
+        }
+    ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
 add_opt(common_arg(
 {"--kv-unified", "-kvu"},
 string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -1484,6 +1530,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.ctx_shift = false;
 }
 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    add_opt(common_arg(
+        {"--context-shift"},
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        [](common_params & params) {
+            params.ctx_shift = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
 add_opt(common_arg(
 {"--chunks"}, "N",
 string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1777,7 +1830,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 [](common_params & params, const std::string & value) {
 params.sampling.top_n_sigma = std::stof(value);
 }
-    ).
+    ).set_sparam());
 add_opt(common_arg(
 {"--xtc-probability"}, "N",
 string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -2349,40 +2402,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 add_opt(common_arg(
 {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
 "override tensor buffer type", [](common_params & params, const std::string & value) {
-
-            if (buft_list.empty()) {
-                // enumerate all the devices and add their buffer types to the list
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    auto * dev = ggml_backend_dev_get(i);
-                    auto * buft = ggml_backend_dev_buffer_type(dev);
-                    if (buft) {
-                        buft_list[ggml_backend_buft_name(buft)] = buft;
-                    }
-                }
-            }
-
-            for (const auto & override : string_split<std::string>(value, ',')) {
-                std::string::size_type pos = override.find('=');
-                if (pos == std::string::npos) {
-                    throw std::invalid_argument("invalid value");
-                }
-                std::string tensor_name = override.substr(0, pos);
-                std::string buffer_type = override.substr(pos + 1);
-
-                if (buft_list.find(buffer_type) == buft_list.end()) {
-                    printf("Available buffer types:\n");
-                    for (const auto & it : buft_list) {
-                        printf("  %s\n", ggml_backend_buft_name(it.second));
-                    }
-                    throw std::invalid_argument("unknown buffer type");
-                }
-                // keep strings alive and avoid leaking memory by storing them in a static vector
-                static std::list<std::string> buft_overrides;
-                buft_overrides.push_back(tensor_name);
-                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
-            }
+            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
 }
 ));
+    add_opt(common_arg(
+        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
+            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
 add_opt(common_arg(
 {"--cpu-moe", "-cmoe"},
 "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -2405,6 +2433,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 }
 }
 ).set_env("LLAMA_ARG_N_CPU_MOE"));
+    add_opt(common_arg(
+        {"--cpu-moe-draft", "-cmoed"},
+        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
+        [](common_params & params) {
+            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    add_opt(common_arg(
+        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                static std::list<std::string> buft_overrides_draft;
+                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
 add_opt(common_arg(
 {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
 "number of layers to store in VRAM",
@@ -2655,7 +2704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 [](common_params & params, const std::string & value) {
 params.out_file = value;
 }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
 add_opt(common_arg(
 {"-ofreq", "--output-frequency"}, "N",
 string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2949,11 +2998,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
 "(default: auto)",
 [](common_params & params, const std::string & value) {
-
-            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
-            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
-            else { throw std::invalid_argument("invalid value"); }
+            params.reasoning_format = common_reasoning_format_from_name(value);
 }
 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
 add_opt(common_arg(
@@ -3134,7 +3179,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
 }
 }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
 add_opt(common_arg(
 {"-tbd", "--threads-batch-draft"}, "N",
 "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3144,7 +3189,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
 }
 }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
 add_opt(common_arg(
 {"-Cd", "--cpu-mask-draft"}, "M",
 "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
@@ -3537,5 +3582,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
 
+    add_opt(
+        common_arg({ "-lr", "--learning-rate" }, "ALPHA",
+            string_format(
+                "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
+                (double) params.lr.lr0),
+            [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(
+        common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
+            string_format(
+                "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
+                (double) params.lr.lr_min),
+            [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(
+        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
+            string_format(
+                "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
+                (double) params.lr.decay_epochs),
+            [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
+            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg(
+        { "-wd", "--weight-decay" }, "WD",
+        string_format(
+            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
+            (double) params.lr.wd),
+        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
+        string_format("fraction of data to use as validation set for training (default: %.2g).",
+            (double) params.val_split),
+        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
+        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
+        [](common_params & params, int epochs) { params.lr.epochs = epochs; })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
+        [](common_params & params, const std::string & name) {
+            params.optimizer = common_opt_get_optimizer(name.c_str());
+            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
+                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
+            }
+        })
+        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+
 return ctx_arg;
 }
package/src/llama.cpp/common/chat-parser.cpp
CHANGED

@@ -55,7 +55,15 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
 std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
 std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
-std::string arguments =
+    std::string arguments = "";
+    if (tool_call.contains("arguments")) {
+        if (tool_call.at("arguments").is_object()) {
+            arguments = tool_call.at("arguments").dump();
+        } else {
+            arguments = tool_call.at("arguments");
+        }
+    }
+
 return add_tool_call(name, id, arguments);
 }
 