@fugood/llama.node 1.1.1 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +13 -0
- package/lib/index.js +3 -0
- package/lib/index.ts +6 -0
- package/package.json +14 -14
- package/src/LlamaCompletionWorker.cpp +3 -2
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +50 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +71 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +103 -596
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +62 -305
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +94 -673
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +16 -249
- package/src/llama.cpp/src/llama-arch.cpp +22 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2 -2
- package/src/llama.cpp/src/llama-graph.cpp +94 -0
- package/src/llama.cpp/src/llama-graph.h +12 -0
- package/src/llama.cpp/src/llama-hparams.cpp +9 -3
- package/src/llama.cpp/src/llama-hparams.h +11 -4
- package/src/llama.cpp/src/llama-model.cpp +195 -8
- package/src/tts_utils.h +3 -3
package/lib/binding.ts
CHANGED

@@ -55,6 +55,10 @@ export type LlamaModelOptions = {
    * Try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix.
    */
   kv_unified?: boolean
+  /**
+   * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+   */
+  swa_full?: boolean
   use_mlock?: boolean
   use_mmap?: boolean
   vocab_only?: boolean
@@ -102,6 +106,8 @@ export type LlamaCompletionOptions = {
   dry_base?: number
   dry_allowed_length?: number
   dry_penalty_last_n?: number
+  dry_sequence_breakers?: string[]
+  top_n_sigma?: number
   n_predict?: number
   max_length?: number
   max_tokens?: number
@@ -111,6 +117,9 @@ export type LlamaCompletionOptions = {
   grammar_lazy?: boolean
   grammar_triggers?: { type: number; value: string; token?: number }[]
   preserved_tokens?: string[]
+  json_schema?: string
+  logit_bias?: number[][]
+  ignore_eos?: boolean
   /**
    * Path(s) to media file(s) to process before generating text.
    * When provided, the media will be processed and added to the context.
@@ -134,6 +143,7 @@ export type LlamaCompletionResult = {
   tokens_evaluated: number
   truncated: boolean
   context_full: boolean
+  interrupted: boolean
   audio_tokens?: Array<number>
   timings: {
     prompt_n: number
@@ -265,6 +275,9 @@ export interface LlamaContext {
       parallel_tool_calls?: boolean
       tool_choice?: string
       enable_thinking?: boolean
+      add_generation_prompt?: boolean
+      now?: string | number
+      chat_template_kwargs?: Record<string, string>
     },
   ): JinjaFormattedChatResult | string
   completion(
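The new model option (swa_full) and completion options (dry_sequence_breakers, top_n_sigma, json_schema, logit_bias, ignore_eos), together with the new interrupted result flag, can be seen in a short usage sketch. This assumes the package's existing loadModel/completion entry points keep their current shape; the model path and numeric values are placeholders, not part of this diff.

```ts
import { loadModel } from '@fugood/llama.node'

const context = await loadModel({
  model: './model.gguf',   // placeholder path
  n_ctx: 8192,
  kv_unified: false,
  swa_full: false,         // new in this release: full-size SWA cache
})

const result = await context.completion({
  prompt: 'Hello',
  n_predict: 64,
  top_n_sigma: 1.5,                                // new sampling option
  dry_sequence_breakers: ['\n', ':'],              // new DRY option
  json_schema: JSON.stringify({ type: 'object' }), // new: constrain output to a JSON schema
  logit_bias: [[2, -5]],                           // new: [token_id, bias] pairs (illustrative values)
  ignore_eos: false,                               // new
})

// `interrupted` is the new flag on LlamaCompletionResult
console.log(result.text, result.interrupted)
```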
package/lib/index.js
CHANGED

@@ -145,6 +145,9 @@ class LlamaContextWrapper {
       parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
       tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
       enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
+      add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
+      now: params === null || params === void 0 ? void 0 : params.now,
+      chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs,
     });
     if (!useJinja) {
       return {
package/lib/index.ts
CHANGED

@@ -166,6 +166,9 @@ class LlamaContextWrapper {
       parallel_tool_calls?: boolean
       tool_choice?: string,
       enable_thinking?: boolean,
+      add_generation_prompt?: boolean,
+      now?: string | number,
+      chat_template_kwargs?: Record<string, string>,
     },
   ): FormattedChatResult {
     const {
@@ -186,6 +189,9 @@ class LlamaContextWrapper {
       parallel_tool_calls: params?.parallel_tool_calls,
       tool_choice: params?.tool_choice,
       enable_thinking: params?.enable_thinking ?? true,
+      add_generation_prompt: params?.add_generation_prompt,
+      now: params?.now,
+      chat_template_kwargs: params?.chat_template_kwargs,
     })

     if (!useJinja) {
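The three new formatting parameters are simply forwarded by the wrapper to the native getFormattedChat. A sketch of a call, assuming the wrapper keeps its getFormattedChat(messages, template, params) signature with a jinja flag; `context` is the instance from the earlier loadModel sketch, and the message text and custom_flag variable are illustrative:

```ts
const formatted = context.getFormattedChat(
  [{ role: 'user', content: 'What day is it?' }],
  undefined, // use the model's built-in chat template
  {
    jinja: true,
    add_generation_prompt: true,                   // new, defaults to true natively
    now: String(Math.floor(Date.now() / 1000)),    // new: epoch seconds as a string
    chat_template_kwargs: { custom_flag: 'true' }, // new: extra string variables for the template
  },
)
```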
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.1",
+  "version": "1.1.3",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.1",
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.1",
-    "@fugood/node-llama-linux-x64-cuda": "1.1.1",
-    "@fugood/node-llama-linux-arm64": "1.1.1",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.1",
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.1",
-    "@fugood/node-llama-win32-x64": "1.1.1",
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.1",
-    "@fugood/node-llama-win32-x64-cuda": "1.1.1",
-    "@fugood/node-llama-win32-arm64": "1.1.1",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.1",
-    "@fugood/node-llama-darwin-x64": "1.1.1",
-    "@fugood/node-llama-darwin-arm64": "1.1.1"
+    "@fugood/node-llama-linux-x64": "1.1.3",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.3",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.3",
+    "@fugood/node-llama-linux-arm64": "1.1.3",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.3",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.3",
+    "@fugood/node-llama-win32-x64": "1.1.3",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.3",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.3",
+    "@fugood/node-llama-win32-arm64": "1.1.3",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.3",
+    "@fugood/node-llama-darwin-x64": "1.1.3",
+    "@fugood/node-llama-darwin-arm64": "1.1.3"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -129,7 +129,7 @@ void LlamaCompletionWorker::Execute() {
   _sess->tokens_ptr()->reserve(_sess->tokens_ptr()->size() + max_len);

   auto embd = _sess->tokens_ptr();
-  for (int i = 0; (i < max_len ||
+  for (int i = 0; (i < max_len || _interrupted) && !_params.vocab_only; i++) {
     // check if we need to remove some tokens
     if (embd->size() >= _params.n_ctx) {
       if (!_params.ctx_shift) {
@@ -256,6 +256,7 @@ void LlamaCompletionWorker::OnOK() {
       _result.tokens_predicted));
   result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
   result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+  result.Set("interrupted", Napi::Boolean::New(env, _interrupted));
   result.Set("text", Napi::String::New(env, _result.text.c_str()));
   result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
   result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
@@ -267,7 +268,7 @@ void LlamaCompletionWorker::OnOK() {
   Napi::Array tool_calls = Napi::Array::New(Napi::AsyncWorker::Env());
   std::string reasoning_content = "";
   std::string content;
-  if (!
+  if (!_interrupted) {
     try {
       common_chat_syntax chat_syntax;
       chat_syntax.format = static_cast<common_chat_format>(_chat_format);
|
|
|
34
34
|
|
|
35
35
|
void OnComplete(std::function<void()> cb) { _onComplete = cb; }
|
|
36
36
|
|
|
37
|
-
void SetStop() {
|
|
37
|
+
void SetStop() { _interrupted = true; }
|
|
38
38
|
|
|
39
39
|
protected:
|
|
40
40
|
void Execute() override;
|
|
@@ -52,7 +52,7 @@ private:
|
|
|
52
52
|
std::vector<llama_token> _guide_tokens;
|
|
53
53
|
std::function<void()> _onComplete;
|
|
54
54
|
bool _has_callback = false;
|
|
55
|
-
bool
|
|
55
|
+
bool _interrupted = false;
|
|
56
56
|
Napi::ThreadSafeFunction _tsfn;
|
|
57
57
|
bool _next_token_uses_guide_token = true;
|
|
58
58
|
bool _has_vocoder;
|
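The `_interrupted` flag that SetStop() now sets is what the worker reports back as `interrupted` on the completion result. A sketch of observing it from JavaScript, assuming the wrapper still exposes a stopCompletion() method (that method is not part of this diff); `context` is the instance from the earlier loadModel sketch:

```ts
const pending = context.completion({
  messages: [{ role: 'user', content: 'Write a long story.' }],
  n_predict: 512,
})

// Request an early stop; on the native side SetStop() flips _interrupted.
setTimeout(() => context.stopCompletion(), 100)

const result = await pending
if (result.interrupted) {
  console.log('Stopped early; partial text:', result.text)
}
```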
package/src/LlamaContext.cpp
CHANGED

@@ -248,6 +248,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
       get_option<std::string>(options, "cache_type_v", "f16").c_str());
   params.ctx_shift = get_option<bool>(options, "ctx_shift", true);
   params.kv_unified = get_option<bool>(options, "kv_unified", false);
+  params.swa_full = get_option<bool>(options, "swa_full", false);

   params.use_mlock = get_option<bool>(options, "use_mlock", false);
   params.use_mmap = get_option<bool>(options, "use_mmap", true);
@@ -504,7 +505,10 @@ common_chat_params getFormattedChatWithJinja(
     const std::string &chat_template, const std::string &json_schema,
     const std::string &tools, const bool &parallel_tool_calls,
     const std::string &tool_choice,
-    const bool &enable_thinking
+    const bool &enable_thinking,
+    const bool &add_generation_prompt,
+    const std::string &now_str,
+    const std::map<std::string, std::string> &chat_template_kwargs
 ) {
   common_chat_templates_inputs inputs;
   inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
@@ -520,6 +524,21 @@ common_chat_params getFormattedChatWithJinja(
     inputs.json_schema = json::parse(json_schema);
   }
   inputs.enable_thinking = enable_thinking;
+  inputs.add_generation_prompt = add_generation_prompt;
+
+  // Handle now parameter - parse timestamp or use current time
+  if (!now_str.empty()) {
+    try {
+      // Try to parse as timestamp (seconds since epoch)
+      auto timestamp = std::stoll(now_str);
+      inputs.now = std::chrono::system_clock::from_time_t(timestamp);
+    } catch (...) {
+      // If parsing fails, use current time
+      inputs.now = std::chrono::system_clock::now();
+    }
+  }
+
+  inputs.chat_template_kwargs = chat_template_kwargs;

   // If chat_template is provided, create new one and use it (probably slow)
   if (!chat_template.empty()) {
@@ -594,12 +613,26 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
       get_option<bool>(params, "parallel_tool_calls", false);
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");
   auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
+  auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
+  auto now_str = get_option<std::string>(params, "now", "");
+
+  std::map<std::string, std::string> chat_template_kwargs;
+  if (params.Has("chat_template_kwargs") && params.Get("chat_template_kwargs").IsObject()) {
+    auto kwargs_obj = params.Get("chat_template_kwargs").As<Napi::Object>();
+    auto props = kwargs_obj.GetPropertyNames();
+    for (uint32_t i = 0; i < props.Length(); i++) {
+      auto key = props.Get(i).ToString().Utf8Value();
+      auto val = kwargs_obj.Get(key).ToString().Utf8Value();
+      chat_template_kwargs[key] = val;
+    }
+  }

   common_chat_params chatParams;
   try {
     chatParams = getFormattedChatWithJinja(
        _sess, _templates, messages, chat_template, json_schema_str, tools_str,
-        parallel_tool_calls, tool_choice, enable_thinking
+        parallel_tool_calls, tool_choice, enable_thinking,
+        add_generation_prompt, now_str, chat_template_kwargs);
   } catch (const std::exception &e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
     return env.Undefined();
@@ -808,13 +841,27 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   auto tool_choice =
       get_option<std::string>(options, "tool_choice", "none");
   auto enable_thinking = get_option<bool>(options, "enable_thinking", true);
+  auto add_generation_prompt = get_option<bool>(options, "add_generation_prompt", true);
+  auto now_str = get_option<std::string>(options, "now", "");
+
+  std::map<std::string, std::string> chat_template_kwargs;
+  if (options.Has("chat_template_kwargs") && options.Get("chat_template_kwargs").IsObject()) {
+    auto kwargs_obj = options.Get("chat_template_kwargs").As<Napi::Object>();
+    auto props = kwargs_obj.GetPropertyNames();
+    for (uint32_t i = 0; i < props.Length(); i++) {
+      auto key = props.Get(i).ToString().Utf8Value();
+      auto val = kwargs_obj.Get(key).ToString().Utf8Value();
+      chat_template_kwargs[key] = val;
+    }
+  }

   common_chat_params chatParams;

   try {
     chatParams = getFormattedChatWithJinja(
        _sess, _templates, json_stringify(messages), chat_template,
-        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking
+        json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking,
+        add_generation_prompt, now_str, chat_template_kwargs);
   } catch (const std::exception &e) {
     Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
     return env.Undefined();
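On the calling side, the native handling above means `now` is parsed with std::stoll as seconds since the epoch, and any unparsable value falls back to the current time, while each chat_template_kwargs value reaches the Jinja template as a string. A small sketch of the `now` behavior, assuming the same getFormattedChat wrapper signature as in the earlier example; `context` and `jinja` are carried over from those sketches:

```ts
const messages = [{ role: 'user', content: 'What is the date today?' }]

// Valid: parsed with std::stoll as seconds since the epoch.
context.getFormattedChat(messages, undefined, { jinja: true, now: '1700000000' })

// Unparsable: std::stoll throws, and the native side falls back to the current time.
context.getFormattedChat(messages, undefined, { jinja: true, now: 'not-a-timestamp' })
```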