@fugood/llama.node 1.0.0-beta.6 → 1.0.0-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +2 -0
- package/lib/index.js +2 -0
- package/lib/index.ts +3 -1
- package/package.json +14 -14
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +7 -3
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +12 -6
- package/src/common.hpp +1 -1
package/lib/binding.ts
CHANGED
package/lib/index.js
CHANGED
@@ -131,6 +131,7 @@ class LlamaContextWrapper {
     };
   }
   getFormattedChat(messages, template, params) {
+    var _a;
     const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
     const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
     let tmpl;
@@ -143,6 +144,7 @@ class LlamaContextWrapper {
       tools: params === null || params === void 0 ? void 0 : params.tools,
       parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
       tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
+      enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
     });
     if (!useJinja) {
       return {
package/lib/index.ts
CHANGED
@@ -158,7 +158,8 @@ class LlamaContextWrapper {
       response_format?: CompletionResponseFormat
       tools?: object
       parallel_tool_calls?: object
-      tool_choice?: string
+      tool_choice?: string,
+      enable_thinking?: boolean,
     },
   ): FormattedChatResult {
     const {
@@ -178,6 +179,7 @@ class LlamaContextWrapper {
       tools: params?.tools,
       parallel_tool_calls: params?.parallel_tool_calls,
       tool_choice: params?.tool_choice,
+      enable_thinking: params?.enable_thinking ?? true,
     })

     if (!useJinja) {
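Taken together, the lib changes add an enable_thinking option (default true) to getFormattedChat, forwarded to the Jinja chat-template machinery. A minimal usage sketch — the loadModel entry point and model path are assumptions, not part of this diff:

// Sketch: turning thinking off when formatting a chat on the Jinja path.
// loadModel() and the model path are placeholders for the package's API.
import { loadModel } from '@fugood/llama.node'

const ctx = await loadModel({ model: './qwen3.gguf' })
const formatted = ctx.getFormattedChat(
  [{ role: 'user', content: 'Why is the sky blue?' }],
  null, // use the model's built-in chat template
  { jinja: true, enable_thinking: false }, // defaults to true when omitted
)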
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.0-beta.6",
+  "version": "1.0.0-beta.7",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-arm64": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.6",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-x64": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-arm64": "1.0.0-beta.6",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.6",
-    "@fugood/node-llama-darwin-x64": "1.0.0-beta.6",
-    "@fugood/node-llama-darwin-arm64": "1.0.0-beta.6"
+    "@fugood/node-llama-linux-x64": "1.0.0-beta.7",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.7",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.7",
+    "@fugood/node-llama-linux-arm64": "1.0.0-beta.7",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.7",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.7",
+    "@fugood/node-llama-win32-x64": "1.0.0-beta.7",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.7",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.7",
+    "@fugood/node-llama-win32-arm64": "1.0.0-beta.7",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.7",
+    "@fugood/node-llama-darwin-x64": "1.0.0-beta.7",
+    "@fugood/node-llama-darwin-arm64": "1.0.0-beta.7"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/EmbeddingWorker.cpp
CHANGED
@@ -8,7 +8,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
       _params(params) {}

 void EmbeddingWorker::Execute() {
-
+  llama_memory_clear(llama_get_memory(_sess->context()), true);
   auto tokens = ::common_tokenize(_sess->context(), _text, true);
   // add SEP if not present
   auto vocab = llama_model_get_vocab(_sess->model());
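This one-liner clears the context's KV memory (via the current llama.cpp llama_memory_* API) before tokenizing, so every embedding request starts from a clean state rather than inheriting cache entries from earlier work on the same context. A hedged sketch of why that matters — the embedding() method name is assumed from the wrapper's surface, not shown in this diff:

// Sketch: consecutive embedding calls on one context. With the clear added
// above, the second result cannot depend on leftover KV-cache state.
const a = await ctx.embedding('first text')
const b = await ctx.embedding('second text') // independent of the first call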
package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -29,11 +29,13 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     common_params params,
     std::vector<std::string> stop_words,
     int32_t chat_format,
+    bool thinking_forced_open,
     std::string reasoning_format,
     const std::vector<std::string> &media_paths,
     const std::vector<llama_token> &guide_tokens)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
+      _thinking_forced_open(thinking_forced_open),
       _reasoning_format(reasoning_format),
       _media_paths(media_paths), _guide_tokens(guide_tokens) {
   if (!callback.IsEmpty()) {
@@ -113,7 +115,7 @@ void LlamaCompletionWorker::Execute() {
       --n_cur;
     }
     n_input -= n_cur;
-
+    llama_memory_seq_rm(llama_get_memory(ctx), 0, n_cur, -1);
   }
   // Set the tokens
   _sess->set_tokens(std::move(prompt_tokens));
@@ -135,8 +137,9 @@ void LlamaCompletionWorker::Execute() {
       const int n_left = n_cur - n_keep - 1;
       const int n_discard = n_left / 2;

-
-
+      auto mem = llama_get_memory(ctx);
+      llama_memory_seq_rm(mem, 0, n_keep + 1, n_keep + n_discard + 1);
+      llama_memory_seq_add(mem, 0, n_keep + 1 + n_discard, n_cur, -n_discard);

       // shift the tokens
       embd->insert(embd->begin() + n_keep + 1,
@@ -240,6 +243,7 @@ void LlamaCompletionWorker::OnOK() {
   try {
     common_chat_syntax chat_syntax;
     chat_syntax.format = static_cast<common_chat_format>(_chat_format);
+    chat_syntax.thinking_forced_open = _thinking_forced_open;

     if (_reasoning_format == "deepseek") {
       chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
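The @@ -135 hunk above is llama.cpp's usual context-shift ported to the llama_memory_* API: keep the first n_keep + 1 positions, discard the oldest half of the remainder, and slide the tail left by n_discard. A small TypeScript sketch of the same arithmetic on a plain token array (an illustration of the algorithm, not the binding's API):

// Mirror of llama_memory_seq_rm() + llama_memory_seq_add() on an array:
// keep [0, nKeep], drop the next nDiscard positions, shift the rest left.
function contextShift(tokens: number[], nKeep: number): number[] {
  const nLeft = tokens.length - nKeep - 1
  const nDiscard = Math.floor(nLeft / 2)
  return [...tokens.slice(0, nKeep + 1), ...tokens.slice(nKeep + 1 + nDiscard)]
}

contextShift([0, 1, 2, 3, 4, 5, 6, 7], 1) // -> [0, 1, 5, 6, 7]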
package/src/LlamaCompletionWorker.h
CHANGED

@@ -20,6 +20,7 @@ public:
       Napi::Function callback, common_params params,
       std::vector<std::string> stop_words,
       int32_t chat_format,
+      bool thinking_forced_open,
       std::string reasoning_format,
       const std::vector<std::string> &media_paths = {},
       const std::vector<llama_token> &guide_tokens = {});
@@ -42,6 +43,7 @@ private:
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
+  bool _thinking_forced_open;
   std::string _reasoning_format;
   std::vector<std::string> _media_paths;
   std::vector<llama_token> _guide_tokens;
package/src/LlamaContext.cpp
CHANGED
@@ -499,7 +499,9 @@ common_chat_params getFormattedChatWithJinja(
     const common_chat_templates_ptr &templates, const std::string &messages,
     const std::string &chat_template, const std::string &json_schema,
     const std::string &tools, const bool &parallel_tool_calls,
-    const std::string &tool_choice) {
+    const std::string &tool_choice,
+    const bool &enable_thinking
+) {
   common_chat_templates_inputs inputs;
   inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
   auto useTools = !tools.empty();
@@ -513,6 +515,7 @@ common_chat_params getFormattedChatWithJinja(
   if (!json_schema.empty()) {
     inputs.json_schema = json::parse(json_schema);
   }
+  inputs.enable_thinking = enable_thinking;

   // If chat_template is provided, create new one and use it (probably slow)
   if (!chat_template.empty()) {
@@ -586,12 +589,11 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   auto parallel_tool_calls =
       get_option<bool>(params, "parallel_tool_calls", false);
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");
+  auto enable_thinking = get_option<bool>(params, "enable_thinking", false);

   auto chatParams = getFormattedChatWithJinja(
       _sess, _templates, messages, chat_template, json_schema_str, tools_str,
-      parallel_tool_calls, tool_choice);
-
-  console_log(env, std::string("format: ") + std::to_string(chatParams.format));
+      parallel_tool_calls, tool_choice, enable_thinking);

   Napi::Object result = Napi::Object::New(env);
   result.Set("prompt", chatParams.prompt);
@@ -612,6 +614,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
     grammar_triggers.Set(i, triggerObj);
   }
   result.Set("grammar_triggers", grammar_triggers);
+  result.Set("thinking_forced_open", chatParams.thinking_forced_open);
   // preserved_tokens: string[]
   Napi::Array preserved_tokens = Napi::Array::New(env);
   for (size_t i = 0; i < chatParams.preserved_tokens.size(); i++) {
@@ -685,6 +688,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   }

   int32_t chat_format = get_option<int32_t>(options, "chat_format", 0);
+  bool thinking_forced_open = get_option<bool>(options, "thinking_forced_open", false);
   std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");

   common_params params = _sess->params();
@@ -793,14 +797,16 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
       get_option<bool>(options, "parallel_tool_calls", false);
   auto tool_choice =
       get_option<std::string>(options, "tool_choice", "none");
+  auto enable_thinking = get_option<bool>(options, "enable_thinking", true);

   auto chatParams = getFormattedChatWithJinja(
       _sess, _templates, json_stringify(messages), chat_template,
-      json_schema_str, tools_str, parallel_tool_calls, tool_choice);
+      json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);

   params.prompt = chatParams.prompt;

   chat_format = chatParams.format;
+  thinking_forced_open = chatParams.thinking_forced_open;

   for (const auto &token : chatParams.preserved_tokens) {
     auto ids =
@@ -895,7 +901,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {

   auto *worker =
       new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
-                                chat_format, reasoning_format, media_paths, guide_tokens);
+                                chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens);
   worker->Queue();
   _wip = worker;
   worker->OnComplete([this]() { _wip = nullptr; });
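In Completion, enable_thinking defaults to true (unlike GetFormattedChat, which defaults it to false), and the thinking_forced_open flag computed by the template is threaded into LlamaCompletionWorker so the reasoning parser knows a thinking block is already open. A hedged sketch of a completion call exercising the new options — only enable_thinking and reasoning_format are confirmed by this diff; the surrounding call shape is assumed:

// Sketch: completion with the new thinking options (API shape assumed).
const result = await ctx.completion(
  {
    messages: [{ role: 'user', content: 'Plan a 3-day trip to Kyoto.' }],
    jinja: true,
    enable_thinking: true,        // let the template open a thinking block
    reasoning_format: 'deepseek', // parse reasoning into a separate field
  },
  (data) => process.stdout.write(data.token), // streaming token callback
)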
package/src/common.hpp
CHANGED
@@ -461,7 +461,7 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
   }

   // Clear all KV cache entries after position n_past
-
+  llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);

   size_t num_chunks = mtmd_input_chunks_size(chunks);
