@fugood/llama.node 1.0.0-beta.6 → 1.0.0-beta.7

package/lib/binding.ts CHANGED
@@ -79,6 +79,8 @@ export type LlamaCompletionOptions = {
  tools?: object
  parallel_tool_calls?: boolean
  tool_choice?: string
+ enable_thinking?: boolean
+ thinking_forced_open?: boolean
  prompt?: string
  temperature?: number
  top_k?: number
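
The two options added above expose llama.cpp's reasoning controls on LlamaCompletionOptions. A minimal usage sketch follows; only the enable_thinking and thinking_forced_open names come from this diff, while the loadModel()/completion() call shape, the messages field, and the model path are assumptions:

// sketch.ts — assumed loadModel()/completion() API; option names taken from this diff
import { loadModel } from '@fugood/llama.node'

const ctx = await loadModel({ model: './model.gguf' }) // hypothetical model path
const result = await ctx.completion({
  messages: [{ role: 'user', content: 'Why is the sky blue?' }], // assumed field
  enable_thinking: true, // let the chat template emit a reasoning/thinking block
  // thinking_forced_open is normally taken from getFormattedChat() output rather than set by hand
  temperature: 0.7,
})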
package/lib/index.js CHANGED
@@ -131,6 +131,7 @@ class LlamaContextWrapper {
  };
  }
  getFormattedChat(messages, template, params) {
+ var _a;
  const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
  const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
  let tmpl;
@@ -143,6 +144,7 @@ class LlamaContextWrapper {
  tools: params === null || params === void 0 ? void 0 : params.tools,
  parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
  tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
+ enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
  });
  if (!useJinja) {
  return {
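
The `var _a` declaration and the long ternary in the compiled hunk above are simply tsc's downleveled output for the nullish-coalescing default written in lib/index.ts (see the matching hunk below):

// lib/index.ts source line and the JavaScript tsc emits for it
enable_thinking: params?.enable_thinking ?? true,
// becomes
enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,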
package/lib/index.ts CHANGED
@@ -158,7 +158,8 @@ class LlamaContextWrapper {
  response_format?: CompletionResponseFormat
  tools?: object
  parallel_tool_calls?: object
- tool_choice?: string
+ tool_choice?: string,
+ enable_thinking?: boolean,
  },
  ): FormattedChatResult {
  const {
@@ -178,6 +179,7 @@ class LlamaContextWrapper {
  tools: params?.tools,
  parallel_tool_calls: params?.parallel_tool_calls,
  tool_choice: params?.tool_choice,
+ enable_thinking: params?.enable_thinking ?? true,
  })

  if (!useJinja) {
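
Taken together with the native changes further down (GetFormattedChat now returns thinking_forced_open and Completion accepts it as an option), the intended flow looks roughly like the sketch below. The getFormattedChat()/completion() shapes and the chat_format field name are assumptions; the enable_thinking default and the thinking_forced_open round trip follow this diff:

// sketch.ts — ctx is a loaded context as in the earlier sketch; names outside
// this diff (e.g. chat_format on the formatted result) are assumptions
const messages = [{ role: 'user', content: 'Solve 12 * 7 step by step.' }]
const formatted = ctx.getFormattedChat(messages, undefined, {
  jinja: true,
  // enable_thinking omitted -> the wrapper passes params?.enable_thinking ?? true
})

await ctx.completion({
  prompt: formatted.prompt,
  chat_format: formatted.chat_format,                   // assumed field name
  thinking_forced_open: formatted.thinking_forced_open, // new in beta.7
})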
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.0.0-beta.6",
+ "version": "1.0.0-beta.7",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -70,19 +70,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.0.0-beta.6",
- "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.6",
- "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.6",
- "@fugood/node-llama-linux-arm64": "1.0.0-beta.6",
- "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.6",
- "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.6",
- "@fugood/node-llama-win32-x64": "1.0.0-beta.6",
- "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.6",
- "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.6",
- "@fugood/node-llama-win32-arm64": "1.0.0-beta.6",
- "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.6",
- "@fugood/node-llama-darwin-x64": "1.0.0-beta.6",
- "@fugood/node-llama-darwin-arm64": "1.0.0-beta.6"
+ "@fugood/node-llama-linux-x64": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-x64-vulkan": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-x64-cuda": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-arm64": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.0.0-beta.7",
+ "@fugood/node-llama-linux-arm64-cuda": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-x64": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-x64-vulkan": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-x64-cuda": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-arm64": "1.0.0-beta.7",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.0.0-beta.7",
+ "@fugood/node-llama-darwin-x64": "1.0.0-beta.7",
+ "@fugood/node-llama-darwin-arm64": "1.0.0-beta.7"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -8,7 +8,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
  _params(params) {}

  void EmbeddingWorker::Execute() {
- llama_kv_self_clear(_sess->context());
+ llama_memory_clear(llama_get_memory(_sess->context()), true);
  auto tokens = ::common_tokenize(_sess->context(), _text, true);
  // add SEP if not present
  auto vocab = llama_model_get_vocab(_sess->model());
@@ -29,11 +29,13 @@ LlamaCompletionWorker::LlamaCompletionWorker(
  common_params params,
  std::vector<std::string> stop_words,
  int32_t chat_format,
+ bool thinking_forced_open,
  std::string reasoning_format,
  const std::vector<std::string> &media_paths,
  const std::vector<llama_token> &guide_tokens)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
+ _thinking_forced_open(thinking_forced_open),
  _reasoning_format(reasoning_format),
  _media_paths(media_paths), _guide_tokens(guide_tokens) {
  if (!callback.IsEmpty()) {
@@ -113,7 +115,7 @@ void LlamaCompletionWorker::Execute() {
  --n_cur;
  }
  n_input -= n_cur;
- llama_kv_self_seq_rm(ctx, 0, n_cur, -1);
+ llama_memory_seq_rm(llama_get_memory(ctx), 0, n_cur, -1);
  }
  // Set the tokens
  _sess->set_tokens(std::move(prompt_tokens));
@@ -135,8 +137,9 @@ void LlamaCompletionWorker::Execute() {
  const int n_left = n_cur - n_keep - 1;
  const int n_discard = n_left / 2;

- llama_kv_self_seq_rm(ctx, 0, n_keep + 1, n_keep + n_discard + 1);
- llama_kv_self_seq_add(ctx, 0, n_keep + 1 + n_discard, n_cur, -n_discard);
+ auto mem = llama_get_memory(ctx);
+ llama_memory_seq_rm(mem, 0, n_keep + 1, n_keep + n_discard + 1);
+ llama_memory_seq_add(mem, 0, n_keep + 1 + n_discard, n_cur, -n_discard);

  // shift the tokens
  embd->insert(embd->begin() + n_keep + 1,
@@ -240,6 +243,7 @@ void LlamaCompletionWorker::OnOK() {
  try {
  common_chat_syntax chat_syntax;
  chat_syntax.format = static_cast<common_chat_format>(_chat_format);
+ chat_syntax.thinking_forced_open = _thinking_forced_open;

  if (_reasoning_format == "deepseek") {
  chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
@@ -20,6 +20,7 @@ public:
  Napi::Function callback, common_params params,
  std::vector<std::string> stop_words,
  int32_t chat_format,
+ bool thinking_forced_open,
  std::string reasoning_format,
  const std::vector<std::string> &media_paths = {},
  const std::vector<llama_token> &guide_tokens = {});
@@ -42,6 +43,7 @@ private:
  common_params _params;
  std::vector<std::string> _stop_words;
  int32_t _chat_format;
+ bool _thinking_forced_open;
  std::string _reasoning_format;
  std::vector<std::string> _media_paths;
  std::vector<llama_token> _guide_tokens;
@@ -499,7 +499,9 @@ common_chat_params getFormattedChatWithJinja(
  const common_chat_templates_ptr &templates, const std::string &messages,
  const std::string &chat_template, const std::string &json_schema,
  const std::string &tools, const bool &parallel_tool_calls,
- const std::string &tool_choice) {
+ const std::string &tool_choice,
+ const bool &enable_thinking
+ ) {
  common_chat_templates_inputs inputs;
  inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
  auto useTools = !tools.empty();
@@ -513,6 +515,7 @@ common_chat_params getFormattedChatWithJinja(
  if (!json_schema.empty()) {
  inputs.json_schema = json::parse(json_schema);
  }
+ inputs.enable_thinking = enable_thinking;

  // If chat_template is provided, create new one and use it (probably slow)
  if (!chat_template.empty()) {
@@ -586,12 +589,11 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  auto parallel_tool_calls =
  get_option<bool>(params, "parallel_tool_calls", false);
  auto tool_choice = get_option<std::string>(params, "tool_choice", "");
+ auto enable_thinking = get_option<bool>(params, "enable_thinking", false);

  auto chatParams = getFormattedChatWithJinja(
  _sess, _templates, messages, chat_template, json_schema_str, tools_str,
- parallel_tool_calls, tool_choice);
-
- console_log(env, std::string("format: ") + std::to_string(chatParams.format));
+ parallel_tool_calls, tool_choice, enable_thinking);

  Napi::Object result = Napi::Object::New(env);
  result.Set("prompt", chatParams.prompt);
@@ -612,6 +614,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  grammar_triggers.Set(i, triggerObj);
  }
  result.Set("grammar_triggers", grammar_triggers);
+ result.Set("thinking_forced_open", chatParams.thinking_forced_open);
  // preserved_tokens: string[]
  Napi::Array preserved_tokens = Napi::Array::New(env);
  for (size_t i = 0; i < chatParams.preserved_tokens.size(); i++) {
@@ -685,6 +688,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  }

  int32_t chat_format = get_option<int32_t>(options, "chat_format", 0);
+ bool thinking_forced_open = get_option<bool>(options, "thinking_forced_open", false);
  std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");

  common_params params = _sess->params();
@@ -793,14 +797,16 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  get_option<bool>(options, "parallel_tool_calls", false);
  auto tool_choice =
  get_option<std::string>(options, "tool_choice", "none");
+ auto enable_thinking = get_option<bool>(options, "enable_thinking", true);

  auto chatParams = getFormattedChatWithJinja(
  _sess, _templates, json_stringify(messages), chat_template,
- json_schema_str, tools_str, parallel_tool_calls, tool_choice);
+ json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);

  params.prompt = chatParams.prompt;

  chat_format = chatParams.format;
+ thinking_forced_open = chatParams.thinking_forced_open;

  for (const auto &token : chatParams.preserved_tokens) {
  auto ids =
@@ -895,7 +901,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {

  auto *worker =
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
- chat_format, reasoning_format, media_paths, guide_tokens);
+ chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens);
  worker->Queue();
  _wip = worker;
  worker->OnComplete([this]() { _wip = nullptr; });
package/src/common.hpp CHANGED
@@ -461,7 +461,7 @@ processMediaPrompt(llama_context *ctx, mtmd_context *mtmd_ctx,
  }

  // Clear all KV cache entries after position n_past
- llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+ llama_memory_seq_rm(llama_get_memory(ctx), 0, n_past, -1);

  size_t num_chunks = mtmd_input_chunks_size(chunks);