@fugood/llama.node 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -114,6 +114,9 @@ set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
  set(LLAMA_CURL OFF CACHE BOOL "Build curl")

  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
+
+ add_definitions(-DGGML_MAX_NAME=80)
+
  add_subdirectory("src/llama.cpp")
  add_subdirectory("src/llama.cpp/tools/mtmd")

package/lib/binding.ts CHANGED
@@ -55,6 +55,10 @@ export type LlamaModelOptions = {
   * Try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix.
   */
  kv_unified?: boolean
+ /**
+  * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+  */
+ swa_full?: boolean
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
@@ -102,6 +106,8 @@ export type LlamaCompletionOptions = {
  dry_base?: number
  dry_allowed_length?: number
  dry_penalty_last_n?: number
+ dry_sequence_breakers?: string[]
+ top_n_sigma?: number
  n_predict?: number
  max_length?: number
  max_tokens?: number
@@ -111,6 +117,9 @@ export type LlamaCompletionOptions = {
  grammar_lazy?: boolean
  grammar_triggers?: { type: number; value: string; token?: number }[]
  preserved_tokens?: string[]
+ json_schema?: string
+ logit_bias?: number[][]
+ ignore_eos?: boolean
  /**
   * Path(s) to media file(s) to process before generating text.
   * When provided, the media will be processed and added to the context.
@@ -134,6 +143,7 @@ export type LlamaCompletionResult = {
  tokens_evaluated: number
  truncated: boolean
  context_full: boolean
+ interrupted: boolean
  audio_tokens?: Array<number>
  timings: {
    prompt_n: number
@@ -265,6 +275,9 @@ export interface LlamaContext {
    parallel_tool_calls?: boolean
    tool_choice?: string
    enable_thinking?: boolean
+   add_generation_prompt?: boolean
+   now?: string | number
+   chat_template_kwargs?: Record<string, string>
  },
  ): JinjaFormattedChatResult | string
  completion(
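Taken together, the typing changes add a full-size SWA cache toggle at the model level, extra sampling controls at the completion level, and an interrupted flag on the result. A minimal usage sketch in TypeScript follows; the loadModel entry point, model path, prompt, and option values are assumptions for illustration (only the option names themselves come from the diff above), and logit_bias is read here as [token_id, bias] pairs:

import { loadModel } from '@fugood/llama.node'

const context = await loadModel({
  model: './models/example.gguf', // illustrative path
  n_ctx: 4096,
  swa_full: true, // new: use full-size SWA cache
})

const result = await context.completion({
  prompt: 'Hello',
  n_predict: 64,
  dry_sequence_breakers: ['\n', ':'], // new: DRY sampling sequence breakers
  top_n_sigma: 1.5,                   // new: top-n-sigma sampling
  ignore_eos: false,                  // new
  logit_bias: [[13, -1.0]],           // new: assumed [token_id, bias] pairs
})

if (result.interrupted) {
  // new result flag: generation was stopped before it finished
  console.log('completion interrupted')
}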
package/lib/index.js CHANGED
@@ -145,6 +145,9 @@ class LlamaContextWrapper {
      parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
      tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
      enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
+     add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
+     now: params === null || params === void 0 ? void 0 : params.now,
+     chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs,
    });
    if (!useJinja) {
      return {
package/lib/index.ts CHANGED
@@ -166,6 +166,9 @@ class LlamaContextWrapper {
      parallel_tool_calls?: boolean
      tool_choice?: string,
      enable_thinking?: boolean,
+     add_generation_prompt?: boolean,
+     now?: string | number,
+     chat_template_kwargs?: Record<string, string>,
    },
  ): FormattedChatResult {
    const {
@@ -186,6 +189,9 @@ class LlamaContextWrapper {
      parallel_tool_calls: params?.parallel_tool_calls,
      tool_choice: params?.tool_choice,
      enable_thinking: params?.enable_thinking ?? true,
+     add_generation_prompt: params?.add_generation_prompt,
+     now: params?.now,
+     chat_template_kwargs: params?.chat_template_kwargs,
    })

    if (!useJinja) {
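The TypeScript wrapper now forwards three additional chat-templating inputs to the native layer. A hedged sketch of how a caller might pass them; the method signature and all values shown are assumptions for illustration, only the option names come from the diff above:

// Sketch only: signature and values are illustrative, not taken from the package docs.
const formatted = context.getFormattedChat(
  [{ role: 'user', content: 'What day is it today?' }],
  null, // use the model's built-in chat template
  {
    add_generation_prompt: true,        // new: control the trailing assistant prompt
    now: Math.floor(Date.now() / 1000), // new: timestamp exposed to date-aware templates
    chat_template_kwargs: { custom_flag: 'true' }, // new: extra string kwargs (key name is hypothetical)
  },
)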
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "@fugood/llama.node",
    "access": "public",
-   "version": "1.1.2",
+   "version": "1.1.4",
    "description": "An another Node binding of llama.cpp",
    "main": "lib/index.js",
    "scripts": {
@@ -71,19 +71,19 @@
    "CMakeLists.txt"
  ],
  "optionalDependencies": {
-   "@fugood/node-llama-linux-x64": "1.1.2",
-   "@fugood/node-llama-linux-x64-vulkan": "1.1.2",
-   "@fugood/node-llama-linux-x64-cuda": "1.1.2",
-   "@fugood/node-llama-linux-arm64": "1.1.2",
-   "@fugood/node-llama-linux-arm64-vulkan": "1.1.2",
-   "@fugood/node-llama-linux-arm64-cuda": "1.1.2",
-   "@fugood/node-llama-win32-x64": "1.1.2",
-   "@fugood/node-llama-win32-x64-vulkan": "1.1.2",
-   "@fugood/node-llama-win32-x64-cuda": "1.1.2",
-   "@fugood/node-llama-win32-arm64": "1.1.2",
-   "@fugood/node-llama-win32-arm64-vulkan": "1.1.2",
-   "@fugood/node-llama-darwin-x64": "1.1.2",
-   "@fugood/node-llama-darwin-arm64": "1.1.2"
+   "@fugood/node-llama-linux-x64": "1.1.4",
+   "@fugood/node-llama-linux-x64-vulkan": "1.1.4",
+   "@fugood/node-llama-linux-x64-cuda": "1.1.4",
+   "@fugood/node-llama-linux-arm64": "1.1.4",
+   "@fugood/node-llama-linux-arm64-vulkan": "1.1.4",
+   "@fugood/node-llama-linux-arm64-cuda": "1.1.4",
+   "@fugood/node-llama-win32-x64": "1.1.4",
+   "@fugood/node-llama-win32-x64-vulkan": "1.1.4",
+   "@fugood/node-llama-win32-x64-cuda": "1.1.4",
+   "@fugood/node-llama-win32-arm64": "1.1.4",
+   "@fugood/node-llama-win32-arm64-vulkan": "1.1.4",
+   "@fugood/node-llama-darwin-x64": "1.1.4",
+   "@fugood/node-llama-darwin-arm64": "1.1.4"
  },
  "devDependencies": {
    "@babel/preset-env": "^7.24.4",
@@ -64,6 +64,7 @@ void LlamaCompletionWorker::Execute() {
  size_t n_input = 0;
  const auto model = _sess->model();
  auto vocab = llama_model_get_vocab(model);
+ const bool is_enc_dec = llama_model_has_encoder(model);

  const bool add_bos = llama_vocab_get_add_bos(vocab);
  auto ctx = _sess->context();
@@ -110,7 +111,7 @@ void LlamaCompletionWorker::Execute() {
  } else {
    // Text-only path
    std::vector<llama_token> prompt_tokens =
-       ::common_tokenize(ctx, _params.prompt, add_bos, true);
+       ::common_tokenize(ctx, _params.prompt, add_bos || is_enc_dec, true);
    n_input = prompt_tokens.size();

    if (_sess->tokens_ptr()->size() > 0) {
@@ -126,10 +127,48 @@ void LlamaCompletionWorker::Execute() {
  }

  const int max_len = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
- _sess->tokens_ptr()->reserve(_sess->tokens_ptr()->size() + max_len);
-
  auto embd = _sess->tokens_ptr();
- for (int i = 0; (i < max_len || _stop) && !_params.vocab_only; i++) {
+ embd->reserve(embd->size() + max_len);
+
+ if (is_enc_dec) {
+   if (n_input > 0) {
+     // Decode tokens in batches using n_batch as chunk size
+     int n_past_batch = n_cur;
+     int n_remaining = n_input;
+
+     while (n_remaining > 0) {
+       int n_eval = n_remaining;
+       if (n_eval > _params.n_batch) {
+         n_eval = _params.n_batch;
+       }
+
+       int ret = llama_encode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
+       if (ret < 0) {
+         SetError("Failed to encode token batch, code: " + std::to_string(ret) +
+                  ", n_eval: " + std::to_string(n_eval) +
+                  ", n_past_batch: " + std::to_string(n_past_batch));
+         _sess->get_mutex().unlock();
+         return;
+       }
+
+       n_past_batch += n_eval;
+       n_remaining -= n_eval;
+       n_cur += n_eval;
+     }
+   }
+   _result.tokens_evaluated += n_input;
+
+   llama_token decode_bos = llama_model_decoder_start_token(model);
+   if (decode_bos == LLAMA_TOKEN_NULL) {
+     decode_bos = llama_vocab_bos(vocab);
+   }
+
+   embd->emplace_back(decode_bos);
+   common_sampler_accept(sampling.get(), decode_bos, false);
+   n_input = 1;
+ }
+
+ for (int i = 0; (i < max_len || _interrupted) && !_params.vocab_only; i++) {
    // check if we need to remove some tokens
    if (embd->size() >= _params.n_ctx) {
      if (!_params.ctx_shift) {
@@ -166,13 +205,14 @@ void LlamaCompletionWorker::Execute() {
      if (n_eval > _params.n_batch) {
        n_eval = _params.n_batch;
      }
-
+
      int ret = llama_decode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
      if (ret < 0) {
        SetError("Failed to decode token batch, code: " + std::to_string(ret) +
                 ", n_eval: " + std::to_string(n_eval) +
                 ", n_past_batch: " + std::to_string(n_past_batch));
-       break;
+       _sess->get_mutex().unlock();
+       return;
      }

      n_past_batch += n_eval;
@@ -256,6 +296,7 @@ void LlamaCompletionWorker::OnOK() {
                                      _result.tokens_predicted));
  result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
  result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+ result.Set("interrupted", Napi::Boolean::New(env, _interrupted));
  result.Set("text", Napi::String::New(env, _result.text.c_str()));
  result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
  result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
@@ -267,7 +308,7 @@ void LlamaCompletionWorker::OnOK() {
  Napi::Array tool_calls = Napi::Array::New(Napi::AsyncWorker::Env());
  std::string reasoning_content = "";
  std::string content;
- if (!_stop) {
+ if (!_interrupted) {
    try {
      common_chat_syntax chat_syntax;
      chat_syntax.format = static_cast<common_chat_format>(_chat_format);
@@ -34,7 +34,7 @@ public:

  void OnComplete(std::function<void()> cb) { _onComplete = cb; }

- void SetStop() { _stop = true; }
+ void SetStop() { _interrupted = true; }

protected:
  void Execute() override;
@@ -52,7 +52,7 @@ private:
  std::vector<llama_token> _guide_tokens;
  std::function<void()> _onComplete;
  bool _has_callback = false;
- bool _stop = false;
+ bool _interrupted = false;
  Napi::ThreadSafeFunction _tsfn;
  bool _next_token_uses_guide_token = true;
  bool _has_vocoder;
@@ -248,6 +248,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
      get_option<std::string>(options, "cache_type_v", "f16").c_str());
  params.ctx_shift = get_option<bool>(options, "ctx_shift", true);
  params.kv_unified = get_option<bool>(options, "kv_unified", false);
+ params.swa_full = get_option<bool>(options, "swa_full", false);

  params.use_mlock = get_option<bool>(options, "use_mlock", false);
  params.use_mmap = get_option<bool>(options, "use_mmap", true);
@@ -504,7 +505,10 @@ common_chat_params getFormattedChatWithJinja(
    const std::string &chat_template, const std::string &json_schema,
    const std::string &tools, const bool &parallel_tool_calls,
    const std::string &tool_choice,
-   const bool &enable_thinking
+   const bool &enable_thinking,
+   const bool &add_generation_prompt,
+   const std::string &now_str,
+   const std::map<std::string, std::string> &chat_template_kwargs
) {
  common_chat_templates_inputs inputs;
  inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
@@ -520,6 +524,21 @@ common_chat_params getFormattedChatWithJinja(
    inputs.json_schema = json::parse(json_schema);
  }
  inputs.enable_thinking = enable_thinking;
+ inputs.add_generation_prompt = add_generation_prompt;
+
+ // Handle now parameter - parse timestamp or use current time
+ if (!now_str.empty()) {
+   try {
+     // Try to parse as timestamp (seconds since epoch)
+     auto timestamp = std::stoll(now_str);
+     inputs.now = std::chrono::system_clock::from_time_t(timestamp);
+   } catch (...) {
+     // If parsing fails, use current time
+     inputs.now = std::chrono::system_clock::now();
+   }
+ }
+
+ inputs.chat_template_kwargs = chat_template_kwargs;

  // If chat_template is provided, create new one and use it (probably slow)
  if (!chat_template.empty()) {
@@ -594,12 +613,26 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
      get_option<bool>(params, "parallel_tool_calls", false);
  auto tool_choice = get_option<std::string>(params, "tool_choice", "");
  auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
+ auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
+ auto now_str = get_option<std::string>(params, "now", "");
+
+ std::map<std::string, std::string> chat_template_kwargs;
+ if (params.Has("chat_template_kwargs") && params.Get("chat_template_kwargs").IsObject()) {
+   auto kwargs_obj = params.Get("chat_template_kwargs").As<Napi::Object>();
+   auto props = kwargs_obj.GetPropertyNames();
+   for (uint32_t i = 0; i < props.Length(); i++) {
+     auto key = props.Get(i).ToString().Utf8Value();
+     auto val = kwargs_obj.Get(key).ToString().Utf8Value();
+     chat_template_kwargs[key] = val;
+   }
+ }

  common_chat_params chatParams;
  try {
    chatParams = getFormattedChatWithJinja(
        _sess, _templates, messages, chat_template, json_schema_str, tools_str,
-       parallel_tool_calls, tool_choice, enable_thinking);
+       parallel_tool_calls, tool_choice, enable_thinking,
+       add_generation_prompt, now_str, chat_template_kwargs);
  } catch (const std::exception &e) {
    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
    return env.Undefined();
@@ -808,13 +841,27 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  auto tool_choice =
      get_option<std::string>(options, "tool_choice", "none");
  auto enable_thinking = get_option<bool>(options, "enable_thinking", true);
+ auto add_generation_prompt = get_option<bool>(options, "add_generation_prompt", true);
+ auto now_str = get_option<std::string>(options, "now", "");
+
+ std::map<std::string, std::string> chat_template_kwargs;
+ if (options.Has("chat_template_kwargs") && options.Get("chat_template_kwargs").IsObject()) {
+   auto kwargs_obj = options.Get("chat_template_kwargs").As<Napi::Object>();
+   auto props = kwargs_obj.GetPropertyNames();
+   for (uint32_t i = 0; i < props.Length(); i++) {
+     auto key = props.Get(i).ToString().Utf8Value();
+     auto val = kwargs_obj.Get(key).ToString().Utf8Value();
+     chat_template_kwargs[key] = val;
+   }
+ }

  common_chat_params chatParams;

  try {
    chatParams = getFormattedChatWithJinja(
        _sess, _templates, json_stringify(messages), chat_template,
-       json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);
+       json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking,
+       add_generation_prompt, now_str, chat_template_kwargs);
  } catch (const std::exception &e) {
    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
    return env.Undefined();
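On the native side, LlamaContext::Completion now reads the same three templating options and hands them to getFormattedChatWithJinja, converting chat_template_kwargs to a string-to-string map and parsing now with std::stoll (non-numeric values fall back to the current time). A hedged sketch of a chat completion that exercises this path; the message content and the kwargs key are illustrative, only the option and result names come from the diff:

const res = await context.completion({
  messages: [{ role: 'user', content: 'Summarize the changes in this release.' }],
  enable_thinking: true,
  add_generation_prompt: true,
  now: '1735689600',                             // forwarded as a seconds-since-epoch string
  chat_template_kwargs: { custom_flag: 'true' }, // values are read back as strings per key (key is hypothetical)
  n_predict: 128,
})
console.log(res.text, res.interrupted)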
package/src/tts_utils.h CHANGED
@@ -68,7 +68,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
  static const char *OUTETTS_V1_GRAMMAR = R"(
  root ::= NL? wordAudioBlock+ audioEnd NL eos?
  wordAudioBlock ::= WORD codeBlock NL
- codeBlock ::= TIME CODE{1,144}
+ codeBlock ::= TIME CODE*
  eos ::= "<|im_end|>"
  codeStart ::= "<|code_start|>"
  codeEnd ::= "<|code_end|>"
@@ -85,7 +85,7 @@ static const char *OUTETTS_V2_GRAMMAR = R"(
  root ::= NL? content+ audioEnd NL eos?
  content ::= wordAudioBlock | emotionBlock
  wordAudioBlock ::= WORD punch* codeBlock space NL
- codeBlock ::= TIME CODE{1,144}
+ codeBlock ::= TIME CODE*
  emotionBlock ::= emotionStart TEXT emotionEnd space NL
  TEXT ::= [A-Za-z0-9 .,?!]+
  eos ::= "<|im_end|>"
@@ -94,7 +94,7 @@ emotionEnd ::= "<|emotion_end|>"
  audioEnd ::= "<|audio_end|>"
  space ::= "<|space|>"
  WORD ::= [A-Za-z]+
- NL ::= "\n"
+ NL ::= [\n]
  TIME ::= "<|t_" DECIMAL "|>"
  CODE ::= "<|" DIGITS "|>"
  DIGITS ::= [0-9]+