@fugood/llama.node 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -102,6 +102,10 @@ file(
  "src/LoadSessionWorker.h"
  "src/SaveSessionWorker.cpp"
  "src/SaveSessionWorker.h"
+ "src/DecodeAudioTokenWorker.cpp"
+ "src/DecodeAudioTokenWorker.h"
+ "src/tts_utils.cpp"
+ "src/tts_utils.h"
  )

  add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
5 binary files changed (contents not shown)
package/lib/binding.ts CHANGED
@@ -114,6 +114,11 @@ export type LlamaCompletionOptions = {
  * Supports both file paths and base64 data URLs.
  */
  media_paths?: string | string[]
+ /**
+ * Guide tokens to use for audio completion.
+ * Help prevent hallucinations by forcing the TTS to use the correct words.
+ */
+ guide_tokens?: Int32Array
  }

  export type LlamaCompletionResult = {
@@ -208,6 +213,47 @@ export interface LlamaContext {
  */
  releaseMultimodal(): Promise<void>

+ /**
+ * Load a vocoder model
+ * @param path Path to the vocoder model
+ * @returns Promise resolving to true if loading was successful
+ */
+ initVocoder(path: string): Promise<boolean>
+
+ /**
+ * Unload the vocoder model
+ * @returns Promise resolving to true if unloading was successful
+ */
+ releaseVocoder(): Promise<void>
+
+ /**
+ * Check if the vocoder model is enabled
+ * @returns Promise resolving to true if the vocoder model is enabled
+ */
+ isVocoderEnabled(): boolean
+
+ /**
+ * Get the formatted prompt for audio completion
+ * @param speaker Speaker name or null
+ * @param text Text to complete
+ * @returns Formatted audio completion
+ */
+ getFormattedAudioCompletion(speaker: string|null, text: string): string
+
+ /**
+ * Get guide tokens for audio completion
+ * @param text Text to complete
+ * @returns Guide tokens
+ */
+ getAudioCompletionGuideTokens(text: string): Int32Array
+
+ /**
+ * Decode audio tokens to audio data
+ * @param tokens Tokens to decode
+ * @returns Decoded audio tokens
+ */
+ decodeAudioTokens(tokens: Int32Array): Promise<Float32Array>
+
  // static
  loadModelInfo(path: string, skip: string[]): Promise<Object>
  toggleNativeLog(
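
Taken together, these typings describe a text-to-speech pipeline on LlamaContext: load a vocoder, format a TTS prompt, generate with guide tokens, then decode the produced audio tokens to PCM. A minimal usage sketch in TypeScript, based only on the typings above; the model and vocoder file names, the completion option values, and the audio-token extraction step are illustrative assumptions, not anything this diff specifies:

    import { loadModel } from '@fugood/llama.node'

    async function speak(text: string) {
      // Assumed files: an OuteTTS-style TTS model plus a matching vocoder (e.g. WavTokenizer).
      const ctx = await loadModel({ model: './OuteTTS-0.2-500M-Q8_0.gguf' })

      // Attach the vocoder that decodeAudioTokens() will use.
      await ctx.initVocoder('./WavTokenizer-Large-75-Q8_0.gguf')

      // Build the TTS prompt and the guide tokens for the same text.
      const prompt = ctx.getFormattedAudioCompletion(null, text)
      const guide_tokens = ctx.getAudioCompletionGuideTokens(text)

      // guide_tokens steers sampling toward the requested words
      // (see the LlamaCompletionWorker changes further down in this diff).
      const result = await ctx.completion({ prompt, guide_tokens, n_predict: 4096 })

      // Extracting the generated audio token ids from `result` is not covered by this diff;
      // `audioTokens` stands in for that step.
      const audioTokens = new Int32Array([/* audio token ids parsed from the completion output */])
      const pcm = await ctx.decodeAudioTokens(audioTokens) // Float32Array of PCM samples

      await ctx.releaseVocoder()
      return pcm
    }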
package/lib/index.js CHANGED
@@ -204,6 +204,24 @@ class LlamaContextWrapper {
  getMultimodalSupport() {
  return this.ctx.getMultimodalSupport();
  }
+ initVocoder(path) {
+ return this.ctx.initVocoder(path);
+ }
+ releaseVocoder() {
+ return this.ctx.releaseVocoder();
+ }
+ isVocoderEnabled() {
+ return this.ctx.isVocoderEnabled();
+ }
+ getFormattedAudioCompletion(speaker, text) {
+ return this.ctx.getFormattedAudioCompletion(speaker, text);
+ }
+ getAudioCompletionGuideTokens(text) {
+ return this.ctx.getAudioCompletionGuideTokens(text);
+ }
+ decodeAudioTokens(tokens) {
+ return this.ctx.decodeAudioTokens(tokens);
+ }
  }
  const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
  var _a, _b;
package/lib/index.ts CHANGED
@@ -269,6 +269,30 @@ class LlamaContextWrapper {
  }> {
  return this.ctx.getMultimodalSupport()
  }
+
+ initVocoder(path: string): Promise<boolean> {
+ return this.ctx.initVocoder(path)
+ }
+
+ releaseVocoder(): Promise<void> {
+ return this.ctx.releaseVocoder()
+ }
+
+ isVocoderEnabled(): boolean {
+ return this.ctx.isVocoderEnabled()
+ }
+
+ getFormattedAudioCompletion(speaker: string|null, text: string): string {
+ return this.ctx.getFormattedAudioCompletion(speaker, text)
+ }
+
+ getAudioCompletionGuideTokens(text: string): Int32Array {
+ return this.ctx.getAudioCompletionGuideTokens(text)
+ }
+
+ decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+ return this.ctx.decodeAudioTokens(tokens)
+ }
  }

  export const loadModel = async (
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.5.0",
+ "version": "0.6.0",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
package/src/DecodeAudioTokenWorker.cpp ADDED
@@ -0,0 +1,40 @@
+ #include "DecodeAudioTokenWorker.h"
+ #include "tts_utils.h"
+ #include <vector>
+
+ DecodeAudioTokenWorker::DecodeAudioTokenWorker(
+ const Napi::CallbackInfo &info, llama_model *model, llama_context *ctx,
+ int n_threads, const std::vector<llama_token> &tokens)
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _model(model), _ctx(ctx),
+ _n_threads(n_threads), _tokens(tokens) {}
+
+ void DecodeAudioTokenWorker::Execute() {
+ const int n_codes = _tokens.size();
+ llama_batch batch = llama_batch_init(n_codes, 0, 1);
+ for (size_t i = 0; i < _tokens.size(); ++i) {
+ common_batch_add(batch, _tokens[i], i, {0}, true);
+ }
+ if (batch.n_tokens != n_codes) {
+ SetError("batch.n_tokens != n_codes");
+ return;
+ }
+ if (llama_encode(_ctx, batch) != 0) {
+ SetError("llama_encode() failed");
+ return;
+ }
+ llama_synchronize(_ctx);
+ const int n_embd = llama_model_n_embd(_model);
+ const float *embd = llama_get_embeddings(_ctx);
+ _result = embd_to_audio(embd, n_codes, n_embd, _n_threads);
+ }
+
+ void DecodeAudioTokenWorker::OnOK() {
+ auto result =
+ Napi::Float32Array::New(Napi::AsyncWorker::Env(), _result.size());
+ memcpy(result.Data(), _result.data(), _result.size() * sizeof(float));
+ Napi::Promise::Deferred::Resolve(result);
+ }
+
+ void DecodeAudioTokenWorker::OnError(const Napi::Error &err) {
+ Napi::Promise::Deferred::Reject(err.Value());
+ }
package/src/DecodeAudioTokenWorker.h ADDED
@@ -0,0 +1,22 @@
+ #include "common.hpp"
+ #include <vector>
+
+ class DecodeAudioTokenWorker : public Napi::AsyncWorker,
+ public Napi::Promise::Deferred {
+ public:
+ DecodeAudioTokenWorker(const Napi::CallbackInfo &info, llama_model *model,
+ llama_context *ctx, int n_threads,
+ const std::vector<llama_token> &tokens);
+
+ protected:
+ void Execute();
+ void OnOK();
+ void OnError(const Napi::Error &err);
+
+ private:
+ llama_model *_model;
+ llama_context *_ctx;
+ int _n_threads;
+ std::vector<llama_token> _tokens;
+ std::vector<float> _result;
+ };
package/src/EmbeddingWorker.cpp CHANGED
@@ -2,8 +2,10 @@
  #include "LlamaContext.h"

  EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
- LlamaSessionPtr &sess, std::string text, common_params &params)
- : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _params(params) {}
+ LlamaSessionPtr &sess, std::string text,
+ common_params &params)
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text),
+ _params(params) {}

  void EmbeddingWorker::Execute() {
  llama_kv_self_clear(_sess->context());
@@ -17,8 +19,7 @@ void EmbeddingWorker::Execute() {
  do {
  auto ctx = _sess->context();
  int ret =
- llama_decode(ctx,
- llama_batch_get_one(tokens.data(), tokens.size()));
+ llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));
  if (ret < 0) {
  SetError("Failed to inference, code: " + std::to_string(ret));
  break;
@@ -37,7 +38,8 @@ void EmbeddingWorker::Execute() {
  }
  _result.embedding.resize(n_embd);
  std::vector<float> embedding(embd, embd + n_embd), out(embd, embd + n_embd);
- common_embd_normalize(embedding.data(), out.data(), n_embd, _params.embd_normalize);
+ common_embd_normalize(embedding.data(), out.data(), n_embd,
+ _params.embd_normalize);
  memcpy(_result.embedding.data(), out.data(), n_embd * sizeof(float));
  } while (false);
  }
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -1,7 +1,6 @@
  #include "LlamaCompletionWorker.h"
  #include "LlamaContext.h"

-
  size_t findStoppingStrings(const std::string &text,
  const size_t last_token_size,
  const std::vector<std::string> &stop_words) {
@@ -27,12 +26,12 @@ size_t findStoppingStrings(const std::string &text,
  LlamaCompletionWorker::LlamaCompletionWorker(
  const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
  Napi::Function callback, common_params params,
- std::vector<std::string> stop_words,
- int32_t chat_format,
- std::vector<std::string> media_paths)
+ std::vector<std::string> stop_words, int32_t chat_format,
+ const std::vector<std::string> &media_paths,
+ const std::vector<llama_token> &guide_tokens)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
- _media_paths(media_paths) {
+ _media_paths(media_paths), _guide_tokens(guide_tokens) {
  if (!callback.IsEmpty()) {
  _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
  "LlamaCompletionCallback", 0, 1);
@@ -66,32 +65,27 @@ void LlamaCompletionWorker::Execute() {

  // Process media if any are provided
  if (!_media_paths.empty()) {
- const auto* mtmd_ctx = _sess->get_mtmd_ctx();
-
+ const auto *mtmd_ctx = _sess->get_mtmd_ctx();
+
  if (mtmd_ctx != nullptr) {
  // Process the media and get the tokens
  try {
- n_cur = processMediaPrompt(
- ctx,
- mtmd_ctx,
- _sess,
- _params,
- _media_paths
- );
- } catch (const std::exception& e) {
+ n_cur = processMediaPrompt(ctx, mtmd_ctx, _sess, _params, _media_paths);
+ } catch (const std::exception &e) {
  SetError(e.what());
  _sess->get_mutex().unlock();
  return;
  }
-
+
  if (n_cur <= 0) {
  SetError("Failed to process media");
  _sess->get_mutex().unlock();
  return;
  }

- fprintf(stdout, "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
- n_cur, _sess->tokens_ptr()->size());
+ fprintf(stdout,
+ "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
+ n_cur, _sess->tokens_ptr()->size());

  n_input = _sess->tokens_ptr()->size();
  if (n_cur == n_input) {
@@ -105,9 +99,10 @@ void LlamaCompletionWorker::Execute() {
  }
  } else {
  // Text-only path
- std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
+ std::vector<llama_token> prompt_tokens =
+ ::common_tokenize(ctx, _params.prompt, add_bos);
  n_input = prompt_tokens.size();
-
+
  if (_sess->tokens_ptr()->size() > 0) {
  n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
  if (n_cur == n_input) {
@@ -132,7 +127,7 @@ void LlamaCompletionWorker::Execute() {
  _result.context_full = true;
  break;
  }
-
+
  const int n_left = n_cur - n_keep - 1;
  const int n_discard = n_left / 2;

@@ -147,21 +142,27 @@ void LlamaCompletionWorker::Execute() {
  n_cur -= n_discard;
  _result.truncated = true;
  }
-
+
  // For multimodal input, n_past might already be set
  // Only decode text tokens if we have any input left
  if (n_input > 0) {
- int ret = llama_decode(
- ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+ int ret =
+ llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
  if (ret < 0) {
  SetError("Failed to decode token, code: " + std::to_string(ret));
  break;
  }
  }
-
+
  // sample the next token
- const llama_token new_token_id =
- common_sampler_sample(sampling.get(), ctx, -1);
+ llama_token new_token_id = common_sampler_sample(sampling.get(), ctx, -1);
+ if (_next_token_uses_guide_token && !_guide_tokens.empty() &&
+ !llama_vocab_is_control(vocab, new_token_id) &&
+ !llama_vocab_is_eog(vocab, new_token_id)) {
+ new_token_id = _guide_tokens[0];
+ _guide_tokens.erase(_guide_tokens.begin());
+ }
+ _next_token_uses_guide_token = (new_token_id == 198);
  common_sampler_accept(sampling.get(), new_token_id, true);
  // prepare the next batch
  embd->emplace_back(new_token_id);
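
The hunk above is the core of guide-token support: right after a newline is emitted, the sampler's pick is replaced with the next guide token, unless the sampled token is a control or end-of-generation token, so the generated audio codes stay aligned with the words the caller asked for. Token id 198 appears to be the newline token of the OuteTTS-style vocabulary; that reading and the helper names below are assumptions in this conceptual TypeScript restatement of the same rule (it is not code from the package):

    // Conceptual model of the override in LlamaCompletionWorker::Execute().
    const NEWLINE_TOKEN_ID = 198 // value hard-coded in the C++ above

    function pickNextToken(
      sampled: number,                        // token chosen by the regular sampler
      guideTokens: number[],                  // remaining guide tokens, consumed front to back
      useGuideToken: boolean,                 // true at the start and right after a newline
      isControlOrEog: (t: number) => boolean, // stand-in for llama_vocab_is_control / is_eog
    ): { token: number; useGuideTokenNext: boolean } {
      let token = sampled
      if (useGuideToken && guideTokens.length > 0 && !isControlOrEog(sampled)) {
        token = guideTokens.shift()!          // force the next expected word token
      }
      return { token, useGuideTokenNext: token === NEWLINE_TOKEN_ID }
    }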
@@ -214,20 +215,15 @@ void LlamaCompletionWorker::Execute() {
  void LlamaCompletionWorker::OnOK() {
  auto env = Napi::AsyncWorker::Env();
  auto result = Napi::Object::New(env);
- result.Set("tokens_evaluated", Napi::Number::New(env,
- _result.tokens_evaluated));
+ result.Set("tokens_evaluated",
+ Napi::Number::New(env, _result.tokens_evaluated));
  result.Set("tokens_predicted", Napi::Number::New(Napi::AsyncWorker::Env(),
  _result.tokens_predicted));
- result.Set("truncated",
- Napi::Boolean::New(env, _result.truncated));
- result.Set("context_full",
- Napi::Boolean::New(env, _result.context_full));
- result.Set("text",
- Napi::String::New(env, _result.text.c_str()));
- result.Set("stopped_eos",
- Napi::Boolean::New(env, _result.stopped_eos));
- result.Set("stopped_words",
- Napi::Boolean::New(env, _result.stopped_words));
+ result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
+ result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+ result.Set("text", Napi::String::New(env, _result.text.c_str()));
+ result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
+ result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
  result.Set("stopping_word",
  Napi::String::New(env, _result.stopping_word.c_str()));
  result.Set("stopped_limited",
@@ -238,7 +234,8 @@ void LlamaCompletionWorker::OnOK() {
  std::string content;
  if (!_stop) {
  try {
- common_chat_msg message = common_chat_parse(_result.text, static_cast<common_chat_format>(_chat_format));
+ common_chat_msg message = common_chat_parse(
+ _result.text, static_cast<common_chat_format>(_chat_format));
  if (!message.reasoning_content.empty()) {
  reasoning_content = message.reasoning_content;
  }
@@ -266,7 +263,8 @@ void LlamaCompletionWorker::OnOK() {
  result.Set("tool_calls", tool_calls);
  }
  if (!reasoning_content.empty()) {
- result.Set("reasoning_content", Napi::String::New(env, reasoning_content.c_str()));
+ result.Set("reasoning_content",
+ Napi::String::New(env, reasoning_content.c_str()));
  }
  if (!content.empty()) {
  result.Set("content", Napi::String::New(env, content.c_str()));
@@ -276,17 +274,33 @@ void LlamaCompletionWorker::OnOK() {
  const auto timings_token = llama_perf_context(ctx);

  auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
- timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
- timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
- timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
- timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
- timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
- timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
- timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
- timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+ timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.n_p_eval));
+ timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.t_p_eval_ms));
+ timingsResult.Set(
+ "prompt_per_token_ms",
+ Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.t_p_eval_ms / timings_token.n_p_eval));
+ timingsResult.Set("prompt_per_second",
+ Napi::Number::New(Napi::AsyncWorker::Env(),
+ 1e3 / timings_token.t_p_eval_ms *
+ timings_token.n_p_eval));
+ timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.n_eval));
+ timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.t_eval_ms));
+ timingsResult.Set(
+ "predicted_per_token_ms",
+ Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.t_eval_ms / timings_token.n_eval));
+ timingsResult.Set(
+ "predicted_per_second",
+ Napi::Number::New(Napi::AsyncWorker::Env(),
+ 1e3 / timings_token.t_eval_ms * timings_token.n_eval));

  result.Set("timings", timingsResult);
-
+
  Napi::Promise::Deferred::Resolve(result);
  }

package/src/LlamaCompletionWorker.h CHANGED
@@ -20,19 +20,16 @@ public:
  Napi::Function callback, common_params params,
  std::vector<std::string> stop_words,
  int32_t chat_format,
- std::vector<std::string> media_paths = {});
+ const std::vector<std::string> &media_paths = {},
+ const std::vector<llama_token> &guide_tokens = {});

  ~LlamaCompletionWorker();

  Napi::Promise GetPromise() { return Napi::Promise::Deferred::Promise(); }

- void OnComplete(std::function<void()> cb) {
- _onComplete = cb;
- }
+ void OnComplete(std::function<void()> cb) { _onComplete = cb; }

- void SetStop() {
- _stop = true;
- }
+ void SetStop() { _stop = true; }

  protected:
  void Execute() override;
@@ -45,10 +42,12 @@ private:
  std::vector<std::string> _stop_words;
  int32_t _chat_format;
  std::vector<std::string> _media_paths;
+ std::vector<llama_token> _guide_tokens;
  std::function<void()> _onComplete;
  bool _has_callback = false;
  bool _stop = false;
  Napi::ThreadSafeFunction _tsfn;
+ bool _next_token_uses_guide_token = true;
  struct {
  size_t tokens_evaluated = 0;
  size_t tokens_predicted = 0;