@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/LlamaCompletionWorker.cpp

@@ -1,7 +1,6 @@
 #include "LlamaCompletionWorker.h"
 #include "LlamaContext.h"
 
-
 size_t findStoppingStrings(const std::string &text,
                            const size_t last_token_size,
                            const std::vector<std::string> &stop_words) {
@@ -27,12 +26,12 @@ size_t findStoppingStrings(const std::string &text,
 LlamaCompletionWorker::LlamaCompletionWorker(
     const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
     Napi::Function callback, common_params params,
-    std::vector<std::string> stop_words,
-    int32_t chat_format,
-    std::vector<std::string> image_paths)
+    std::vector<std::string> stop_words, int32_t chat_format,
+    const std::vector<std::string> &media_paths,
+    const std::vector<llama_token> &guide_tokens)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
-      _image_paths(image_paths) {
+      _media_paths(media_paths), _guide_tokens(guide_tokens) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -64,34 +63,29 @@ void LlamaCompletionWorker::Execute() {
   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                             common_sampler_free};
 
-  // Process images if any are provided
-  if (!_image_paths.empty()) {
-    const auto* mtmd_ctx = _sess->get_mtmd_ctx();
-
+  // Process media if any are provided
+  if (!_media_paths.empty()) {
+    const auto *mtmd_ctx = _sess->get_mtmd_ctx();
+
     if (mtmd_ctx != nullptr) {
-      // Process the images and get the tokens
+      // Process the media and get the tokens
       try {
-        n_cur = process_image_prompt(
-          ctx,
-          mtmd_ctx,
-          _sess,
-          _params,
-          _image_paths
-        );
-      } catch (const std::exception& e) {
+        n_cur = processMediaPrompt(ctx, mtmd_ctx, _sess, _params, _media_paths);
+      } catch (const std::exception &e) {
         SetError(e.what());
         _sess->get_mutex().unlock();
         return;
       }
-
+
       if (n_cur <= 0) {
-        SetError("Failed to process images");
+        SetError("Failed to process media");
         _sess->get_mutex().unlock();
         return;
       }
 
-      fprintf(stdout, "[DEBUG] Image processing successful, n_cur=%zu, tokens=%zu\n",
-              n_cur, _sess->tokens_ptr()->size());
+      fprintf(stdout,
+              "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
+              n_cur, _sess->tokens_ptr()->size());
 
       n_input = _sess->tokens_ptr()->size();
       if (n_cur == n_input) {
@@ -105,9 +99,10 @@ void LlamaCompletionWorker::Execute() {
     }
   } else {
     // Text-only path
-    std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
+    std::vector<llama_token> prompt_tokens =
+        ::common_tokenize(ctx, _params.prompt, add_bos);
     n_input = prompt_tokens.size();
-
+
     if (_sess->tokens_ptr()->size() > 0) {
       n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
       if (n_cur == n_input) {
@@ -132,7 +127,7 @@ void LlamaCompletionWorker::Execute() {
       _result.context_full = true;
       break;
     }
-
+
     const int n_left = n_cur - n_keep - 1;
     const int n_discard = n_left / 2;
 
@@ -147,21 +142,27 @@ void LlamaCompletionWorker::Execute() {
       n_cur -= n_discard;
       _result.truncated = true;
     }
-
+
    // For multimodal input, n_past might already be set
    // Only decode text tokens if we have any input left
    if (n_input > 0) {
-      int ret = llama_decode(
-          ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+      int ret =
+          llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
      if (ret < 0) {
        SetError("Failed to decode token, code: " + std::to_string(ret));
        break;
      }
    }
-
+
    // sample the next token
-    const llama_token new_token_id =
-        common_sampler_sample(sampling.get(), ctx, -1);
+    llama_token new_token_id = common_sampler_sample(sampling.get(), ctx, -1);
+    if (_next_token_uses_guide_token && !_guide_tokens.empty() &&
+        !llama_vocab_is_control(vocab, new_token_id) &&
+        !llama_vocab_is_eog(vocab, new_token_id)) {
+      new_token_id = _guide_tokens[0];
+      _guide_tokens.erase(_guide_tokens.begin());
+    }
+    _next_token_uses_guide_token = (new_token_id == 198);
    common_sampler_accept(sampling.get(), new_token_id, true);
    // prepare the next batch
    embd->emplace_back(new_token_id);
@@ -214,20 +215,15 @@ void LlamaCompletionWorker::Execute() {
 void LlamaCompletionWorker::OnOK() {
   auto env = Napi::AsyncWorker::Env();
   auto result = Napi::Object::New(env);
-  result.Set("tokens_evaluated", Napi::Number::New(env,
-  _result.tokens_evaluated));
+  result.Set("tokens_evaluated",
+             Napi::Number::New(env, _result.tokens_evaluated));
   result.Set("tokens_predicted", Napi::Number::New(Napi::AsyncWorker::Env(),
                                                    _result.tokens_predicted));
-  result.Set("truncated",
-             Napi::Boolean::New(env, _result.truncated));
-  result.Set("context_full",
-             Napi::Boolean::New(env, _result.context_full));
-  result.Set("text",
-             Napi::String::New(env, _result.text.c_str()));
-  result.Set("stopped_eos",
-             Napi::Boolean::New(env, _result.stopped_eos));
-  result.Set("stopped_words",
-             Napi::Boolean::New(env, _result.stopped_words));
+  result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
+  result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+  result.Set("text", Napi::String::New(env, _result.text.c_str()));
+  result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
+  result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
   result.Set("stopping_word",
              Napi::String::New(env, _result.stopping_word.c_str()));
   result.Set("stopped_limited",
@@ -238,7 +234,8 @@ void LlamaCompletionWorker::OnOK() {
   std::string content;
   if (!_stop) {
     try {
-      common_chat_msg message = common_chat_parse(_result.text, static_cast<common_chat_format>(_chat_format));
+      common_chat_msg message = common_chat_parse(
+          _result.text, static_cast<common_chat_format>(_chat_format));
       if (!message.reasoning_content.empty()) {
         reasoning_content = message.reasoning_content;
       }
@@ -266,7 +263,8 @@ void LlamaCompletionWorker::OnOK() {
     result.Set("tool_calls", tool_calls);
   }
   if (!reasoning_content.empty()) {
-    result.Set("reasoning_content", Napi::String::New(env, reasoning_content.c_str()));
+    result.Set("reasoning_content",
+               Napi::String::New(env, reasoning_content.c_str()));
   }
   if (!content.empty()) {
     result.Set("content", Napi::String::New(env, content.c_str()));
@@ -276,17 +274,33 @@ void LlamaCompletionWorker::OnOK() {
   const auto timings_token = llama_perf_context(ctx);
 
   auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
-  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
-  timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
-  timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
-  timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
-  timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
-  timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
-  timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
-  timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                  timings_token.n_p_eval));
+  timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                   timings_token.t_p_eval_ms));
+  timingsResult.Set(
+      "prompt_per_token_ms",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        timings_token.t_p_eval_ms / timings_token.n_p_eval));
+  timingsResult.Set("prompt_per_second",
+                    Napi::Number::New(Napi::AsyncWorker::Env(),
+                                      1e3 / timings_token.t_p_eval_ms *
+                                          timings_token.n_p_eval));
+  timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                     timings_token.n_eval));
+  timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                      timings_token.t_eval_ms));
+  timingsResult.Set(
+      "predicted_per_token_ms",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        timings_token.t_eval_ms / timings_token.n_eval));
+  timingsResult.Set(
+      "predicted_per_second",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        1e3 / timings_token.t_eval_ms * timings_token.n_eval));
 
   result.Set("timings", timingsResult);
-
+
   Napi::Promise::Deferred::Resolve(result);
 }
 
package/src/LlamaCompletionWorker.h

@@ -20,19 +20,16 @@ public:
                         Napi::Function callback, common_params params,
                         std::vector<std::string> stop_words,
                         int32_t chat_format,
-                        std::vector<std::string> image_paths = {});
+                        const std::vector<std::string> &media_paths = {},
+                        const std::vector<llama_token> &guide_tokens = {});
 
   ~LlamaCompletionWorker();
 
   Napi::Promise GetPromise() { return Napi::Promise::Deferred::Promise(); }
 
-  void OnComplete(std::function<void()> cb) {
-    _onComplete = cb;
-  }
+  void OnComplete(std::function<void()> cb) { _onComplete = cb; }
 
-  void SetStop() {
-    _stop = true;
-  }
+  void SetStop() { _stop = true; }
 
 protected:
   void Execute() override;
@@ -44,11 +41,13 @@ private:
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
-  std::vector<std::string> _image_paths;
+  std::vector<std::string> _media_paths;
+  std::vector<llama_token> _guide_tokens;
   std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _stop = false;
   Napi::ThreadSafeFunction _tsfn;
+  bool _next_token_uses_guide_token = true;
   struct {
     size_t tokens_evaluated = 0;
     size_t tokens_predicted = 0;
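
For reference, a minimal standalone sketch of the guide-token pattern introduced in the Execute() hunk above: after each sampled token, the next pending guide token may override the sample, and the override re-arms whenever a newline token is emitted (token id 198 in the vocabulary this code targets). The GuidedSampler type, the is_control/is_eog stand-ins, and the concrete token ids are illustrative assumptions, not part of the package; in the package itself the guide tokens arrive through the new guide_tokens constructor parameter and the _guide_tokens / _next_token_uses_guide_token members shown in the diff.

// Illustrative sketch only, not the package's actual code.
#include <cstdio>
#include <vector>

using token_id = int;

// Stand-ins for the llama_vocab_is_control / llama_vocab_is_eog checks.
static bool is_control(token_id t) { return t < 0; }
static bool is_eog(token_id t) { return t == 2; }

struct GuidedSampler {
  std::vector<token_id> guide_tokens; // tokens to force, in order
  bool next_uses_guide = true;        // armed at the start of generation

  // Optionally replace the sampled token with the next pending guide token,
  // then re-arm the override if the emitted token is a newline.
  token_id apply(token_id sampled) {
    if (next_uses_guide && !guide_tokens.empty() && !is_control(sampled) &&
        !is_eog(sampled)) {
      sampled = guide_tokens.front();
      guide_tokens.erase(guide_tokens.begin());
    }
    next_uses_guide = (sampled == 198); // 198 = '\n' in the assumed vocab
    return sampled;
  }
};

int main() {
  GuidedSampler g{{1001, 1002}};
  // Pretend the model sampled 500, then 198 (newline), then 600.
  for (token_id t : {500, 198, 600}) {
    std::printf("emit %d\n", g.apply(t));
  }
  // Prints: emit 1001, emit 198, emit 1002
  return 0;
}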