@fugood/llama.node 0.4.7 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +4 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.ts +66 -6
- package/lib/index.js +59 -17
- package/lib/index.ts +74 -23
- package/package.json +1 -1
- package/src/DecodeAudioTokenWorker.cpp +40 -0
- package/src/DecodeAudioTokenWorker.h +22 -0
- package/src/EmbeddingWorker.cpp +7 -5
- package/src/LlamaCompletionWorker.cpp +68 -54
- package/src/LlamaCompletionWorker.h +7 -8
- package/src/LlamaContext.cpp +551 -235
- package/src/LlamaContext.h +26 -4
- package/src/LoadSessionWorker.cpp +4 -2
- package/src/SaveSessionWorker.cpp +10 -6
- package/src/TokenizeWorker.cpp +23 -14
- package/src/TokenizeWorker.h +2 -2
- package/src/addons.cc +8 -11
- package/src/common.hpp +129 -126
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
- package/src/tts_utils.cpp +342 -0
- package/src/tts_utils.h +62 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/LlamaCompletionWorker.cpp

@@ -1,7 +1,6 @@
 #include "LlamaCompletionWorker.h"
 #include "LlamaContext.h"
 
-
 size_t findStoppingStrings(const std::string &text,
                            const size_t last_token_size,
                            const std::vector<std::string> &stop_words) {
@@ -27,12 +26,12 @@ size_t findStoppingStrings(const std::string &text,
 LlamaCompletionWorker::LlamaCompletionWorker(
     const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
     Napi::Function callback, common_params params,
-    std::vector<std::string> stop_words,
-
-    std::vector<
+    std::vector<std::string> stop_words, int32_t chat_format,
+    const std::vector<std::string> &media_paths,
+    const std::vector<llama_token> &guide_tokens)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
-
+      _media_paths(media_paths), _guide_tokens(guide_tokens) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -64,34 +63,29 @@ void LlamaCompletionWorker::Execute() {
   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                             common_sampler_free};
 
-  // Process
-  if (!
-  const auto*
-
+  // Process media if any are provided
+  if (!_media_paths.empty()) {
+    const auto *mtmd_ctx = _sess->get_mtmd_ctx();
+
     if (mtmd_ctx != nullptr) {
-      // Process the
+      // Process the media and get the tokens
       try {
-        n_cur =
-
-        mtmd_ctx,
-        _sess,
-        _params,
-        _image_paths
-        );
-      } catch (const std::exception& e) {
+        n_cur = processMediaPrompt(ctx, mtmd_ctx, _sess, _params, _media_paths);
+      } catch (const std::exception &e) {
         SetError(e.what());
         _sess->get_mutex().unlock();
         return;
       }
-
+
       if (n_cur <= 0) {
-        SetError("Failed to process
+        SetError("Failed to process media");
         _sess->get_mutex().unlock();
         return;
       }
 
-      fprintf(stdout,
-
+      fprintf(stdout,
+              "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
+              n_cur, _sess->tokens_ptr()->size());
 
       n_input = _sess->tokens_ptr()->size();
       if (n_cur == n_input) {
@@ -105,9 +99,10 @@ void LlamaCompletionWorker::Execute() {
     }
   } else {
     // Text-only path
-    std::vector<llama_token> prompt_tokens =
+    std::vector<llama_token> prompt_tokens =
+        ::common_tokenize(ctx, _params.prompt, add_bos);
     n_input = prompt_tokens.size();
-
+
     if (_sess->tokens_ptr()->size() > 0) {
       n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
       if (n_cur == n_input) {
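Note: the text-only path above retokenizes the prompt and then reuses whatever prefix of the cached session tokens already matches it (via common_tokens_part), so only the remaining suffix has to be decoded. A minimal sketch of that prefix-matching step, using a hypothetical standalone helper with the behavior common_tokens_part appears to have:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical helper: length of the common prefix of two token sequences.
// common_tokens_part() in the package's common.hpp is assumed to behave like this.
static size_t common_prefix_len(const std::vector<int32_t> &cached,
                                const std::vector<int32_t> &prompt) {
  size_t n = 0;
  while (n < cached.size() && n < prompt.size() && cached[n] == prompt[n]) {
    ++n;
  }
  return n;
}

// Usage sketch: the first n_cur tokens are already in the KV cache and can be
// skipped; only prompt_tokens[n_cur..] still needs to be decoded.
// size_t n_cur = common_prefix_len(session_tokens, prompt_tokens);
```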
@@ -132,7 +127,7 @@ void LlamaCompletionWorker::Execute() {
       _result.context_full = true;
       break;
     }
-
+
     const int n_left = n_cur - n_keep - 1;
     const int n_discard = n_left / 2;
 
@@ -147,21 +142,27 @@ void LlamaCompletionWorker::Execute() {
       n_cur -= n_discard;
       _result.truncated = true;
     }
-
+
     // For multimodal input, n_past might already be set
     // Only decode text tokens if we have any input left
     if (n_input > 0) {
-      int ret =
-          ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+      int ret =
+          llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
       if (ret < 0) {
         SetError("Failed to decode token, code: " + std::to_string(ret));
         break;
       }
     }
-
+
     // sample the next token
-
-
+    llama_token new_token_id = common_sampler_sample(sampling.get(), ctx, -1);
+    if (_next_token_uses_guide_token && !_guide_tokens.empty() &&
+        !llama_vocab_is_control(vocab, new_token_id) &&
+        !llama_vocab_is_eog(vocab, new_token_id)) {
+      new_token_id = _guide_tokens[0];
+      _guide_tokens.erase(_guide_tokens.begin());
+    }
+    _next_token_uses_guide_token = (new_token_id == 198);
     common_sampler_accept(sampling.get(), new_token_id, true);
     // prepare the next batch
     embd->emplace_back(new_token_id);
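Note: the new sampling block adds a guide-token override. While the override is armed and guide tokens remain, the freshly sampled token is replaced by the next guide token unless the sampler produced a control or end-of-generation token, and the override re-arms whenever token id 198 is emitted (apparently a newline in the target vocabulary). Guide tokens arrive through the new constructor parameter and presumably serve the TTS support added elsewhere in this release (src/tts_utils.cpp, DecodeAudioTokenWorker). A standalone sketch of that state machine, with hypothetical names and plain int32_t tokens:

```cpp
#include <cstdint>
#include <vector>

// Illustrative state for the guide-token override; names are hypothetical.
struct GuideState {
  std::vector<int32_t> guide_tokens;  // remaining tokens to steer toward
  bool next_uses_guide = true;        // armed at the start of generation
};

// Replace the sampled token with the next guide token when armed, unless the
// caller reports that the sampled token is a control/EOG token that must pass
// through untouched.
static int32_t apply_guide(GuideState &st, int32_t sampled,
                           bool is_control_or_eog) {
  int32_t out = sampled;
  if (st.next_uses_guide && !st.guide_tokens.empty() && !is_control_or_eog) {
    out = st.guide_tokens.front();
    st.guide_tokens.erase(st.guide_tokens.begin());
  }
  // Re-arm after a newline-like token (id 198 in the diff above).
  st.next_uses_guide = (out == 198);
  return out;
}
```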
@@ -214,20 +215,15 @@ void LlamaCompletionWorker::Execute() {
 void LlamaCompletionWorker::OnOK() {
   auto env = Napi::AsyncWorker::Env();
   auto result = Napi::Object::New(env);
-  result.Set("tokens_evaluated",
-
+  result.Set("tokens_evaluated",
+             Napi::Number::New(env, _result.tokens_evaluated));
   result.Set("tokens_predicted", Napi::Number::New(Napi::AsyncWorker::Env(),
                                                    _result.tokens_predicted));
-  result.Set("truncated",
-
-  result.Set("
-
-  result.Set("
-             Napi::String::New(env, _result.text.c_str()));
-  result.Set("stopped_eos",
-             Napi::Boolean::New(env, _result.stopped_eos));
-  result.Set("stopped_words",
-             Napi::Boolean::New(env, _result.stopped_words));
+  result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
+  result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+  result.Set("text", Napi::String::New(env, _result.text.c_str()));
+  result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
+  result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
   result.Set("stopping_word",
              Napi::String::New(env, _result.stopping_word.c_str()));
   result.Set("stopped_limited",
@@ -238,7 +234,8 @@ void LlamaCompletionWorker::OnOK() {
   std::string content;
   if (!_stop) {
     try {
-      common_chat_msg message = common_chat_parse(
+      common_chat_msg message = common_chat_parse(
+          _result.text, static_cast<common_chat_format>(_chat_format));
       if (!message.reasoning_content.empty()) {
         reasoning_content = message.reasoning_content;
       }
@@ -266,7 +263,8 @@ void LlamaCompletionWorker::OnOK() {
     result.Set("tool_calls", tool_calls);
   }
   if (!reasoning_content.empty()) {
-    result.Set("reasoning_content",
+    result.Set("reasoning_content",
+               Napi::String::New(env, reasoning_content.c_str()));
   }
   if (!content.empty()) {
     result.Set("content", Napi::String::New(env, content.c_str()));
@@ -276,17 +274,33 @@ void LlamaCompletionWorker::OnOK() {
   const auto timings_token = llama_perf_context(ctx);
 
   auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
-  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(),
-
-  timingsResult.Set("
-
-  timingsResult.Set(
-
-
-
+  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                  timings_token.n_p_eval));
+  timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                   timings_token.t_p_eval_ms));
+  timingsResult.Set(
+      "prompt_per_token_ms",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        timings_token.t_p_eval_ms / timings_token.n_p_eval));
+  timingsResult.Set("prompt_per_second",
+                    Napi::Number::New(Napi::AsyncWorker::Env(),
+                                      1e3 / timings_token.t_p_eval_ms *
+                                          timings_token.n_p_eval));
+  timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                     timings_token.n_eval));
+  timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                      timings_token.t_eval_ms));
+  timingsResult.Set(
+      "predicted_per_token_ms",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        timings_token.t_eval_ms / timings_token.n_eval));
+  timingsResult.Set(
+      "predicted_per_second",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        1e3 / timings_token.t_eval_ms * timings_token.n_eval));
 
   result.Set("timings", timingsResult);
-
+
   Napi::Promise::Deferred::Resolve(result);
 }
 
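Note: the expanded timings object is derived entirely from llama_perf_context_data: n_p_eval/t_p_eval_ms for the prompt phase and n_eval/t_eval_ms for generation, with per-token and per-second figures computed from them. A small sketch of the same arithmetic outside the N-API glue; the helper name is made up, but llama_perf_context() and its fields are the ones used above:

```cpp
#include <cstdio>
#include "llama.h"

// Print prompt/generation throughput from a context's perf counters, using the
// same formulas as the timings object built above.
// (Assumes at least one token was evaluated in each phase.)
static void print_throughput(const llama_context *ctx) {
  const llama_perf_context_data t = llama_perf_context(ctx);
  std::printf("prompt:    %d tokens, %.2f ms/token, %.2f tokens/s\n",
              t.n_p_eval, t.t_p_eval_ms / t.n_p_eval,
              1e3 / t.t_p_eval_ms * t.n_p_eval);
  std::printf("generated: %d tokens, %.2f ms/token, %.2f tokens/s\n",
              t.n_eval, t.t_eval_ms / t.n_eval,
              1e3 / t.t_eval_ms * t.n_eval);
}
```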
package/src/LlamaCompletionWorker.h

@@ -20,19 +20,16 @@ public:
                         Napi::Function callback, common_params params,
                         std::vector<std::string> stop_words,
                         int32_t chat_format,
-                        std::vector<std::string>
+                        const std::vector<std::string> &media_paths = {},
+                        const std::vector<llama_token> &guide_tokens = {});
 
   ~LlamaCompletionWorker();
 
   Napi::Promise GetPromise() { return Napi::Promise::Deferred::Promise(); }
 
-  void OnComplete(std::function<void()> cb) {
-    _onComplete = cb;
-  }
+  void OnComplete(std::function<void()> cb) { _onComplete = cb; }
 
-  void SetStop() {
-    _stop = true;
-  }
+  void SetStop() { _stop = true; }
 
 protected:
   void Execute() override;
@@ -44,11 +41,13 @@ private:
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
-  std::vector<std::string>
+  std::vector<std::string> _media_paths;
+  std::vector<llama_token> _guide_tokens;
   std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _stop = false;
   Napi::ThreadSafeFunction _tsfn;
+  bool _next_token_uses_guide_token = true;
   struct {
     size_t tokens_evaluated = 0;
     size_t tokens_predicted = 0;