@fugood/llama.node 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +4 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.ts +46 -0
- package/lib/index.js +18 -0
- package/lib/index.ts +24 -0
- package/package.json +1 -1
- package/src/DecodeAudioTokenWorker.cpp +40 -0
- package/src/DecodeAudioTokenWorker.h +22 -0
- package/src/EmbeddingWorker.cpp +7 -5
- package/src/LlamaCompletionWorker.cpp +64 -50
- package/src/LlamaCompletionWorker.h +6 -7
- package/src/LlamaContext.cpp +519 -222
- package/src/LlamaContext.h +25 -4
- package/src/LoadSessionWorker.cpp +4 -2
- package/src/SaveSessionWorker.cpp +10 -6
- package/src/TokenizeWorker.cpp +10 -5
- package/src/addons.cc +8 -11
- package/src/common.hpp +92 -93
- package/src/tts_utils.cpp +342 -0
- package/src/tts_utils.h +62 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
package/CMakeLists.txt
CHANGED
@@ -102,6 +102,10 @@ file(
   "src/LoadSessionWorker.h"
   "src/SaveSessionWorker.cpp"
   "src/SaveSessionWorker.h"
+  "src/DecodeAudioTokenWorker.cpp"
+  "src/DecodeAudioTokenWorker.h"
+  "src/tts_utils.cpp"
+  "src/tts_utils.h"
 )

 add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})

Binary files (the prebuilt llama-node.node binaries for darwin, linux, linux-cuda, and linux-vulkan listed above) changed; contents not shown.

package/lib/binding.ts
CHANGED
@@ -114,6 +114,11 @@ export type LlamaCompletionOptions = {
    * Supports both file paths and base64 data URLs.
    */
   media_paths?: string | string[]
+  /**
+   * Guide tokens to use for audio completion.
+   * Help prevent hallucinations by forcing the TTS to use the correct words.
+   */
+  guide_tokens?: Int32Array
 }

 export type LlamaCompletionResult = {
@@ -208,6 +213,47 @@ export interface LlamaContext {
    */
   releaseMultimodal(): Promise<void>

+  /**
+   * Load a vocoder model
+   * @param path Path to the vocoder model
+   * @returns Promise resolving to true if loading was successful
+   */
+  initVocoder(path: string): Promise<boolean>
+
+  /**
+   * Unload the vocoder model
+   * @returns Promise resolving to true if unloading was successful
+   */
+  releaseVocoder(): Promise<void>
+
+  /**
+   * Check if the vocoder model is enabled
+   * @returns Promise resolving to true if the vocoder model is enabled
+   */
+  isVocoderEnabled(): boolean
+
+  /**
+   * Get the formatted prompt for audio completion
+   * @param speaker Speaker name or null
+   * @param text Text to complete
+   * @returns Formatted audio completion
+   */
+  getFormattedAudioCompletion(speaker: string|null, text: string): string
+
+  /**
+   * Get guide tokens for audio completion
+   * @param text Text to complete
+   * @returns Guide tokens
+   */
+  getAudioCompletionGuideTokens(text: string): Int32Array
+
+  /**
+   * Decode audio tokens to audio data
+   * @param tokens Tokens to decode
+   * @returns Decoded audio tokens
+   */
+  decodeAudioTokens(tokens: Int32Array): Promise<Float32Array>
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<Object>
   toggleNativeLog(
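
The methods above give LlamaContext a complete vocoder lifecycle alongside the TTS prompt and guide-token helpers. A minimal, hedged sketch of how the lifecycle composes — assuming the LlamaContext type is re-exported from the package entry point like the other binding types, with a placeholder vocoder path and illustrative error handling:

import type { LlamaContext } from '@fugood/llama.node'

// Sketch only: `ctx` is an already-initialized context; the vocoder path is a placeholder.
async function withVocoder(ctx: LlamaContext, vocoderPath: string): Promise<void> {
  const ok = await ctx.initVocoder(vocoderPath)      // load the vocoder model
  if (!ok || !ctx.isVocoderEnabled()) {
    throw new Error('vocoder failed to load')
  }
  try {
    // ... build a TTS prompt, run completion, decode audio tokens ...
  } finally {
    await ctx.releaseVocoder()                        // always unload when done
  }
}
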
package/lib/index.js
CHANGED
@@ -204,6 +204,24 @@ class LlamaContextWrapper {
    getMultimodalSupport() {
        return this.ctx.getMultimodalSupport();
    }
+    initVocoder(path) {
+        return this.ctx.initVocoder(path);
+    }
+    releaseVocoder() {
+        return this.ctx.releaseVocoder();
+    }
+    isVocoderEnabled() {
+        return this.ctx.isVocoderEnabled();
+    }
+    getFormattedAudioCompletion(speaker, text) {
+        return this.ctx.getFormattedAudioCompletion(speaker, text);
+    }
+    getAudioCompletionGuideTokens(text) {
+        return this.ctx.getAudioCompletionGuideTokens(text);
+    }
+    decodeAudioTokens(tokens) {
+        return this.ctx.decodeAudioTokens(tokens);
+    }
 }
 const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts
CHANGED
@@ -269,6 +269,30 @@ class LlamaContextWrapper {
   }> {
     return this.ctx.getMultimodalSupport()
   }
+
+  initVocoder(path: string): Promise<boolean> {
+    return this.ctx.initVocoder(path)
+  }
+
+  releaseVocoder(): Promise<void> {
+    return this.ctx.releaseVocoder()
+  }
+
+  isVocoderEnabled(): boolean {
+    return this.ctx.isVocoderEnabled()
+  }
+
+  getFormattedAudioCompletion(speaker: string|null, text: string): string {
+    return this.ctx.getFormattedAudioCompletion(speaker, text)
+  }
+
+  getAudioCompletionGuideTokens(text: string): Int32Array {
+    return this.ctx.getAudioCompletionGuideTokens(text)
+  }
+
+  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+    return this.ctx.decodeAudioTokens(tokens)
+  }
 }

 export const loadModel = async (
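
Put together, the new wrapper methods map onto a straightforward TTS flow. A hedged end-to-end sketch: the model and vocoder paths are placeholders, the loadModel and completion option shapes beyond guide_tokens (model, prompt, n_predict) follow the pre-existing API and are assumed here, and this diff does not show how the generated audio-token ids are collected from the completion output, so that step is left as a parameter:

import { loadModel } from '@fugood/llama.node'

const ctx = await loadModel({ model: './tts-model.gguf' })        // placeholder path
await ctx.initVocoder('./vocoder.gguf')                           // placeholder path

const text = 'Hello from llama.node'
const prompt = ctx.getFormattedAudioCompletion(null, text)        // null = default speaker
const guideTokens = ctx.getAudioCompletionGuideTokens(text)

// guide_tokens steers sampling so the spoken words stay on-script.
await ctx.completion({ prompt, guide_tokens: guideTokens, n_predict: 4096 })

// Once the generated audio-token ids have been collected (not shown in this extract):
async function toPcm(audioTokens: Int32Array): Promise<Float32Array> {
  return ctx.decodeAudioTokens(audioTokens)                       // raw PCM samples
}
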
package/package.json
CHANGED
(+1 −1; diff body not included in this extract — presumably the version bump to 0.6.0)

package/src/DecodeAudioTokenWorker.cpp
ADDED
@@ -0,0 +1,40 @@
+#include "DecodeAudioTokenWorker.h"
+#include "tts_utils.h"
+#include <vector>
+
+DecodeAudioTokenWorker::DecodeAudioTokenWorker(
+    const Napi::CallbackInfo &info, llama_model *model, llama_context *ctx,
+    int n_threads, const std::vector<llama_token> &tokens)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _model(model), _ctx(ctx),
+      _n_threads(n_threads), _tokens(tokens) {}
+
+void DecodeAudioTokenWorker::Execute() {
+  const int n_codes = _tokens.size();
+  llama_batch batch = llama_batch_init(n_codes, 0, 1);
+  for (size_t i = 0; i < _tokens.size(); ++i) {
+    common_batch_add(batch, _tokens[i], i, {0}, true);
+  }
+  if (batch.n_tokens != n_codes) {
+    SetError("batch.n_tokens != n_codes");
+    return;
+  }
+  if (llama_encode(_ctx, batch) != 0) {
+    SetError("llama_encode() failed");
+    return;
+  }
+  llama_synchronize(_ctx);
+  const int n_embd = llama_model_n_embd(_model);
+  const float *embd = llama_get_embeddings(_ctx);
+  _result = embd_to_audio(embd, n_codes, n_embd, _n_threads);
+}
+
+void DecodeAudioTokenWorker::OnOK() {
+  auto result =
+      Napi::Float32Array::New(Napi::AsyncWorker::Env(), _result.size());
+  memcpy(result.Data(), _result.data(), _result.size() * sizeof(float));
+  Napi::Promise::Deferred::Resolve(result);
+}
+
+void DecodeAudioTokenWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}

package/src/DecodeAudioTokenWorker.h
ADDED
@@ -0,0 +1,22 @@
+#include "common.hpp"
+#include <vector>
+
+class DecodeAudioTokenWorker : public Napi::AsyncWorker,
+                               public Napi::Promise::Deferred {
+public:
+  DecodeAudioTokenWorker(const Napi::CallbackInfo &info, llama_model *model,
+                         llama_context *ctx, int n_threads,
+                         const std::vector<llama_token> &tokens);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  llama_model *_model;
+  llama_context *_ctx;
+  int _n_threads;
+  std::vector<llama_token> _tokens;
+  std::vector<float> _result;
+};
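
On the JS side, decodeAudioTokens() therefore resolves to raw PCM samples (the Float32Array filled in OnOK above). A small, hedged helper for saving that output as a mono 16-bit WAV file; the 24 kHz sample rate is an assumption typical of WavTokenizer-style vocoders, not something this diff states:

import { writeFileSync } from 'node:fs'

function writeWav(path: string, samples: Float32Array, sampleRate = 24000): void {
  // Convert float samples in [-1, 1] to signed 16-bit PCM.
  const data = Buffer.alloc(samples.length * 2)
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]))
    data.writeInt16LE(Math.round(s * 32767), i * 2)
  }
  // Minimal 44-byte RIFF/WAVE header for mono 16-bit PCM.
  const header = Buffer.alloc(44)
  header.write('RIFF', 0)
  header.writeUInt32LE(36 + data.length, 4)
  header.write('WAVE', 8)
  header.write('fmt ', 12)
  header.writeUInt32LE(16, 16)              // fmt chunk size
  header.writeUInt16LE(1, 20)               // format: PCM
  header.writeUInt16LE(1, 22)               // channels: mono
  header.writeUInt32LE(sampleRate, 24)
  header.writeUInt32LE(sampleRate * 2, 28)  // byte rate
  header.writeUInt16LE(2, 32)               // block align
  header.writeUInt16LE(16, 34)              // bits per sample
  header.write('data', 36)
  header.writeUInt32LE(data.length, 40)
  writeFileSync(path, Buffer.concat([header, data]))
}
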
package/src/EmbeddingWorker.cpp
CHANGED
@@ -2,8 +2,10 @@
 #include "LlamaContext.h"

 EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
-                                 LlamaSessionPtr &sess, std::string text,
-
+                                 LlamaSessionPtr &sess, std::string text,
+                                 common_params &params)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text),
+      _params(params) {}

 void EmbeddingWorker::Execute() {
   llama_kv_self_clear(_sess->context());
@@ -17,8 +19,7 @@ void EmbeddingWorker::Execute() {
   do {
     auto ctx = _sess->context();
     int ret =
-        llama_decode(ctx,
-                     llama_batch_get_one(tokens.data(), tokens.size()));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));
     if (ret < 0) {
       SetError("Failed to inference, code: " + std::to_string(ret));
       break;
@@ -37,7 +38,8 @@ void EmbeddingWorker::Execute() {
     }
     _result.embedding.resize(n_embd);
     std::vector<float> embedding(embd, embd + n_embd), out(embd, embd + n_embd);
-
+    common_embd_normalize(embedding.data(), out.data(), n_embd,
+                          _params.embd_normalize);
     memcpy(_result.embedding.data(), out.data(), n_embd * sizeof(float));
   } while (false);
 }
package/src/LlamaCompletionWorker.cpp
CHANGED

@@ -1,7 +1,6 @@
 #include "LlamaCompletionWorker.h"
 #include "LlamaContext.h"

-
 size_t findStoppingStrings(const std::string &text,
                            const size_t last_token_size,
                            const std::vector<std::string> &stop_words) {
@@ -27,12 +26,12 @@ size_t findStoppingStrings(const std::string &text,
 LlamaCompletionWorker::LlamaCompletionWorker(
     const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
     Napi::Function callback, common_params params,
-    std::vector<std::string> stop_words,
-
-    std::vector<
+    std::vector<std::string> stop_words, int32_t chat_format,
+    const std::vector<std::string> &media_paths,
+    const std::vector<llama_token> &guide_tokens)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
-      _media_paths(media_paths) {
+      _media_paths(media_paths), _guide_tokens(guide_tokens) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -66,32 +65,27 @@ void LlamaCompletionWorker::Execute() {

   // Process media if any are provided
   if (!_media_paths.empty()) {
-    const auto*
-
+    const auto *mtmd_ctx = _sess->get_mtmd_ctx();
+
     if (mtmd_ctx != nullptr) {
       // Process the media and get the tokens
       try {
-        n_cur = processMediaPrompt(
-
-            mtmd_ctx,
-            _sess,
-            _params,
-            _media_paths
-        );
-      } catch (const std::exception& e) {
+        n_cur = processMediaPrompt(ctx, mtmd_ctx, _sess, _params, _media_paths);
+      } catch (const std::exception &e) {
         SetError(e.what());
         _sess->get_mutex().unlock();
         return;
       }
-
+
       if (n_cur <= 0) {
         SetError("Failed to process media");
        _sess->get_mutex().unlock();
         return;
       }

-      fprintf(stdout,
-
+      fprintf(stdout,
+              "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
+              n_cur, _sess->tokens_ptr()->size());

       n_input = _sess->tokens_ptr()->size();
       if (n_cur == n_input) {
@@ -105,9 +99,10 @@ void LlamaCompletionWorker::Execute() {
     }
   } else {
     // Text-only path
-    std::vector<llama_token> prompt_tokens =
+    std::vector<llama_token> prompt_tokens =
+        ::common_tokenize(ctx, _params.prompt, add_bos);
     n_input = prompt_tokens.size();
-
+
     if (_sess->tokens_ptr()->size() > 0) {
       n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
       if (n_cur == n_input) {
@@ -132,7 +127,7 @@ void LlamaCompletionWorker::Execute() {
       _result.context_full = true;
       break;
     }
-
+
     const int n_left = n_cur - n_keep - 1;
     const int n_discard = n_left / 2;

@@ -147,21 +142,27 @@ void LlamaCompletionWorker::Execute() {
       n_cur -= n_discard;
      _result.truncated = true;
     }
-
+
     // For multimodal input, n_past might already be set
     // Only decode text tokens if we have any input left
     if (n_input > 0) {
-      int ret =
-          ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+      int ret =
+          llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
       if (ret < 0) {
         SetError("Failed to decode token, code: " + std::to_string(ret));
         break;
       }
     }
-
+
     // sample the next token
-
-
+    llama_token new_token_id = common_sampler_sample(sampling.get(), ctx, -1);
+    if (_next_token_uses_guide_token && !_guide_tokens.empty() &&
+        !llama_vocab_is_control(vocab, new_token_id) &&
+        !llama_vocab_is_eog(vocab, new_token_id)) {
+      new_token_id = _guide_tokens[0];
+      _guide_tokens.erase(_guide_tokens.begin());
+    }
+    _next_token_uses_guide_token = (new_token_id == 198);
     common_sampler_accept(sampling.get(), new_token_id, true);
     // prepare the next batch
     embd->emplace_back(new_token_id);
@@ -214,20 +215,15 @@
 void LlamaCompletionWorker::OnOK() {
   auto env = Napi::AsyncWorker::Env();
   auto result = Napi::Object::New(env);
-  result.Set("tokens_evaluated",
-
+  result.Set("tokens_evaluated",
+             Napi::Number::New(env, _result.tokens_evaluated));
   result.Set("tokens_predicted", Napi::Number::New(Napi::AsyncWorker::Env(),
                                                    _result.tokens_predicted));
-  result.Set("truncated",
-
-  result.Set("
-
-  result.Set("
-             Napi::String::New(env, _result.text.c_str()));
-  result.Set("stopped_eos",
-             Napi::Boolean::New(env, _result.stopped_eos));
-  result.Set("stopped_words",
-             Napi::Boolean::New(env, _result.stopped_words));
+  result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
+  result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+  result.Set("text", Napi::String::New(env, _result.text.c_str()));
+  result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
+  result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
   result.Set("stopping_word",
              Napi::String::New(env, _result.stopping_word.c_str()));
   result.Set("stopped_limited",
@@ -238,7 +234,8 @@ void LlamaCompletionWorker::OnOK() {
   std::string content;
   if (!_stop) {
     try {
-      common_chat_msg message = common_chat_parse(
+      common_chat_msg message = common_chat_parse(
+          _result.text, static_cast<common_chat_format>(_chat_format));
       if (!message.reasoning_content.empty()) {
         reasoning_content = message.reasoning_content;
       }
@@ -266,7 +263,8 @@ void LlamaCompletionWorker::OnOK() {
     result.Set("tool_calls", tool_calls);
   }
   if (!reasoning_content.empty()) {
-    result.Set("reasoning_content",
+    result.Set("reasoning_content",
+               Napi::String::New(env, reasoning_content.c_str()));
   }
   if (!content.empty()) {
     result.Set("content", Napi::String::New(env, content.c_str()));
@@ -276,17 +274,33 @@ void LlamaCompletionWorker::OnOK() {
   const auto timings_token = llama_perf_context(ctx);

   auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
-  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(),
-
-  timingsResult.Set("
-
-  timingsResult.Set(
-
-
-
+  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                  timings_token.n_p_eval));
+  timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                   timings_token.t_p_eval_ms));
+  timingsResult.Set(
+      "prompt_per_token_ms",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        timings_token.t_p_eval_ms / timings_token.n_p_eval));
+  timingsResult.Set("prompt_per_second",
+                    Napi::Number::New(Napi::AsyncWorker::Env(),
+                                      1e3 / timings_token.t_p_eval_ms *
+                                          timings_token.n_p_eval));
+  timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                     timings_token.n_eval));
+  timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                      timings_token.t_eval_ms));
+  timingsResult.Set(
+      "predicted_per_token_ms",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        timings_token.t_eval_ms / timings_token.n_eval));
+  timingsResult.Set(
+      "predicted_per_second",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        1e3 / timings_token.t_eval_ms * timings_token.n_eval));

   result.Set("timings", timingsResult);
-
+
   Napi::Promise::Deferred::Resolve(result);
 }

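
The sampling loop above is where guide tokens take effect: after each newline token (hard-coded as id 198 in the worker), the next non-control, non-EOG sample is overridden with the next guide token, which keeps the generated speech aligned with the requested words. An illustrative TypeScript model of that rule (not the real implementation, which stays in the C++ worker):

function applyGuideTokens(
  sampled: number[],                  // token ids as the sampler would emit them
  guideTokens: number[],              // from getAudioCompletionGuideTokens()
  isControlOrEog: (id: number) => boolean,
): number[] {
  const out: number[] = []
  const guides = [...guideTokens]
  let useGuide = true                 // mirrors _next_token_uses_guide_token = true
  for (const id of sampled) {
    let next = id
    if (useGuide && guides.length > 0 && !isControlOrEog(id)) {
      next = guides.shift()!          // force the next word token
    }
    useGuide = next === 198           // newline: the following token starts a new word
    out.push(next)
  }
  return out
}
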
package/src/LlamaCompletionWorker.h
CHANGED

@@ -20,19 +20,16 @@ public:
                         Napi::Function callback, common_params params,
                         std::vector<std::string> stop_words,
                         int32_t chat_format,
-                        std::vector<std::string> media_paths = {}
+                        const std::vector<std::string> &media_paths = {},
+                        const std::vector<llama_token> &guide_tokens = {});

   ~LlamaCompletionWorker();

   Napi::Promise GetPromise() { return Napi::Promise::Deferred::Promise(); }

-  void OnComplete(std::function<void()> cb) {
-    _onComplete = cb;
-  }
+  void OnComplete(std::function<void()> cb) { _onComplete = cb; }

-  void SetStop() {
-    _stop = true;
-  }
+  void SetStop() { _stop = true; }

 protected:
   void Execute() override;
@@ -45,10 +42,12 @@ private:
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
   std::vector<std::string> _media_paths;
+  std::vector<llama_token> _guide_tokens;
   std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _stop = false;
   Napi::ThreadSafeFunction _tsfn;
+  bool _next_token_uses_guide_token = true;
   struct {
     size_t tokens_evaluated = 0;
     size_t tokens_predicted = 0;