node-llama-cpp 2.5.1 → 2.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -301
- package/dist/chatWrappers/{ChatMLPromptWrapper.d.ts → ChatMLChatPromptWrapper.d.ts} +1 -1
- package/dist/chatWrappers/{ChatMLPromptWrapper.js → ChatMLChatPromptWrapper.js} +2 -2
- package/dist/chatWrappers/ChatMLChatPromptWrapper.js.map +1 -0
- package/dist/chatWrappers/createChatWrapperByBos.js +2 -2
- package/dist/chatWrappers/createChatWrapperByBos.js.map +1 -1
- package/dist/cli/commands/BuildCommand.js +3 -1
- package/dist/cli/commands/BuildCommand.js.map +1 -1
- package/dist/cli/commands/ChatCommand.d.ts +8 -1
- package/dist/cli/commands/ChatCommand.js +88 -21
- package/dist/cli/commands/ChatCommand.js.map +1 -1
- package/dist/cli/commands/DownloadCommand.d.ts +3 -2
- package/dist/cli/commands/DownloadCommand.js +19 -38
- package/dist/cli/commands/DownloadCommand.js.map +1 -1
- package/dist/config.d.ts +5 -0
- package/dist/config.js +7 -0
- package/dist/config.js.map +1 -1
- package/dist/index.d.ts +5 -4
- package/dist/index.js +3 -2
- package/dist/index.js.map +1 -1
- package/dist/llamaEvaluator/LlamaBins.d.ts +3 -3
- package/dist/llamaEvaluator/LlamaBins.js +2 -2
- package/dist/llamaEvaluator/LlamaBins.js.map +1 -1
- package/dist/llamaEvaluator/LlamaChatSession.d.ts +79 -2
- package/dist/llamaEvaluator/LlamaChatSession.js +52 -8
- package/dist/llamaEvaluator/LlamaChatSession.js.map +1 -1
- package/dist/llamaEvaluator/LlamaContext.d.ts +60 -3
- package/dist/llamaEvaluator/LlamaContext.js +36 -4
- package/dist/llamaEvaluator/LlamaContext.js.map +1 -1
- package/dist/llamaEvaluator/LlamaGrammar.d.ts +16 -3
- package/dist/llamaEvaluator/LlamaGrammar.js +23 -4
- package/dist/llamaEvaluator/LlamaGrammar.js.map +1 -1
- package/dist/llamaEvaluator/LlamaGrammarEvaluationState.d.ts +14 -0
- package/dist/llamaEvaluator/LlamaGrammarEvaluationState.js +16 -0
- package/dist/llamaEvaluator/LlamaGrammarEvaluationState.js.map +1 -0
- package/dist/llamaEvaluator/LlamaModel.d.ts +46 -14
- package/dist/llamaEvaluator/LlamaModel.js +23 -16
- package/dist/llamaEvaluator/LlamaModel.js.map +1 -1
- package/dist/state.d.ts +2 -0
- package/dist/state.js +8 -0
- package/dist/state.js.map +1 -0
- package/dist/utils/cloneLlamaCppRepo.d.ts +1 -0
- package/dist/utils/cloneLlamaCppRepo.js +59 -0
- package/dist/utils/cloneLlamaCppRepo.js.map +1 -0
- package/dist/utils/compileLLamaCpp.js +23 -5
- package/dist/utils/compileLLamaCpp.js.map +1 -1
- package/dist/utils/getBin.d.ts +21 -13
- package/dist/utils/gitReleaseBundles.d.ts +2 -0
- package/dist/utils/gitReleaseBundles.js +64 -0
- package/dist/utils/gitReleaseBundles.js.map +1 -0
- package/llama/addon.cpp +184 -110
- package/llama/binariesGithubRelease.json +1 -1
- package/llama/gitRelease.bundle +0 -0
- package/llama/toolchains/darwin.host-x64.target-arm64.cmake +8 -0
- package/llama/toolchains/linux.host-arm64.target-x64.cmake +5 -0
- package/llama/toolchains/linux.host-x64.target-arm64.cmake +5 -0
- package/llama/toolchains/linux.host-x64.target-arm71.cmake +5 -0
- package/llamaBins/linux-arm64/llama-addon.node +0 -0
- package/llamaBins/linux-armv7l/llama-addon.node +0 -0
- package/llamaBins/linux-x64/llama-addon.node +0 -0
- package/llamaBins/mac-arm64/ggml-metal.metal +258 -85
- package/llamaBins/mac-arm64/llama-addon.node +0 -0
- package/llamaBins/mac-x64/ggml-metal.metal +258 -85
- package/llamaBins/mac-x64/llama-addon.node +0 -0
- package/llamaBins/win-x64/llama-addon.node +0 -0
- package/package.json +10 -4
- package/dist/chatWrappers/ChatMLPromptWrapper.js.map +0 -1
- package/llamaBins/linux-ppc64le/llama-addon.node +0 -0
package/llama/addon.cpp
CHANGED
|
@@ -10,21 +10,11 @@
|
|
|
10
10
|
|
|
11
11
|
class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
|
|
12
12
|
public:
|
|
13
|
-
|
|
13
|
+
llama_model_params model_params;
|
|
14
14
|
llama_model* model;
|
|
15
|
-
float temperature;
|
|
16
|
-
int threads;
|
|
17
|
-
int32_t top_k;
|
|
18
|
-
float top_p;
|
|
19
15
|
|
|
20
16
|
LLAMAModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAModel>(info) {
|
|
21
|
-
|
|
22
|
-
params.seed = -1;
|
|
23
|
-
params.n_ctx = 4096;
|
|
24
|
-
temperature = 0.0f;
|
|
25
|
-
threads = 6;
|
|
26
|
-
top_k = 40;
|
|
27
|
-
top_p = 0.95f;
|
|
17
|
+
model_params = llama_model_default_params();
|
|
28
18
|
|
|
29
19
|
// Get the model path
|
|
30
20
|
std::string modelPath = info[0].As<Napi::String>().Utf8Value();
|
|
@@ -32,69 +22,25 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
|
|
|
32
22
|
if (info.Length() > 1 && info[1].IsObject()) {
|
|
33
23
|
Napi::Object options = info[1].As<Napi::Object>();
|
|
34
24
|
|
|
35
|
-
if (options.Has("seed")) {
|
|
36
|
-
params.seed = options.Get("seed").As<Napi::Number>().Int32Value();
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
if (options.Has("contextSize")) {
|
|
40
|
-
params.n_ctx = options.Get("contextSize").As<Napi::Number>().Int32Value();
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
if (options.Has("batchSize")) {
|
|
44
|
-
params.n_batch = options.Get("batchSize").As<Napi::Number>().Int32Value();
|
|
45
|
-
}
|
|
46
|
-
|
|
47
25
|
if (options.Has("gpuLayers")) {
|
|
48
|
-
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
if (options.Has("lowVram")) {
|
|
52
|
-
params.low_vram = options.Get("lowVram").As<Napi::Boolean>().Value();
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
if (options.Has("f16Kv")) {
|
|
56
|
-
params.f16_kv = options.Get("f16Kv").As<Napi::Boolean>().Value();
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
if (options.Has("logitsAll")) {
|
|
60
|
-
params.logits_all = options.Get("logitsAll").As<Napi::Boolean>().Value();
|
|
26
|
+
model_params.n_gpu_layers = options.Get("gpuLayers").As<Napi::Number>().Int32Value();
|
|
61
27
|
}
|
|
62
28
|
|
|
63
29
|
if (options.Has("vocabOnly")) {
|
|
64
|
-
|
|
30
|
+
model_params.vocab_only = options.Get("vocabOnly").As<Napi::Boolean>().Value();
|
|
65
31
|
}
|
|
66
32
|
|
|
67
33
|
if (options.Has("useMmap")) {
|
|
68
|
-
|
|
34
|
+
model_params.use_mmap = options.Get("useMmap").As<Napi::Boolean>().Value();
|
|
69
35
|
}
|
|
70
36
|
|
|
71
37
|
if (options.Has("useMlock")) {
|
|
72
|
-
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
if (options.Has("embedding")) {
|
|
76
|
-
params.embedding = options.Get("embedding").As<Napi::Boolean>().Value();
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
if (options.Has("threads")) {
|
|
80
|
-
threads = options.Get("threads").As<Napi::Number>().Int32Value();
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
if (options.Has("temperature")) {
|
|
84
|
-
temperature = options.Get("temperature").As<Napi::Number>().FloatValue();
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
if (options.Has("topK")) {
|
|
88
|
-
top_k = options.Get("topK").As<Napi::Number>().Int32Value();
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
if (options.Has("topP")) {
|
|
92
|
-
top_p = options.Get("topP").As<Napi::Number>().FloatValue();
|
|
38
|
+
model_params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value();
|
|
93
39
|
}
|
|
94
40
|
}
|
|
95
41
|
|
|
96
42
|
llama_backend_init(false);
|
|
97
|
-
model = llama_load_model_from_file(modelPath.c_str(),
|
|
43
|
+
model = llama_load_model_from_file(modelPath.c_str(), model_params);
|
|
98
44
|
|
|
99
45
|
if (model == NULL) {
|
|
100
46
|
Napi::Error::New(info.Env(), "Failed to load model").ThrowAsJavaScriptException();
|
|
@@ -114,7 +60,6 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
|
|
|
114
60
|
class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> {
|
|
115
61
|
public:
|
|
116
62
|
grammar_parser::parse_state parsed_grammar;
|
|
117
|
-
llama_grammar *grammar = nullptr;
|
|
118
63
|
|
|
119
64
|
LLAMAGrammar(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAGrammar>(info) {
|
|
120
65
|
// Get the model path
|
|
@@ -139,13 +84,31 @@ class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> {
|
|
|
139
84
|
if (should_print_grammar) {
|
|
140
85
|
grammar_parser::print_grammar(stderr, parsed_grammar);
|
|
141
86
|
}
|
|
87
|
+
}
|
|
142
88
|
|
|
143
|
-
|
|
89
|
+
static void init(Napi::Object exports) {
|
|
90
|
+
exports.Set("LLAMAGrammar", DefineClass(exports.Env(), "LLAMAGrammar", {}));
|
|
91
|
+
}
|
|
92
|
+
};
|
|
93
|
+
|
|
94
|
+
class LLAMAGrammarEvaluationState : public Napi::ObjectWrap<LLAMAGrammarEvaluationState> {
|
|
95
|
+
public:
|
|
96
|
+
LLAMAGrammar* grammarDef;
|
|
97
|
+
llama_grammar *grammar = nullptr;
|
|
98
|
+
|
|
99
|
+
LLAMAGrammarEvaluationState(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAGrammarEvaluationState>(info) {
|
|
100
|
+
grammarDef = Napi::ObjectWrap<LLAMAGrammar>::Unwrap(info[0].As<Napi::Object>());
|
|
101
|
+
grammarDef->Ref();
|
|
102
|
+
|
|
103
|
+
std::vector<const llama_grammar_element *> grammar_rules(grammarDef->parsed_grammar.c_rules());
|
|
144
104
|
grammar = llama_grammar_init(
|
|
145
|
-
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")
|
|
105
|
+
grammar_rules.data(), grammar_rules.size(), grammarDef->parsed_grammar.symbol_ids.at("root")
|
|
106
|
+
);
|
|
146
107
|
}
|
|
147
108
|
|
|
148
|
-
~
|
|
109
|
+
~LLAMAGrammarEvaluationState() {
|
|
110
|
+
grammarDef->Unref();
|
|
111
|
+
|
|
149
112
|
if (grammar != nullptr) {
|
|
150
113
|
llama_grammar_free(grammar);
|
|
151
114
|
grammar = nullptr;
|
|
@@ -153,42 +116,67 @@ class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> {
|
|
|
153
116
|
}
|
|
154
117
|
|
|
155
118
|
static void init(Napi::Object exports) {
|
|
156
|
-
exports.Set("
|
|
119
|
+
exports.Set("LLAMAGrammarEvaluationState", DefineClass(exports.Env(), "LLAMAGrammarEvaluationState", {}));
|
|
157
120
|
}
|
|
158
121
|
};
|
|
159
122
|
|
|
160
123
|
class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> {
|
|
161
124
|
public:
|
|
162
125
|
LLAMAModel* model;
|
|
126
|
+
llama_context_params context_params;
|
|
163
127
|
llama_context* ctx;
|
|
164
|
-
|
|
165
|
-
bool use_grammar = false;
|
|
128
|
+
int n_cur = 0;
|
|
166
129
|
|
|
167
130
|
LLAMAContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAContext>(info) {
|
|
168
131
|
model = Napi::ObjectWrap<LLAMAModel>::Unwrap(info[0].As<Napi::Object>());
|
|
169
132
|
model->Ref();
|
|
170
|
-
|
|
171
|
-
|
|
133
|
+
|
|
134
|
+
context_params = llama_context_default_params();
|
|
135
|
+
context_params.seed = -1;
|
|
136
|
+
context_params.n_ctx = 4096;
|
|
137
|
+
context_params.n_threads = 6;
|
|
138
|
+
context_params.n_threads_batch == -1 ? context_params.n_threads : context_params.n_threads_batch;
|
|
172
139
|
|
|
173
140
|
if (info.Length() > 1 && info[1].IsObject()) {
|
|
174
|
-
|
|
141
|
+
Napi::Object options = info[1].As<Napi::Object>();
|
|
175
142
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
143
|
+
if (options.Has("seed")) {
|
|
144
|
+
context_params.seed = options.Get("seed").As<Napi::Number>().Int32Value();
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
if (options.Has("contextSize")) {
|
|
148
|
+
context_params.n_ctx = options.Get("contextSize").As<Napi::Number>().Int32Value();
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
if (options.Has("batchSize")) {
|
|
152
|
+
context_params.n_batch = options.Get("batchSize").As<Napi::Number>().Int32Value();
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
if (options.Has("f16Kv")) {
|
|
156
|
+
context_params.f16_kv = options.Get("f16Kv").As<Napi::Boolean>().Value();
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
if (options.Has("logitsAll")) {
|
|
160
|
+
context_params.logits_all = options.Get("logitsAll").As<Napi::Boolean>().Value();
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
if (options.Has("embedding")) {
|
|
164
|
+
context_params.embedding = options.Get("embedding").As<Napi::Boolean>().Value();
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
if (options.Has("threads")) {
|
|
168
|
+
context_params.n_threads = options.Get("threads").As<Napi::Number>().Int32Value();
|
|
169
|
+
context_params.n_threads_batch == -1 ? context_params.n_threads : context_params.n_threads_batch;
|
|
170
|
+
}
|
|
181
171
|
}
|
|
172
|
+
|
|
173
|
+
ctx = llama_new_context_with_model(model->model, context_params);
|
|
174
|
+
Napi::MemoryManagement::AdjustExternalMemory(Env(), llama_get_state_size(ctx));
|
|
182
175
|
}
|
|
183
176
|
~LLAMAContext() {
|
|
184
177
|
Napi::MemoryManagement::AdjustExternalMemory(Env(), -(int64_t)llama_get_state_size(ctx));
|
|
185
178
|
llama_free(ctx);
|
|
186
179
|
model->Unref();
|
|
187
|
-
|
|
188
|
-
if (use_grammar) {
|
|
189
|
-
grammar->Unref();
|
|
190
|
-
use_grammar = false;
|
|
191
|
-
}
|
|
192
180
|
}
|
|
193
181
|
Napi::Value Encode(const Napi::CallbackInfo& info) {
|
|
194
182
|
std::string text = info[0].As<Napi::String>().Utf8Value();
|
|
@@ -265,34 +253,124 @@ class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> {
|
|
|
265
253
|
|
|
266
254
|
class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
|
|
267
255
|
LLAMAContext* ctx;
|
|
256
|
+
LLAMAGrammarEvaluationState* grammar_evaluation_state;
|
|
257
|
+
bool use_grammar = false;
|
|
268
258
|
std::vector<llama_token> tokens;
|
|
269
259
|
llama_token result;
|
|
260
|
+
float temperature;
|
|
261
|
+
int32_t top_k;
|
|
262
|
+
float top_p;
|
|
263
|
+
float repeat_penalty = 1.10f; // 1.0 = disabled
|
|
264
|
+
float repeat_penalty_presence_penalty = 0.00f; // 0.0 = disabled
|
|
265
|
+
float repeat_penalty_frequency_penalty = 0.00f; // 0.0 = disabled
|
|
266
|
+
std::vector<llama_token> repeat_penalty_tokens;
|
|
267
|
+
bool use_repeat_penalty = false;
|
|
270
268
|
|
|
271
269
|
public:
|
|
272
270
|
LLAMAContextEvalWorker(const Napi::CallbackInfo& info, LLAMAContext* ctx) : Napi::AsyncWorker(info.Env(), "LLAMAContextEvalWorker"), ctx(ctx), Napi::Promise::Deferred(info.Env()) {
|
|
273
271
|
ctx->Ref();
|
|
274
272
|
Napi::Uint32Array tokens = info[0].As<Napi::Uint32Array>();
|
|
273
|
+
|
|
274
|
+
temperature = 0.0f;
|
|
275
|
+
top_k = 40;
|
|
276
|
+
top_p = 0.95f;
|
|
277
|
+
|
|
278
|
+
if (info.Length() > 1 && info[1].IsObject()) {
|
|
279
|
+
Napi::Object options = info[1].As<Napi::Object>();
|
|
280
|
+
|
|
281
|
+
if (options.Has("temperature")) {
|
|
282
|
+
temperature = options.Get("temperature").As<Napi::Number>().FloatValue();
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
if (options.Has("topK")) {
|
|
286
|
+
top_k = options.Get("topK").As<Napi::Number>().Int32Value();
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
if (options.Has("topP")) {
|
|
290
|
+
top_p = options.Get("topP").As<Napi::Number>().FloatValue();
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
if (options.Has("repeatPenalty")) {
|
|
294
|
+
repeat_penalty = options.Get("repeatPenalty").As<Napi::Number>().FloatValue();
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
if (options.Has("repeatPenaltyTokens")) {
|
|
298
|
+
Napi::Uint32Array repeat_penalty_tokens_uint32_array = options.Get("repeatPenaltyTokens").As<Napi::Uint32Array>();
|
|
299
|
+
|
|
300
|
+
repeat_penalty_tokens.reserve(repeat_penalty_tokens_uint32_array.ElementLength());
|
|
301
|
+
for (size_t i = 0; i < repeat_penalty_tokens_uint32_array.ElementLength(); i++) {
|
|
302
|
+
repeat_penalty_tokens.push_back(static_cast<llama_token>(repeat_penalty_tokens_uint32_array[i]));
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
use_repeat_penalty = true;
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
if (options.Has("repeatPenaltyPresencePenalty")) {
|
|
309
|
+
repeat_penalty_presence_penalty = options.Get("repeatPenaltyPresencePenalty").As<Napi::Number>().FloatValue();
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
if (options.Has("repeatPenaltyFrequencyPenalty")) {
|
|
313
|
+
repeat_penalty_frequency_penalty = options.Get("repeatPenaltyFrequencyPenalty").As<Napi::Number>().FloatValue();
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
if (options.Has("grammarEvaluationState")) {
|
|
317
|
+
grammar_evaluation_state = Napi::ObjectWrap<LLAMAGrammarEvaluationState>::Unwrap(options.Get("grammarEvaluationState").As<Napi::Object>());
|
|
318
|
+
grammar_evaluation_state->Ref();
|
|
319
|
+
use_grammar = true;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
|
|
275
323
|
this->tokens.reserve(tokens.ElementLength());
|
|
276
324
|
for (size_t i = 0; i < tokens.ElementLength(); i++) { this->tokens.push_back(static_cast<llama_token>(tokens[i])); }
|
|
277
325
|
}
|
|
278
|
-
~LLAMAContextEvalWorker() {
|
|
326
|
+
~LLAMAContextEvalWorker() {
|
|
327
|
+
ctx->Unref();
|
|
328
|
+
|
|
329
|
+
if (use_grammar) {
|
|
330
|
+
grammar_evaluation_state->Unref();
|
|
331
|
+
use_grammar = false;
|
|
332
|
+
}
|
|
333
|
+
}
|
|
279
334
|
using Napi::AsyncWorker::Queue;
|
|
280
335
|
using Napi::Promise::Deferred::Promise;
|
|
281
336
|
|
|
282
337
|
protected:
|
|
283
338
|
void Execute() {
|
|
284
|
-
|
|
285
|
-
|
|
339
|
+
llama_batch batch = llama_batch_init(tokens.size(), 0);
|
|
340
|
+
|
|
341
|
+
batch.n_tokens = tokens.size();
|
|
342
|
+
|
|
343
|
+
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
|
344
|
+
batch.token[i] = tokens[i];
|
|
345
|
+
batch.pos[i] = ctx->n_cur;
|
|
346
|
+
batch.seq_id[i] = 0;
|
|
347
|
+
batch.logits[i] = false;
|
|
348
|
+
|
|
349
|
+
ctx->n_cur++;
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
batch.logits[batch.n_tokens - 1] = true;
|
|
353
|
+
|
|
354
|
+
// Perform the evaluation using llama_decode.
|
|
355
|
+
int r = llama_decode(ctx->ctx, batch);
|
|
356
|
+
|
|
357
|
+
llama_batch_free(batch);
|
|
358
|
+
|
|
286
359
|
if (r != 0) {
|
|
287
|
-
|
|
360
|
+
if (r == 1) {
|
|
361
|
+
SetError("could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
|
|
362
|
+
} else {
|
|
363
|
+
SetError("Eval has failed");
|
|
364
|
+
}
|
|
365
|
+
|
|
288
366
|
return;
|
|
289
367
|
}
|
|
290
368
|
|
|
291
369
|
llama_token new_token_id = 0;
|
|
292
370
|
|
|
293
371
|
// Select the best prediction.
|
|
294
|
-
auto logits =
|
|
295
|
-
auto n_vocab = llama_n_vocab(ctx->
|
|
372
|
+
auto logits = llama_get_logits_ith(ctx->ctx, batch.n_tokens - 1);
|
|
373
|
+
auto n_vocab = llama_n_vocab(ctx->model->model);
|
|
296
374
|
|
|
297
375
|
std::vector<llama_token_data> candidates;
|
|
298
376
|
candidates.reserve(n_vocab);
|
|
@@ -303,48 +381,43 @@ class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
|
|
|
303
381
|
|
|
304
382
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
305
383
|
|
|
306
|
-
float originalEosLogit = 0;
|
|
307
384
|
auto eos_token = llama_token_eos(ctx->ctx);
|
|
308
385
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
llama_sample_grammar(ctx->ctx, &candidates_p, (ctx->grammar)->grammar);
|
|
386
|
+
if (use_repeat_penalty && !repeat_penalty_tokens.empty()) {
|
|
387
|
+
llama_sample_repetition_penalty(
|
|
388
|
+
ctx->ctx, &candidates_p, repeat_penalty_tokens.data(), repeat_penalty_tokens.size(), repeat_penalty
|
|
389
|
+
);
|
|
390
|
+
llama_sample_frequency_and_presence_penalties(
|
|
391
|
+
ctx->ctx, &candidates_p, repeat_penalty_tokens.data(), repeat_penalty_tokens.size(),
|
|
392
|
+
repeat_penalty_frequency_penalty, repeat_penalty_presence_penalty
|
|
393
|
+
);
|
|
318
394
|
}
|
|
319
395
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
candidate.logit = originalEosLogit;
|
|
323
|
-
break;
|
|
324
|
-
}
|
|
396
|
+
if (use_grammar && (grammar_evaluation_state)->grammar != nullptr) {
|
|
397
|
+
llama_sample_grammar(ctx->ctx, &candidates_p, (grammar_evaluation_state)->grammar);
|
|
325
398
|
}
|
|
326
399
|
|
|
327
|
-
if (
|
|
400
|
+
if (temperature <= 0) {
|
|
328
401
|
new_token_id = llama_sample_token_greedy(ctx->ctx , &candidates_p);
|
|
329
402
|
} else {
|
|
330
|
-
const int32_t
|
|
403
|
+
const int32_t resolved_top_k = top_k <= 0 ? llama_n_vocab(ctx->model->model) : std::min(top_k, llama_n_vocab(ctx->model->model));
|
|
331
404
|
const int32_t n_probs = 0; // Number of probabilities to keep - 0 = disabled
|
|
332
405
|
const float tfs_z = 1.00f; // Tail free sampling - 1.0 = disabled
|
|
333
406
|
const float typical_p = 1.00f; // Typical probability - 1.0 = disabled
|
|
334
|
-
const float
|
|
407
|
+
const float resolved_top_p = top_p; // Top p sampling - 1.0 = disabled
|
|
335
408
|
|
|
336
409
|
// Temperature sampling
|
|
337
410
|
size_t min_keep = std::max(1, n_probs);
|
|
338
|
-
llama_sample_top_k(ctx->ctx, &candidates_p,
|
|
411
|
+
llama_sample_top_k(ctx->ctx, &candidates_p, resolved_top_k, min_keep);
|
|
339
412
|
llama_sample_tail_free(ctx->ctx, &candidates_p, tfs_z, min_keep);
|
|
340
413
|
llama_sample_typical(ctx->ctx, &candidates_p, typical_p, min_keep);
|
|
341
|
-
llama_sample_top_p(ctx->ctx, &candidates_p,
|
|
342
|
-
llama_sample_temperature(ctx->ctx, &candidates_p,
|
|
414
|
+
llama_sample_top_p(ctx->ctx, &candidates_p, resolved_top_p, min_keep);
|
|
415
|
+
llama_sample_temperature(ctx->ctx, &candidates_p, temperature);
|
|
343
416
|
new_token_id = llama_sample_token(ctx->ctx, &candidates_p);
|
|
344
417
|
}
|
|
345
418
|
|
|
346
|
-
if (new_token_id != eos_token &&
|
|
347
|
-
llama_grammar_accept_token(ctx->ctx, (
|
|
419
|
+
if (new_token_id != eos_token && use_grammar && (grammar_evaluation_state)->grammar != nullptr) {
|
|
420
|
+
llama_grammar_accept_token(ctx->ctx, (grammar_evaluation_state)->grammar, new_token_id);
|
|
348
421
|
}
|
|
349
422
|
|
|
350
423
|
result = new_token_id;
|
|
@@ -372,6 +445,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
|
|
|
372
445
|
});
|
|
373
446
|
LLAMAModel::init(exports);
|
|
374
447
|
LLAMAGrammar::init(exports);
|
|
448
|
+
LLAMAGrammarEvaluationState::init(exports);
|
|
375
449
|
LLAMAContext::init(exports);
|
|
376
450
|
return exports;
|
|
377
451
|
}
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|