node-llama-cpp 2.5.1 → 2.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68):
  1. package/README.md +20 -301
  2. package/dist/chatWrappers/{ChatMLPromptWrapper.d.ts → ChatMLChatPromptWrapper.d.ts} +1 -1
  3. package/dist/chatWrappers/{ChatMLPromptWrapper.js → ChatMLChatPromptWrapper.js} +2 -2
  4. package/dist/chatWrappers/ChatMLChatPromptWrapper.js.map +1 -0
  5. package/dist/chatWrappers/createChatWrapperByBos.js +2 -2
  6. package/dist/chatWrappers/createChatWrapperByBos.js.map +1 -1
  7. package/dist/cli/commands/BuildCommand.js +3 -1
  8. package/dist/cli/commands/BuildCommand.js.map +1 -1
  9. package/dist/cli/commands/ChatCommand.d.ts +8 -1
  10. package/dist/cli/commands/ChatCommand.js +88 -21
  11. package/dist/cli/commands/ChatCommand.js.map +1 -1
  12. package/dist/cli/commands/DownloadCommand.d.ts +3 -2
  13. package/dist/cli/commands/DownloadCommand.js +19 -38
  14. package/dist/cli/commands/DownloadCommand.js.map +1 -1
  15. package/dist/config.d.ts +5 -0
  16. package/dist/config.js +7 -0
  17. package/dist/config.js.map +1 -1
  18. package/dist/index.d.ts +5 -4
  19. package/dist/index.js +3 -2
  20. package/dist/index.js.map +1 -1
  21. package/dist/llamaEvaluator/LlamaBins.d.ts +3 -3
  22. package/dist/llamaEvaluator/LlamaBins.js +2 -2
  23. package/dist/llamaEvaluator/LlamaBins.js.map +1 -1
  24. package/dist/llamaEvaluator/LlamaChatSession.d.ts +79 -2
  25. package/dist/llamaEvaluator/LlamaChatSession.js +52 -8
  26. package/dist/llamaEvaluator/LlamaChatSession.js.map +1 -1
  27. package/dist/llamaEvaluator/LlamaContext.d.ts +60 -3
  28. package/dist/llamaEvaluator/LlamaContext.js +36 -4
  29. package/dist/llamaEvaluator/LlamaContext.js.map +1 -1
  30. package/dist/llamaEvaluator/LlamaGrammar.d.ts +16 -3
  31. package/dist/llamaEvaluator/LlamaGrammar.js +23 -4
  32. package/dist/llamaEvaluator/LlamaGrammar.js.map +1 -1
  33. package/dist/llamaEvaluator/LlamaGrammarEvaluationState.d.ts +14 -0
  34. package/dist/llamaEvaluator/LlamaGrammarEvaluationState.js +16 -0
  35. package/dist/llamaEvaluator/LlamaGrammarEvaluationState.js.map +1 -0
  36. package/dist/llamaEvaluator/LlamaModel.d.ts +46 -14
  37. package/dist/llamaEvaluator/LlamaModel.js +23 -16
  38. package/dist/llamaEvaluator/LlamaModel.js.map +1 -1
  39. package/dist/state.d.ts +2 -0
  40. package/dist/state.js +8 -0
  41. package/dist/state.js.map +1 -0
  42. package/dist/utils/cloneLlamaCppRepo.d.ts +1 -0
  43. package/dist/utils/cloneLlamaCppRepo.js +59 -0
  44. package/dist/utils/cloneLlamaCppRepo.js.map +1 -0
  45. package/dist/utils/compileLLamaCpp.js +23 -5
  46. package/dist/utils/compileLLamaCpp.js.map +1 -1
  47. package/dist/utils/getBin.d.ts +21 -13
  48. package/dist/utils/gitReleaseBundles.d.ts +2 -0
  49. package/dist/utils/gitReleaseBundles.js +64 -0
  50. package/dist/utils/gitReleaseBundles.js.map +1 -0
  51. package/llama/addon.cpp +184 -110
  52. package/llama/binariesGithubRelease.json +1 -1
  53. package/llama/gitRelease.bundle +0 -0
  54. package/llama/toolchains/darwin.host-x64.target-arm64.cmake +8 -0
  55. package/llama/toolchains/linux.host-arm64.target-x64.cmake +5 -0
  56. package/llama/toolchains/linux.host-x64.target-arm64.cmake +5 -0
  57. package/llama/toolchains/linux.host-x64.target-arm71.cmake +5 -0
  58. package/llamaBins/linux-arm64/llama-addon.node +0 -0
  59. package/llamaBins/linux-armv7l/llama-addon.node +0 -0
  60. package/llamaBins/linux-x64/llama-addon.node +0 -0
  61. package/llamaBins/mac-arm64/ggml-metal.metal +258 -85
  62. package/llamaBins/mac-arm64/llama-addon.node +0 -0
  63. package/llamaBins/mac-x64/ggml-metal.metal +258 -85
  64. package/llamaBins/mac-x64/llama-addon.node +0 -0
  65. package/llamaBins/win-x64/llama-addon.node +0 -0
  66. package/package.json +10 -4
  67. package/dist/chatWrappers/ChatMLPromptWrapper.js.map +0 -1
  68. package/llamaBins/linux-ppc64le/llama-addon.node +0 -0
package/llama/addon.cpp CHANGED
@@ -10,21 +10,11 @@
10
10
 
11
11
  class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
12
12
  public:
13
- llama_context_params params;
13
+ llama_model_params model_params;
14
14
  llama_model* model;
15
- float temperature;
16
- int threads;
17
- int32_t top_k;
18
- float top_p;
19
15
 
20
16
  LLAMAModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAModel>(info) {
21
- params = llama_context_default_params();
22
- params.seed = -1;
23
- params.n_ctx = 4096;
24
- temperature = 0.0f;
25
- threads = 6;
26
- top_k = 40;
27
- top_p = 0.95f;
17
+ model_params = llama_model_default_params();
28
18
 
29
19
  // Get the model path
30
20
  std::string modelPath = info[0].As<Napi::String>().Utf8Value();
@@ -32,69 +22,25 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
32
22
  if (info.Length() > 1 && info[1].IsObject()) {
33
23
  Napi::Object options = info[1].As<Napi::Object>();
34
24
 
35
- if (options.Has("seed")) {
36
- params.seed = options.Get("seed").As<Napi::Number>().Int32Value();
37
- }
38
-
39
- if (options.Has("contextSize")) {
40
- params.n_ctx = options.Get("contextSize").As<Napi::Number>().Int32Value();
41
- }
42
-
43
- if (options.Has("batchSize")) {
44
- params.n_batch = options.Get("batchSize").As<Napi::Number>().Int32Value();
45
- }
46
-
47
25
  if (options.Has("gpuLayers")) {
48
- params.n_gpu_layers = options.Get("gpuLayers").As<Napi::Number>().Int32Value();
49
- }
50
-
51
- if (options.Has("lowVram")) {
52
- params.low_vram = options.Get("lowVram").As<Napi::Boolean>().Value();
53
- }
54
-
55
- if (options.Has("f16Kv")) {
56
- params.f16_kv = options.Get("f16Kv").As<Napi::Boolean>().Value();
57
- }
58
-
59
- if (options.Has("logitsAll")) {
60
- params.logits_all = options.Get("logitsAll").As<Napi::Boolean>().Value();
26
+ model_params.n_gpu_layers = options.Get("gpuLayers").As<Napi::Number>().Int32Value();
61
27
  }
62
28
 
63
29
  if (options.Has("vocabOnly")) {
64
- params.vocab_only = options.Get("vocabOnly").As<Napi::Boolean>().Value();
30
+ model_params.vocab_only = options.Get("vocabOnly").As<Napi::Boolean>().Value();
65
31
  }
66
32
 
67
33
  if (options.Has("useMmap")) {
68
- params.use_mmap = options.Get("useMmap").As<Napi::Boolean>().Value();
34
+ model_params.use_mmap = options.Get("useMmap").As<Napi::Boolean>().Value();
69
35
  }
70
36
 
71
37
  if (options.Has("useMlock")) {
72
- params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value();
73
- }
74
-
75
- if (options.Has("embedding")) {
76
- params.embedding = options.Get("embedding").As<Napi::Boolean>().Value();
77
- }
78
-
79
- if (options.Has("threads")) {
80
- threads = options.Get("threads").As<Napi::Number>().Int32Value();
81
- }
82
-
83
- if (options.Has("temperature")) {
84
- temperature = options.Get("temperature").As<Napi::Number>().FloatValue();
85
- }
86
-
87
- if (options.Has("topK")) {
88
- top_k = options.Get("topK").As<Napi::Number>().Int32Value();
89
- }
90
-
91
- if (options.Has("topP")) {
92
- top_p = options.Get("topP").As<Napi::Number>().FloatValue();
38
+ model_params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value();
93
39
  }
94
40
  }
95
41
 
96
42
  llama_backend_init(false);
97
- model = llama_load_model_from_file(modelPath.c_str(), params);
43
+ model = llama_load_model_from_file(modelPath.c_str(), model_params);
98
44
 
99
45
  if (model == NULL) {
100
46
  Napi::Error::New(info.Env(), "Failed to load model").ThrowAsJavaScriptException();
@@ -114,7 +60,6 @@ class LLAMAModel : public Napi::ObjectWrap<LLAMAModel> {
114
60
  class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> {
115
61
  public:
116
62
  grammar_parser::parse_state parsed_grammar;
117
- llama_grammar *grammar = nullptr;
118
63
 
119
64
  LLAMAGrammar(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAGrammar>(info) {
120
65
  // Get the model path
@@ -139,13 +84,31 @@ class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> {
139
84
  if (should_print_grammar) {
140
85
  grammar_parser::print_grammar(stderr, parsed_grammar);
141
86
  }
87
+ }
142
88
 
143
- std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
89
+ static void init(Napi::Object exports) {
90
+ exports.Set("LLAMAGrammar", DefineClass(exports.Env(), "LLAMAGrammar", {}));
91
+ }
92
+ };
93
+
94
+ class LLAMAGrammarEvaluationState : public Napi::ObjectWrap<LLAMAGrammarEvaluationState> {
95
+ public:
96
+ LLAMAGrammar* grammarDef;
97
+ llama_grammar *grammar = nullptr;
98
+
99
+ LLAMAGrammarEvaluationState(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAGrammarEvaluationState>(info) {
100
+ grammarDef = Napi::ObjectWrap<LLAMAGrammar>::Unwrap(info[0].As<Napi::Object>());
101
+ grammarDef->Ref();
102
+
103
+ std::vector<const llama_grammar_element *> grammar_rules(grammarDef->parsed_grammar.c_rules());
144
104
  grammar = llama_grammar_init(
145
- grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
105
+ grammar_rules.data(), grammar_rules.size(), grammarDef->parsed_grammar.symbol_ids.at("root")
106
+ );
146
107
  }
147
108
 
148
- ~LLAMAGrammar() {
109
+ ~LLAMAGrammarEvaluationState() {
110
+ grammarDef->Unref();
111
+
149
112
  if (grammar != nullptr) {
150
113
  llama_grammar_free(grammar);
151
114
  grammar = nullptr;
@@ -153,42 +116,67 @@ class LLAMAGrammar : public Napi::ObjectWrap<LLAMAGrammar> {
153
116
  }
154
117
 
155
118
  static void init(Napi::Object exports) {
156
- exports.Set("LLAMAGrammar", DefineClass(exports.Env(), "LLAMAGrammar", {}));
119
+ exports.Set("LLAMAGrammarEvaluationState", DefineClass(exports.Env(), "LLAMAGrammarEvaluationState", {}));
157
120
  }
158
121
  };
159
122
 
160
123
  class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> {
161
124
  public:
162
125
  LLAMAModel* model;
126
+ llama_context_params context_params;
163
127
  llama_context* ctx;
164
- LLAMAGrammar* grammar;
165
- bool use_grammar = false;
128
+ int n_cur = 0;
166
129
 
167
130
  LLAMAContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<LLAMAContext>(info) {
168
131
  model = Napi::ObjectWrap<LLAMAModel>::Unwrap(info[0].As<Napi::Object>());
169
132
  model->Ref();
170
- ctx = llama_new_context_with_model(model->model, model->params);
171
- Napi::MemoryManagement::AdjustExternalMemory(Env(), llama_get_state_size(ctx));
133
+
134
+ context_params = llama_context_default_params();
135
+ context_params.seed = -1;
136
+ context_params.n_ctx = 4096;
137
+ context_params.n_threads = 6;
138
+ context_params.n_threads_batch == -1 ? context_params.n_threads : context_params.n_threads_batch;
172
139
 
173
140
  if (info.Length() > 1 && info[1].IsObject()) {
174
- Napi::Object options = info[1].As<Napi::Object>();
141
+ Napi::Object options = info[1].As<Napi::Object>();
175
142
 
176
- if (options.Has("grammar")) {
177
- grammar = Napi::ObjectWrap<LLAMAGrammar>::Unwrap(options.Get("grammar").As<Napi::Object>());
178
- grammar->Ref();
179
- use_grammar = true;
180
- }
143
+ if (options.Has("seed")) {
144
+ context_params.seed = options.Get("seed").As<Napi::Number>().Int32Value();
145
+ }
146
+
147
+ if (options.Has("contextSize")) {
148
+ context_params.n_ctx = options.Get("contextSize").As<Napi::Number>().Int32Value();
149
+ }
150
+
151
+ if (options.Has("batchSize")) {
152
+ context_params.n_batch = options.Get("batchSize").As<Napi::Number>().Int32Value();
153
+ }
154
+
155
+ if (options.Has("f16Kv")) {
156
+ context_params.f16_kv = options.Get("f16Kv").As<Napi::Boolean>().Value();
157
+ }
158
+
159
+ if (options.Has("logitsAll")) {
160
+ context_params.logits_all = options.Get("logitsAll").As<Napi::Boolean>().Value();
161
+ }
162
+
163
+ if (options.Has("embedding")) {
164
+ context_params.embedding = options.Get("embedding").As<Napi::Boolean>().Value();
165
+ }
166
+
167
+ if (options.Has("threads")) {
168
+ context_params.n_threads = options.Get("threads").As<Napi::Number>().Int32Value();
169
+ context_params.n_threads_batch == -1 ? context_params.n_threads : context_params.n_threads_batch;
170
+ }
181
171
  }
172
+
173
+ ctx = llama_new_context_with_model(model->model, context_params);
174
+ Napi::MemoryManagement::AdjustExternalMemory(Env(), llama_get_state_size(ctx));
182
175
  }
183
176
  ~LLAMAContext() {
184
177
  Napi::MemoryManagement::AdjustExternalMemory(Env(), -(int64_t)llama_get_state_size(ctx));
185
178
  llama_free(ctx);
186
179
  model->Unref();
187
-
188
- if (use_grammar) {
189
- grammar->Unref();
190
- use_grammar = false;
191
- }
192
180
  }
193
181
  Napi::Value Encode(const Napi::CallbackInfo& info) {
194
182
  std::string text = info[0].As<Napi::String>().Utf8Value();
@@ -265,34 +253,124 @@ class LLAMAContext : public Napi::ObjectWrap<LLAMAContext> {
265
253
 
266
254
  class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
267
255
  LLAMAContext* ctx;
256
+ LLAMAGrammarEvaluationState* grammar_evaluation_state;
257
+ bool use_grammar = false;
268
258
  std::vector<llama_token> tokens;
269
259
  llama_token result;
260
+ float temperature;
261
+ int32_t top_k;
262
+ float top_p;
263
+ float repeat_penalty = 1.10f; // 1.0 = disabled
264
+ float repeat_penalty_presence_penalty = 0.00f; // 0.0 = disabled
265
+ float repeat_penalty_frequency_penalty = 0.00f; // 0.0 = disabled
266
+ std::vector<llama_token> repeat_penalty_tokens;
267
+ bool use_repeat_penalty = false;
270
268
 
271
269
  public:
272
270
  LLAMAContextEvalWorker(const Napi::CallbackInfo& info, LLAMAContext* ctx) : Napi::AsyncWorker(info.Env(), "LLAMAContextEvalWorker"), ctx(ctx), Napi::Promise::Deferred(info.Env()) {
273
271
  ctx->Ref();
274
272
  Napi::Uint32Array tokens = info[0].As<Napi::Uint32Array>();
273
+
274
+ temperature = 0.0f;
275
+ top_k = 40;
276
+ top_p = 0.95f;
277
+
278
+ if (info.Length() > 1 && info[1].IsObject()) {
279
+ Napi::Object options = info[1].As<Napi::Object>();
280
+
281
+ if (options.Has("temperature")) {
282
+ temperature = options.Get("temperature").As<Napi::Number>().FloatValue();
283
+ }
284
+
285
+ if (options.Has("topK")) {
286
+ top_k = options.Get("topK").As<Napi::Number>().Int32Value();
287
+ }
288
+
289
+ if (options.Has("topP")) {
290
+ top_p = options.Get("topP").As<Napi::Number>().FloatValue();
291
+ }
292
+
293
+ if (options.Has("repeatPenalty")) {
294
+ repeat_penalty = options.Get("repeatPenalty").As<Napi::Number>().FloatValue();
295
+ }
296
+
297
+ if (options.Has("repeatPenaltyTokens")) {
298
+ Napi::Uint32Array repeat_penalty_tokens_uint32_array = options.Get("repeatPenaltyTokens").As<Napi::Uint32Array>();
299
+
300
+ repeat_penalty_tokens.reserve(repeat_penalty_tokens_uint32_array.ElementLength());
301
+ for (size_t i = 0; i < repeat_penalty_tokens_uint32_array.ElementLength(); i++) {
302
+ repeat_penalty_tokens.push_back(static_cast<llama_token>(repeat_penalty_tokens_uint32_array[i]));
303
+ }
304
+
305
+ use_repeat_penalty = true;
306
+ }
307
+
308
+ if (options.Has("repeatPenaltyPresencePenalty")) {
309
+ repeat_penalty_presence_penalty = options.Get("repeatPenaltyPresencePenalty").As<Napi::Number>().FloatValue();
310
+ }
311
+
312
+ if (options.Has("repeatPenaltyFrequencyPenalty")) {
313
+ repeat_penalty_frequency_penalty = options.Get("repeatPenaltyFrequencyPenalty").As<Napi::Number>().FloatValue();
314
+ }
315
+
316
+ if (options.Has("grammarEvaluationState")) {
317
+ grammar_evaluation_state = Napi::ObjectWrap<LLAMAGrammarEvaluationState>::Unwrap(options.Get("grammarEvaluationState").As<Napi::Object>());
318
+ grammar_evaluation_state->Ref();
319
+ use_grammar = true;
320
+ }
321
+ }
322
+
275
323
  this->tokens.reserve(tokens.ElementLength());
276
324
  for (size_t i = 0; i < tokens.ElementLength(); i++) { this->tokens.push_back(static_cast<llama_token>(tokens[i])); }
277
325
  }
278
- ~LLAMAContextEvalWorker() { ctx->Unref(); }
326
+ ~LLAMAContextEvalWorker() {
327
+ ctx->Unref();
328
+
329
+ if (use_grammar) {
330
+ grammar_evaluation_state->Unref();
331
+ use_grammar = false;
332
+ }
333
+ }
279
334
  using Napi::AsyncWorker::Queue;
280
335
  using Napi::Promise::Deferred::Promise;
281
336
 
282
337
  protected:
283
338
  void Execute() {
284
- // Perform the evaluation using llama_eval.
285
- int r = llama_eval(ctx->ctx, tokens.data(), int(tokens.size()), llama_get_kv_cache_token_count(ctx->ctx), (ctx->model)->threads);
339
+ llama_batch batch = llama_batch_init(tokens.size(), 0);
340
+
341
+ batch.n_tokens = tokens.size();
342
+
343
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
344
+ batch.token[i] = tokens[i];
345
+ batch.pos[i] = ctx->n_cur;
346
+ batch.seq_id[i] = 0;
347
+ batch.logits[i] = false;
348
+
349
+ ctx->n_cur++;
350
+ }
351
+
352
+ batch.logits[batch.n_tokens - 1] = true;
353
+
354
+ // Perform the evaluation using llama_decode.
355
+ int r = llama_decode(ctx->ctx, batch);
356
+
357
+ llama_batch_free(batch);
358
+
286
359
  if (r != 0) {
287
- SetError("Eval has failed");
360
+ if (r == 1) {
361
+ SetError("could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
362
+ } else {
363
+ SetError("Eval has failed");
364
+ }
365
+
288
366
  return;
289
367
  }
290
368
 
291
369
  llama_token new_token_id = 0;
292
370
 
293
371
  // Select the best prediction.
294
- auto logits = llama_get_logits(ctx->ctx);
295
- auto n_vocab = llama_n_vocab(ctx->ctx);
372
+ auto logits = llama_get_logits_ith(ctx->ctx, batch.n_tokens - 1);
373
+ auto n_vocab = llama_n_vocab(ctx->model->model);
296
374
 
297
375
  std::vector<llama_token_data> candidates;
298
376
  candidates.reserve(n_vocab);
@@ -303,48 +381,43 @@ class LLAMAContextEvalWorker : Napi::AsyncWorker, Napi::Promise::Deferred {
303
381
 
304
382
  llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
305
383
 
306
- float originalEosLogit = 0;
307
384
  auto eos_token = llama_token_eos(ctx->ctx);
308
385
 
309
- for (auto& candidate : candidates) {
310
- if (candidate.id == eos_token) {
311
- originalEosLogit = candidate.logit;
312
- break;
313
- }
314
- }
315
-
316
- if (ctx->use_grammar) {
317
- llama_sample_grammar(ctx->ctx, &candidates_p, (ctx->grammar)->grammar);
386
+ if (use_repeat_penalty && !repeat_penalty_tokens.empty()) {
387
+ llama_sample_repetition_penalty(
388
+ ctx->ctx, &candidates_p, repeat_penalty_tokens.data(), repeat_penalty_tokens.size(), repeat_penalty
389
+ );
390
+ llama_sample_frequency_and_presence_penalties(
391
+ ctx->ctx, &candidates_p, repeat_penalty_tokens.data(), repeat_penalty_tokens.size(),
392
+ repeat_penalty_frequency_penalty, repeat_penalty_presence_penalty
393
+ );
318
394
  }
319
395
 
320
- for (auto& candidate : candidates) {
321
- if (candidate.id == eos_token) {
322
- candidate.logit = originalEosLogit;
323
- break;
324
- }
396
+ if (use_grammar && (grammar_evaluation_state)->grammar != nullptr) {
397
+ llama_sample_grammar(ctx->ctx, &candidates_p, (grammar_evaluation_state)->grammar);
325
398
  }
326
399
 
327
- if ((ctx->model)->temperature <= 0) {
400
+ if (temperature <= 0) {
328
401
  new_token_id = llama_sample_token_greedy(ctx->ctx , &candidates_p);
329
402
  } else {
330
- const int32_t top_k = (ctx->model)->top_k <= 0 ? llama_n_vocab(ctx->ctx) : (ctx->model)->top_k;
403
+ const int32_t resolved_top_k = top_k <= 0 ? llama_n_vocab(ctx->model->model) : std::min(top_k, llama_n_vocab(ctx->model->model));
331
404
  const int32_t n_probs = 0; // Number of probabilities to keep - 0 = disabled
332
405
  const float tfs_z = 1.00f; // Tail free sampling - 1.0 = disabled
333
406
  const float typical_p = 1.00f; // Typical probability - 1.0 = disabled
334
- const float top_p = (ctx->model)->top_p; // Top p sampling - 1.0 = disabled
407
+ const float resolved_top_p = top_p; // Top p sampling - 1.0 = disabled
335
408
 
336
409
  // Temperature sampling
337
410
  size_t min_keep = std::max(1, n_probs);
338
- llama_sample_top_k(ctx->ctx, &candidates_p, top_k, min_keep);
411
+ llama_sample_top_k(ctx->ctx, &candidates_p, resolved_top_k, min_keep);
339
412
  llama_sample_tail_free(ctx->ctx, &candidates_p, tfs_z, min_keep);
340
413
  llama_sample_typical(ctx->ctx, &candidates_p, typical_p, min_keep);
341
- llama_sample_top_p(ctx->ctx, &candidates_p, top_p, min_keep);
342
- llama_sample_temperature(ctx->ctx, &candidates_p, (ctx->model)->temperature);;
414
+ llama_sample_top_p(ctx->ctx, &candidates_p, resolved_top_p, min_keep);
415
+ llama_sample_temperature(ctx->ctx, &candidates_p, temperature);
343
416
  new_token_id = llama_sample_token(ctx->ctx, &candidates_p);
344
417
  }
345
418
 
346
- if (new_token_id != eos_token && ctx->use_grammar) {
347
- llama_grammar_accept_token(ctx->ctx, (ctx->grammar)->grammar, new_token_id);
419
+ if (new_token_id != eos_token && use_grammar && (grammar_evaluation_state)->grammar != nullptr) {
420
+ llama_grammar_accept_token(ctx->ctx, (grammar_evaluation_state)->grammar, new_token_id);
348
421
  }
349
422
 
350
423
  result = new_token_id;
@@ -372,6 +445,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
372
445
  });
373
446
  LLAMAModel::init(exports);
374
447
  LLAMAGrammar::init(exports);
448
+ LLAMAGrammarEvaluationState::init(exports);
375
449
  LLAMAContext::init(exports);
376
450
  return exports;
377
451
  }
@@ -1,3 +1,3 @@
1
1
  {
2
- "release": "b1277"
2
+ "release": "b1357"
3
3
  }
Binary file
@@ -0,0 +1,8 @@
1
+ set(CMAKE_SYSTEM_NAME Darwin) # macOS
2
+ set(CMAKE_SYSTEM_PROCESSOR arm64)
3
+
4
+ set(CMAKE_C_COMPILER clang)
5
+ set(CMAKE_CXX_COMPILER clang++)
6
+
7
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -arch arm64")
8
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -arch arm64")
@@ -0,0 +1,5 @@
1
+ set(CMAKE_SYSTEM_NAME Linux)
2
+ set(CMAKE_SYSTEM_PROCESSOR x86_64)
3
+
4
+ set(CMAKE_C_COMPILER x86_64-linux-gnu-gcc)
5
+ set(CMAKE_CXX_COMPILER x86_64-linux-gnu-g++)
@@ -0,0 +1,5 @@
1
+ set(CMAKE_SYSTEM_NAME Linux)
2
+ set(CMAKE_SYSTEM_PROCESSOR aarch64)
3
+
4
+ set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
5
+ set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
@@ -0,0 +1,5 @@
1
+ set(CMAKE_SYSTEM_NAME Linux)
2
+ set(CMAKE_SYSTEM_PROCESSOR arm)
3
+
4
+ set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc)
5
+ set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++)