cui-llama.rn 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/LICENSE +20 -0
  2. package/README.md +330 -0
  3. package/android/build.gradle +107 -0
  4. package/android/gradle.properties +5 -0
  5. package/android/src/main/AndroidManifest.xml +4 -0
  6. package/android/src/main/CMakeLists.txt +69 -0
  7. package/android/src/main/java/com/rnllama/LlamaContext.java +353 -0
  8. package/android/src/main/java/com/rnllama/RNLlama.java +446 -0
  9. package/android/src/main/java/com/rnllama/RNLlamaPackage.java +48 -0
  10. package/android/src/main/jni.cpp +635 -0
  11. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +94 -0
  12. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +95 -0
  13. package/cpp/README.md +4 -0
  14. package/cpp/common.cpp +3237 -0
  15. package/cpp/common.h +467 -0
  16. package/cpp/ggml-aarch64.c +2193 -0
  17. package/cpp/ggml-aarch64.h +39 -0
  18. package/cpp/ggml-alloc.c +1041 -0
  19. package/cpp/ggml-alloc.h +76 -0
  20. package/cpp/ggml-backend-impl.h +153 -0
  21. package/cpp/ggml-backend.c +2225 -0
  22. package/cpp/ggml-backend.h +236 -0
  23. package/cpp/ggml-common.h +1829 -0
  24. package/cpp/ggml-impl.h +655 -0
  25. package/cpp/ggml-metal.h +65 -0
  26. package/cpp/ggml-metal.m +3273 -0
  27. package/cpp/ggml-quants.c +15022 -0
  28. package/cpp/ggml-quants.h +132 -0
  29. package/cpp/ggml.c +22034 -0
  30. package/cpp/ggml.h +2444 -0
  31. package/cpp/grammar-parser.cpp +536 -0
  32. package/cpp/grammar-parser.h +29 -0
  33. package/cpp/json-schema-to-grammar.cpp +1045 -0
  34. package/cpp/json-schema-to-grammar.h +8 -0
  35. package/cpp/json.hpp +24766 -0
  36. package/cpp/llama.cpp +21789 -0
  37. package/cpp/llama.h +1201 -0
  38. package/cpp/log.h +737 -0
  39. package/cpp/rn-llama.hpp +630 -0
  40. package/cpp/sampling.cpp +460 -0
  41. package/cpp/sampling.h +160 -0
  42. package/cpp/sgemm.cpp +1027 -0
  43. package/cpp/sgemm.h +14 -0
  44. package/cpp/unicode-data.cpp +7032 -0
  45. package/cpp/unicode-data.h +20 -0
  46. package/cpp/unicode.cpp +812 -0
  47. package/cpp/unicode.h +64 -0
  48. package/ios/RNLlama.h +11 -0
  49. package/ios/RNLlama.mm +302 -0
  50. package/ios/RNLlama.xcodeproj/project.pbxproj +278 -0
  51. package/ios/RNLlamaContext.h +39 -0
  52. package/ios/RNLlamaContext.mm +426 -0
  53. package/jest/mock.js +169 -0
  54. package/lib/commonjs/NativeRNLlama.js +10 -0
  55. package/lib/commonjs/NativeRNLlama.js.map +1 -0
  56. package/lib/commonjs/grammar.js +574 -0
  57. package/lib/commonjs/grammar.js.map +1 -0
  58. package/lib/commonjs/index.js +151 -0
  59. package/lib/commonjs/index.js.map +1 -0
  60. package/lib/module/NativeRNLlama.js +3 -0
  61. package/lib/module/NativeRNLlama.js.map +1 -0
  62. package/lib/module/grammar.js +566 -0
  63. package/lib/module/grammar.js.map +1 -0
  64. package/lib/module/index.js +129 -0
  65. package/lib/module/index.js.map +1 -0
  66. package/lib/typescript/NativeRNLlama.d.ts +107 -0
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -0
  68. package/lib/typescript/grammar.d.ts +38 -0
  69. package/lib/typescript/grammar.d.ts.map +1 -0
  70. package/lib/typescript/index.d.ts +46 -0
  71. package/lib/typescript/index.d.ts.map +1 -0
  72. package/llama-rn.podspec +56 -0
  73. package/package.json +230 -0
  74. package/src/NativeRNLlama.ts +132 -0
  75. package/src/grammar.ts +849 -0
  76. package/src/index.ts +182 -0
package/cpp/sampling.cpp ADDED
@@ -0,0 +1,460 @@
+ #define LLAMA_API_INTERNAL
+ #include "sampling.h"
+ #include <random>
+
+ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
+     struct llama_sampling_context * result = new llama_sampling_context();
+
+     result->params = params;
+     result->grammar = nullptr;
+
+     // if there is a grammar, parse it
+     if (!params.grammar.empty()) {
+         result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+
+         // will be empty (default) if there are parse errors
+         if (result->parsed_grammar.rules.empty()) {
+             fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+             delete result;
+             return nullptr;
+         }
+
+         // Ensure that there is a "root" node.
+         if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
+             fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+             delete result;
+             return nullptr;
+         }
+
+         std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
+
+         struct llama_grammar * grammar = llama_grammar_init(
+                 grammar_rules.data(),
+                 grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
+         if (grammar == nullptr) {
+             throw std::runtime_error("Failed to initialize llama_grammar");
+         }
+         result->grammar = grammar;
+     }
+
+     result->prev.resize(params.n_prev);
+
+     result->n_valid = 0;
+
+     llama_sampling_set_rng_seed(result, params.seed);
+
+     return result;
+ }
+
+ void llama_sampling_free(struct llama_sampling_context * ctx) {
+     if (ctx->grammar != NULL) {
+         llama_grammar_free(ctx->grammar);
+     }
+
+     delete ctx;
+ }
+
+ void llama_sampling_reset(llama_sampling_context * ctx) {
+     if (ctx->grammar != NULL) {
+         llama_grammar_free(ctx->grammar);
+         ctx->grammar = NULL;
+     }
+
+     if (!ctx->parsed_grammar.rules.empty()) {
+         std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
+
+         struct llama_grammar * grammar = llama_grammar_init(
+                 grammar_rules.data(),
+                 grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
+         if (grammar == nullptr) {
+             throw std::runtime_error("Failed to initialize llama_grammar");
+         }
+         ctx->grammar = grammar;
+     }
+
+     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
+     ctx->cur.clear();
+     ctx->n_valid = 0;
+ }
+
+ void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
+     if (seed == LLAMA_DEFAULT_SEED) {
+         seed = std::random_device{}();
+     }
+     ctx->rng.seed(seed);
+ }
+
+ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
+     if (dst->grammar) {
+         llama_grammar_free(dst->grammar);
+         dst->grammar = nullptr;
+     }
+
+     if (src->grammar) {
+         dst->grammar = llama_grammar_copy(src->grammar);
+     }
+
+     dst->prev = src->prev;
+ }
+
+ llama_token llama_sampling_last(llama_sampling_context * ctx) {
+     return ctx->prev.back();
+ }
+
+ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
+     const int size = ctx_sampling->prev.size();
+
+     n = std::min(n, size);
+
+     std::string result;
+
+     for (int i = size - n; i < size; i++) {
+         result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
+     }
+
+     return result;
+ }
+
+ std::string llama_sampling_print(const llama_sampling_params & params) {
+     char result[1024];
+
+     snprintf(result, sizeof(result),
+             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
+             "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
+             params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
+             params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
+             params.mirostat, params.mirostat_eta, params.mirostat_tau);
+
+     return std::string(result);
+ }
+
+ std::string llama_sampling_order_print(const llama_sampling_params & params) {
+     std::string result = "CFG -> Penalties ";
+     if (params.mirostat == 0) {
+         for (auto sampler_type : params.samplers_sequence) {
+             const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
+             if (!sampler_type_name.empty()) {
+                 result += "-> " + sampler_type_name + " ";
+             }
+         }
+     } else {
+         result += "-> mirostat ";
+     }
+
+     return result;
+ }
+
+ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
+     switch (sampler_type) {
+         case llama_sampler_type::TOP_K: return "top_k";
+         case llama_sampler_type::TFS_Z: return "tfs_z";
+         case llama_sampler_type::TYPICAL_P: return "typical_p";
+         case llama_sampler_type::TOP_P: return "top_p";
+         case llama_sampler_type::MIN_P: return "min_p";
+         case llama_sampler_type::TEMPERATURE: return "temperature";
+         default : return "";
+     }
+ }
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+     std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+         {"top_k", llama_sampler_type::TOP_K},
+         {"top_p", llama_sampler_type::TOP_P},
+         {"typical_p", llama_sampler_type::TYPICAL_P},
+         {"min_p", llama_sampler_type::MIN_P},
+         {"tfs_z", llama_sampler_type::TFS_Z},
+         {"temperature", llama_sampler_type::TEMPERATURE}
+     };
+
+     // since samplers names are written multiple ways
+     // make it ready for both system names and input names
+     std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+         {"top-k", llama_sampler_type::TOP_K},
+         {"top-p", llama_sampler_type::TOP_P},
+         {"nucleus", llama_sampler_type::TOP_P},
+         {"typical-p", llama_sampler_type::TYPICAL_P},
+         {"typical", llama_sampler_type::TYPICAL_P},
+         {"min-p", llama_sampler_type::MIN_P},
+         {"tfs-z", llama_sampler_type::TFS_Z},
+         {"tfs", llama_sampler_type::TFS_Z},
+         {"temp", llama_sampler_type::TEMPERATURE}
+     };
+
+     std::vector<llama_sampler_type> sampler_types;
+     sampler_types.reserve(names.size());
+     for (const auto & name : names)
+     {
+         auto sampler_item = sampler_canonical_name_map.find(name);
+         if (sampler_item != sampler_canonical_name_map.end())
+         {
+             sampler_types.push_back(sampler_item->second);
+         }
+         else
+         {
+             if (allow_alt_names)
+             {
+                 sampler_item = sampler_alt_name_map.find(name);
+                 if (sampler_item != sampler_alt_name_map.end())
+                 {
+                     sampler_types.push_back(sampler_item->second);
+                 }
+             }
+         }
+     }
+     return sampler_types;
+ }
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
+     std::unordered_map<char, llama_sampler_type> sampler_name_map {
+         {'k', llama_sampler_type::TOP_K},
+         {'p', llama_sampler_type::TOP_P},
+         {'y', llama_sampler_type::TYPICAL_P},
+         {'m', llama_sampler_type::MIN_P},
+         {'f', llama_sampler_type::TFS_Z},
+         {'t', llama_sampler_type::TEMPERATURE}
+     };
+
+     std::vector<llama_sampler_type> sampler_types;
+     sampler_types.reserve(names_string.size());
+     for (const auto & c : names_string) {
+         const auto sampler_item = sampler_name_map.find(c);
+         if (sampler_item != sampler_name_map.end()) {
+             sampler_types.push_back(sampler_item->second);
+         }
+     }
+     return sampler_types;
+ }
+
+ // no reasons to expose this function in header
+ static void sampler_queue(
+         struct llama_context * ctx_main,
+         const llama_sampling_params & params,
+         llama_token_data_array & cur_p,
+         size_t min_keep) {
+     const float temp = params.temp;
+     const float dynatemp_range = params.dynatemp_range;
+     const float dynatemp_exponent = params.dynatemp_exponent;
+     const int32_t top_k = params.top_k;
+     const float top_p = params.top_p;
+     const float min_p = params.min_p;
+     const float tfs_z = params.tfs_z;
+     const float typical_p = params.typical_p;
+     const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
+
+     for (auto sampler_type : samplers_sequence) {
+         switch (sampler_type) {
+             case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
+             case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
+             case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
+             case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
+             case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
+             case llama_sampler_type::TEMPERATURE:
+                 if (dynatemp_range > 0) {
+                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
+                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
+                     llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+                 } else {
+                     llama_sample_temp(ctx_main, &cur_p, temp);
+                 }
+                 break;
+             default : break;
+         }
+     }
+ }
+
+ static llama_token llama_sampling_sample_impl(
+         struct llama_sampling_context * ctx_sampling,
+         struct llama_context * ctx_main,
+         struct llama_context * ctx_cfg,
+         const int idx,
+         bool is_resampling) {
+     const llama_sampling_params & params = ctx_sampling->params;
+
+     const float temp = params.temp;
+     const int mirostat = params.mirostat;
+     const float mirostat_tau = params.mirostat_tau;
+     const float mirostat_eta = params.mirostat_eta;
+
+     std::vector<float> original_logits;
+     auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
+     if (ctx_sampling->grammar != NULL && !is_resampling) {
+         LM_GGML_ASSERT(!original_logits.empty());
+     }
+     llama_token id = 0;
+
+     if (temp < 0.0) {
+         // greedy sampling, with probs
+         llama_sample_softmax(ctx_main, &cur_p);
+         id = cur_p.data[0].id;
+     } else if (temp == 0.0) {
+         // greedy sampling, no probs
+         id = llama_sample_token_greedy(ctx_main, &cur_p);
+     } else {
+         if (mirostat == 1) {
+             const int mirostat_m = 100;
+             llama_sample_temp(ctx_main, &cur_p, temp);
+             id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
+         } else if (mirostat == 2) {
+             llama_sample_temp(ctx_main, &cur_p, temp);
+             id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
+         } else {
+             // temperature sampling
+             size_t min_keep = std::max(1, params.min_keep);
+
+             sampler_queue(ctx_main, params, cur_p, min_keep);
+
+             id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
+
+             //{
+             //    const int n_top = 10;
+             //    LOG("top %d candidates:\n", n_top);
+
+             //    for (int i = 0; i < n_top; i++) {
+             //        const llama_token id = cur_p.data[i].id;
+             //        (void)id; // To avoid a warning that id is unused when logging is disabled.
+             //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
+             //    }
+             //}
+
+             //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+         }
+     }
+
+     if (ctx_sampling->grammar != NULL && !is_resampling) {
+         // Get a pointer to the logits
+         float * logits = llama_get_logits_ith(ctx_main, idx);
+
+         // Create an array with a single token data element for the sampled id
+         llama_token_data single_token_data = {id, logits[id], 0.0f};
+         llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
+
+         // Apply grammar constraints to the single token
+         llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
+
+         // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
+         bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+
+         // If the token is not valid according to the grammar, perform resampling
+         if (!is_valid) {
+             LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
+
+             // Restore logits from the copy
+             std::copy(original_logits.begin(), original_logits.end(), logits);
+
+             return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
+         }
+     }
+
+     ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
+
+     return id;
+ }
+
+ static llama_token_data_array llama_sampling_prepare_impl(
+         struct llama_sampling_context * ctx_sampling,
+         struct llama_context * ctx_main,
+         struct llama_context * ctx_cfg,
+         const int idx,
+         bool apply_grammar,
+         std::vector<float> * original_logits) {
+     const llama_sampling_params & params = ctx_sampling->params;
+
+     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+     const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
+     const float penalty_repeat = params.penalty_repeat;
+     const float penalty_freq = params.penalty_freq;
+     const float penalty_present = params.penalty_present;
+
+     const bool penalize_nl = params.penalize_nl;
+
+     auto & prev = ctx_sampling->prev;
+     auto & cur = ctx_sampling->cur;
+
+     // Get a pointer to the logits
+     float * logits = llama_get_logits_ith(ctx_main, idx);
+
+     if (ctx_sampling->grammar != NULL && !apply_grammar) {
+         LM_GGML_ASSERT(original_logits != NULL);
+         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
+         *original_logits = {logits, logits + n_vocab};
+     }
+
+     // apply params.logit_bias map
+     for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+         logits[it->first] += it->second;
+     }
+
+     if (ctx_cfg) {
+         float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
+         llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+     }
+
+     cur.resize(n_vocab);
+
+     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+         cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+     }
+
+     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+
+     // apply penalties
+     const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+     const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
+     if (penalty_tokens_used_size) {
+         const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
+
+         llama_sample_repetition_penalties(ctx_main, &cur_p,
+                 penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+                 penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+
+         if (!penalize_nl) {
+             for (size_t idx = 0; idx < cur_p.size; idx++) {
+                 if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
+                     cur_p.data[idx].logit = nl_logit;
+                     break;
+                 }
+             }
+         }
+     }
+
+     // apply grammar checks before sampling logic
+     if (apply_grammar && ctx_sampling->grammar != NULL) {
+         llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+     }
+
+     return cur_p;
+ }
+
+ llama_token llama_sampling_sample(
+         struct llama_sampling_context * ctx_sampling,
+         struct llama_context * ctx_main,
+         struct llama_context * ctx_cfg,
+         const int idx) {
+     // Call the implementation function with is_resampling set to false by default
+     return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
+ }
+
+ llama_token_data_array llama_sampling_prepare(
+         struct llama_sampling_context * ctx_sampling,
+         struct llama_context * ctx_main,
+         struct llama_context * ctx_cfg,
+         const int idx,
+         bool apply_grammar,
+         std::vector<float> * original_logits) {
+     return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
+ }
+
+ void llama_sampling_accept(
+         struct llama_sampling_context * ctx_sampling,
+         struct llama_context * ctx_main,
+         llama_token id,
+         bool apply_grammar) {
+     ctx_sampling->prev.erase(ctx_sampling->prev.begin());
+     ctx_sampling->prev.push_back(id);
+
+     if (ctx_sampling->grammar != NULL && apply_grammar) {
+         llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
+     }
+ }
package/cpp/sampling.h ADDED
@@ -0,0 +1,160 @@
+ #pragma once
+
+ #include "llama.h"
+
+ #include "grammar-parser.h"
+
+ #include <random>
+ #include <string>
+ #include <unordered_map>
+ #include <vector>
+
+ // sampler types
+ enum class llama_sampler_type : char {
+     TOP_K = 'k',
+     TOP_P = 'p',
+     MIN_P = 'm',
+     TFS_Z = 'f',
+     TYPICAL_P = 'y',
+     TEMPERATURE = 't'
+ };
+
+ // sampling parameters
+ typedef struct llama_sampling_params {
+     int32_t n_prev = 64; // number of previous tokens to remember
+     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+     int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+     int32_t top_k = 40; // <= 0 to use vocab size
+     float top_p = 0.95f; // 1.0 = disabled
+     float min_p = 0.05f; // 0.0 = disabled
+     float tfs_z = 1.00f; // 1.0 = disabled
+     float typical_p = 1.00f; // 1.0 = disabled
+     float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+     float dynatemp_range = 0.00f; // 0.0 = disabled
+     float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+     int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+     float penalty_repeat = 1.00f; // 1.0 = disabled
+     float penalty_freq = 0.00f; // 0.0 = disabled
+     float penalty_present = 0.00f; // 0.0 = disabled
+     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+     float mirostat_tau = 5.00f; // target entropy
+     float mirostat_eta = 0.10f; // learning rate
+     bool penalize_nl = false; // consider newlines as a repeatable token
+     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
+
+     std::vector<llama_sampler_type> samplers_sequence = {
+         llama_sampler_type::TOP_K,
+         llama_sampler_type::TFS_Z,
+         llama_sampler_type::TYPICAL_P,
+         llama_sampler_type::TOP_P,
+         llama_sampler_type::MIN_P,
+         llama_sampler_type::TEMPERATURE
+     };
+
+     std::string grammar; // optional BNF-like grammar to constrain sampling
+
+     // Classifier-Free Guidance
+     // https://arxiv.org/abs/2306.17806
+     std::string cfg_negative_prompt; // string to help guidance
+     float cfg_scale = 1.f; // how strong is guidance
+
+     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
+     std::vector<llama_token> penalty_prompt_tokens;
+     bool use_penalty_prompt_tokens = false;
+ } llama_sampling_params;
+
+ // general sampler context
+ // TODO: move to llama.h
+ struct llama_sampling_context {
+     // parameters that will be used for sampling
+     llama_sampling_params params;
+
+     // mirostat sampler state
+     float mirostat_mu;
+
+     llama_grammar * grammar;
+
+     // internal
+     grammar_parser::parse_state parsed_grammar;
+
+     // TODO: replace with ring-buffer
+     std::vector<llama_token> prev;
+     std::vector<llama_token_data> cur;
+     size_t n_valid; // Number of correct top tokens with correct probabilities.
+
+     std::mt19937 rng;
+ };
+
+ #include "common.h"
+
+ // Create a new sampling context instance.
+ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
+
+ void llama_sampling_free(struct llama_sampling_context * ctx);
+
+ // Reset the sampler context
+ // - clear prev tokens
+ // - reset grammar
+ void llama_sampling_reset(llama_sampling_context * ctx);
+
+ // Set the sampler seed
+ void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
+ // Copy the sampler context
+ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
+
+ // Get the last sampled token
+ llama_token llama_sampling_last(llama_sampling_context * ctx);
+
+ // Get a string representation of the last sampled tokens
+ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
+
+ // Print sampling parameters into a string
+ std::string llama_sampling_print(const llama_sampling_params & params);
+
+ // Print sampling order into a string
+ std::string llama_sampling_order_print(const llama_sampling_params & params);
+
+ std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
+
+ std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+ std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
+
+ // this is a common sampling function used across the examples for convenience
+ // it can serve as a starting point for implementing your own sampling function
+ // Note: When using multiple sequences, it is the caller's responsibility to call
+ //       llama_sampling_reset when a sequence ends
+ //
+ // required:
+ //  - ctx_main: context to use for sampling
+ //  - ctx_sampling: sampling-specific context
+ //
+ // optional:
+ //  - ctx_cfg: context to use for classifier-free guidance
+ //  - idx: sample from llama_get_logits_ith(ctx, idx)
+ //
+ // returns:
+ //  - token: sampled token
+ //  - candidates: vector of candidate tokens
+ //
+ llama_token llama_sampling_sample(
+         struct llama_sampling_context * ctx_sampling,
+         struct llama_context * ctx_main,
+         struct llama_context * ctx_cfg,
+         int idx = -1);
+
+ // Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
+ llama_token_data_array llama_sampling_prepare(
+         struct llama_sampling_context * ctx_sampling,
+         struct llama_context * ctx_main,
+         struct llama_context * ctx_cfg,
+         int idx = 0,
+         bool apply_grammar = true,
+         std::vector<float> * original_logits = nullptr);
+
+ void llama_sampling_accept(
+         struct llama_sampling_context * ctx_sampling,
+         struct llama_context * ctx_main,
+         llama_token id,
+         bool apply_grammar);
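
For orientation only (this sketch is not part of the published package contents above): the header package/cpp/sampling.h documents a sample/accept loop, and the code below shows a minimal way to drive it. It assumes a llama_context that has already been created and fed a prompt through the upstream llama.cpp API; feeding each sampled token back through llama_decode is deliberately left out, and only functions that appear in this diff (plus llama_token_to_piece from common.h) are used.

#include <cstdio>
#include "common.h"
#include "sampling.h"

// Minimal usage sketch (illustration, not shipped in the package): sample a
// fixed number of tokens from an existing, already-decoded llama_context.
void sample_n_tokens(llama_context * ctx, int n_tokens) {
    llama_sampling_params sparams;      // defaults as declared in sampling.h
    sparams.temp  = 0.7f;
    sparams.top_p = 0.9f;

    llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

    for (int i = 0; i < n_tokens; i++) {
        // sample from the most recent logits (idx = -1), no CFG context
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);

        // record the token so repetition penalties and grammar state advance
        llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);

        printf("%s", llama_token_to_piece(ctx, id).c_str());

        // a real loop would feed `id` back through llama_decode before the
        // next iteration; that part of the llama.cpp API is omitted here
    }

    llama_sampling_free(ctx_sampling);
}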