cui-llama.rn 1.3.0 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/android/src/main/CMakeLists.txt +6 -1
  2. package/android/src/main/jni.cpp +6 -6
  3. package/cpp/amx/amx.cpp +196 -0
  4. package/cpp/amx/amx.h +20 -0
  5. package/cpp/amx/common.h +101 -0
  6. package/cpp/amx/mmq.cpp +2524 -0
  7. package/cpp/amx/mmq.h +16 -0
  8. package/cpp/common.cpp +1981 -1682
  9. package/cpp/common.h +636 -600
  10. package/cpp/ggml-aarch64.c +129 -129
  11. package/cpp/ggml-aarch64.h +19 -19
  12. package/cpp/ggml-alloc.c +1038 -1040
  13. package/cpp/ggml-alloc.h +76 -76
  14. package/cpp/ggml-backend-impl.h +238 -216
  15. package/cpp/ggml-backend-reg.cpp +423 -195
  16. package/cpp/ggml-backend.cpp +1999 -1997
  17. package/cpp/ggml-backend.h +351 -328
  18. package/cpp/ggml-common.h +1859 -1853
  19. package/cpp/ggml-cpp.h +38 -38
  20. package/cpp/ggml-cpu-aarch64.c +3823 -3560
  21. package/cpp/ggml-cpu-aarch64.h +32 -30
  22. package/cpp/ggml-cpu-impl.h +386 -371
  23. package/cpp/ggml-cpu-quants.c +10835 -10822
  24. package/cpp/ggml-cpu-quants.h +63 -63
  25. package/cpp/ggml-cpu.c +99 -103
  26. package/cpp/ggml-cpu.cpp +69 -17
  27. package/cpp/ggml-cpu.h +152 -177
  28. package/cpp/ggml-impl.h +556 -550
  29. package/cpp/ggml-metal.h +66 -66
  30. package/cpp/ggml-metal.m +4426 -4294
  31. package/cpp/ggml-quants.c +5247 -5247
  32. package/cpp/ggml-quants.h +100 -100
  33. package/cpp/ggml-threading.cpp +12 -12
  34. package/cpp/ggml-threading.h +12 -12
  35. package/cpp/ggml.c +7618 -8180
  36. package/cpp/ggml.h +2255 -2411
  37. package/cpp/json-schema-to-grammar.cpp +1045 -0
  38. package/cpp/json-schema-to-grammar.h +8 -0
  39. package/cpp/json.hpp +24766 -0
  40. package/cpp/llama-grammar.cpp +1138 -1138
  41. package/cpp/llama-grammar.h +144 -144
  42. package/cpp/llama-impl.h +181 -181
  43. package/cpp/llama-sampling.cpp +2348 -2348
  44. package/cpp/llama-sampling.h +48 -48
  45. package/cpp/llama-vocab.cpp +1984 -1984
  46. package/cpp/llama-vocab.h +170 -170
  47. package/cpp/llama.cpp +22332 -22132
  48. package/cpp/llama.h +1259 -1253
  49. package/cpp/log.cpp +401 -401
  50. package/cpp/log.h +121 -121
  51. package/cpp/rn-llama.hpp +6 -6
  52. package/cpp/sampling.cpp +505 -466
  53. package/cpp/sampling.h +22 -1
  54. package/cpp/sgemm.cpp +1884 -1884
  55. package/cpp/speculative.cpp +270 -0
  56. package/cpp/speculative.h +28 -0
  57. package/cpp/unicode.cpp +11 -0
  58. package/ios/RNLlamaContext.mm +13 -0
  59. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  60. package/lib/commonjs/grammar.js +4 -2
  61. package/lib/commonjs/grammar.js.map +1 -1
  62. package/lib/commonjs/index.js.map +1 -1
  63. package/lib/module/NativeRNLlama.js.map +1 -1
  64. package/lib/module/grammar.js +2 -1
  65. package/lib/module/grammar.js.map +1 -1
  66. package/lib/module/index.js.map +1 -1
  67. package/lib/typescript/NativeRNLlama.d.ts +94 -4
  68. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  69. package/lib/typescript/grammar.d.ts +5 -6
  70. package/lib/typescript/grammar.d.ts.map +1 -1
  71. package/lib/typescript/index.d.ts +4 -2
  72. package/lib/typescript/index.d.ts.map +1 -1
  73. package/package.json +2 -1
  74. package/src/NativeRNLlama.ts +97 -10
  75. package/src/grammar.ts +10 -8
  76. package/src/index.ts +22 -1
package/cpp/sampling.cpp CHANGED
@@ -1,466 +1,505 @@
1
- #include "sampling.h"
2
-
3
- #include "common.h"
4
-
5
- #include <cmath>
6
- #include <unordered_map>
7
-
8
- // the ring buffer works similarly to std::deque, but with a fixed capacity
9
- // TODO: deduplicate with llama-impl.h
10
- template<typename T>
11
- struct ring_buffer {
12
- ring_buffer(size_t cap) : capacity(cap), data(cap) {}
13
-
14
- T & front() {
15
- if (sz == 0) {
16
- throw std::runtime_error("ring buffer is empty");
17
- }
18
- return data[first];
19
- }
20
-
21
- const T & front() const {
22
- if (sz == 0) {
23
- throw std::runtime_error("ring buffer is empty");
24
- }
25
- return data[first];
26
- }
27
-
28
- T & back() {
29
- if (sz == 0) {
30
- throw std::runtime_error("ring buffer is empty");
31
- }
32
- return data[pos];
33
- }
34
-
35
- const T & back() const {
36
- if (sz == 0) {
37
- throw std::runtime_error("ring buffer is empty");
38
- }
39
- return data[pos];
40
- }
41
-
42
- void push_back(const T & value) {
43
- if (sz == capacity) {
44
- // advance the start when buffer is full
45
- first = (first + 1) % capacity;
46
- } else {
47
- sz++;
48
- }
49
- data[pos] = value;
50
- pos = (pos + 1) % capacity;
51
- }
52
-
53
- T pop_front() {
54
- if (sz == 0) {
55
- throw std::runtime_error("ring buffer is empty");
56
- }
57
- T value = data[first];
58
- first = (first + 1) % capacity;
59
- sz--;
60
- return value;
61
- }
62
-
63
- const T & rat(size_t i) const {
64
- if (i >= sz) {
65
- throw std::runtime_error("ring buffer: index out of bounds");
66
- }
67
- return data[(first + sz - i - 1) % capacity];
68
- }
69
-
70
- std::vector<T> to_vector() const {
71
- std::vector<T> result;
72
- result.reserve(sz);
73
- for (size_t i = 0; i < sz; i++) {
74
- result.push_back(data[(first + i) % capacity]);
75
- }
76
- return result;
77
- }
78
-
79
- void clear() {
80
- // here only reset the status of the buffer
81
- sz = 0;
82
- first = 0;
83
- pos = 0;
84
- }
85
-
86
- bool empty() const {
87
- return sz == 0;
88
- }
89
-
90
- size_t size() const {
91
- return sz;
92
- }
93
-
94
- size_t capacity = 0;
95
- size_t sz = 0;
96
- size_t first = 0;
97
- size_t pos = 0;
98
- std::vector<T> data;
99
- };
100
-
101
- struct common_sampler {
102
- common_sampler_params params;
103
-
104
- struct llama_sampler * grmr;
105
- struct llama_sampler * chain;
106
-
107
- ring_buffer<llama_token> prev;
108
-
109
- std::vector<llama_token_data> cur;
110
-
111
- llama_token_data_array cur_p;
112
-
113
- void set_logits(struct llama_context * ctx, int idx) {
114
- const auto * logits = llama_get_logits_ith(ctx, idx);
115
-
116
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
117
-
118
- cur.resize(n_vocab);
119
-
120
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
121
- cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
122
- }
123
-
124
- cur_p = { cur.data(), cur.size(), -1, false };
125
- }
126
- };
127
-
128
- std::string common_sampler_params::print() const {
129
- char result[1024];
130
-
131
- snprintf(result, sizeof(result),
132
- "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
133
- "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
134
- "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
135
- "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
136
- penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
137
- dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
138
- top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
139
- mirostat, mirostat_eta, mirostat_tau);
140
-
141
- return std::string(result);
142
- }
143
-
144
- struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
145
- llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
146
-
147
- lparams.no_perf = params.no_perf;
148
-
149
- auto * result = new common_sampler {
150
- /* .params = */ params,
151
- /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
152
- /* .chain = */ llama_sampler_chain_init(lparams),
153
- /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
154
- /* .cur = */ {},
155
- /* .cur_p = */ {},
156
- };
157
-
158
- llama_sampler_chain_add(result->chain,
159
- llama_sampler_init_logit_bias(
160
- llama_n_vocab(model),
161
- params.logit_bias.size(),
162
- params.logit_bias.data()));
163
-
164
- llama_sampler_chain_add(result->chain,
165
- llama_sampler_init_penalties(
166
- llama_n_vocab (model),
167
- llama_token_eos(model),
168
- llama_token_nl (model),
169
- params.penalty_last_n,
170
- params.penalty_repeat,
171
- params.penalty_freq,
172
- params.penalty_present,
173
- params.penalize_nl,
174
- params.ignore_eos));
175
-
176
- if (params.mirostat == 0) {
177
- for (const auto & cnstr : params.samplers) {
178
- switch (cnstr) {
179
- case COMMON_SAMPLER_TYPE_DRY:
180
- {
181
- std::vector<const char*> c_breakers;
182
- c_breakers.reserve(params.dry_sequence_breakers.size());
183
- for (const auto& str : params.dry_sequence_breakers) {
184
- c_breakers.push_back(str.c_str());
185
- }
186
-
187
- llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
188
- }
189
- break;
190
- case COMMON_SAMPLER_TYPE_TOP_K:
191
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
192
- break;
193
- case COMMON_SAMPLER_TYPE_TOP_P:
194
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
195
- break;
196
- case COMMON_SAMPLER_TYPE_MIN_P:
197
- llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
198
- break;
199
- case COMMON_SAMPLER_TYPE_XTC:
200
- llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
201
- break;
202
- case COMMON_SAMPLER_TYPE_TYPICAL_P:
203
- llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
204
- break;
205
- case COMMON_SAMPLER_TYPE_TEMPERATURE:
206
- llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
207
- break;
208
- case COMMON_SAMPLER_TYPE_INFILL:
209
- llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
210
- break;
211
- default:
212
- LM_GGML_ASSERT(false && "unknown sampler type");
213
- }
214
- }
215
- llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
216
- } else if (params.mirostat == 1) {
217
- llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
218
- llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
219
- } else if (params.mirostat == 2) {
220
- llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
221
- llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
222
- } else {
223
- LM_GGML_ASSERT(false && "unknown mirostat version");
224
- }
225
-
226
- return result;
227
- }
228
-
229
- void common_sampler_free(struct common_sampler * gsmpl) {
230
- if (gsmpl) {
231
- llama_sampler_free(gsmpl->grmr);
232
-
233
- llama_sampler_free(gsmpl->chain);
234
-
235
- delete gsmpl;
236
- }
237
- }
238
-
239
- void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
240
- if (accept_grammar) {
241
- llama_sampler_accept(gsmpl->grmr, token);
242
- }
243
-
244
- llama_sampler_accept(gsmpl->chain, token);
245
-
246
- gsmpl->prev.push_back(token);
247
- }
248
-
249
- void common_sampler_reset(struct common_sampler * gsmpl) {
250
- llama_sampler_reset(gsmpl->grmr);
251
-
252
- llama_sampler_reset(gsmpl->chain);
253
- }
254
-
255
- struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
256
- return new common_sampler {
257
- /* .params = */ gsmpl->params,
258
- /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
259
- /* .chain = */ llama_sampler_clone(gsmpl->chain),
260
- /* .prev = */ gsmpl->prev,
261
- /* .cur = */ gsmpl->cur,
262
- /* .cur_p = */ gsmpl->cur_p,
263
- };
264
- }
265
-
266
- void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
267
- // TODO: measure grammar performance
268
-
269
- if (gsmpl) {
270
- llama_perf_sampler_print(gsmpl->chain);
271
- }
272
- if (ctx) {
273
- llama_perf_context_print(ctx);
274
- }
275
- }
276
-
277
- llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
278
- gsmpl->set_logits(ctx, idx);
279
-
280
- auto & grmr = gsmpl->grmr;
281
- auto & chain = gsmpl->chain;
282
- auto & cur_p = gsmpl->cur_p; // initialized by set_logits
283
-
284
- if (grammar_first) {
285
- llama_sampler_apply(grmr, &cur_p);
286
- }
287
-
288
- llama_sampler_apply(chain, &cur_p);
289
-
290
- LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
291
-
292
- const llama_token id = cur_p.data[cur_p.selected].id;
293
-
294
- if (grammar_first) {
295
- return id;
296
- }
297
-
298
- // check if it the sampled token fits the grammar
299
- {
300
- llama_token_data single_token_data = { id, 1.0f, 0.0f };
301
- llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
302
-
303
- llama_sampler_apply(grmr, &single_token_data_array);
304
-
305
- const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
306
- if (is_valid) {
307
- return id;
308
- }
309
- }
310
-
311
- // resampling:
312
- // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
313
- gsmpl->set_logits(ctx, idx);
314
-
315
- llama_sampler_apply(grmr, &cur_p);
316
- llama_sampler_apply(chain, &cur_p);
317
-
318
- LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
319
-
320
- return cur_p.data[cur_p.selected].id;
321
- }
322
-
323
- uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
324
- return llama_sampler_get_seed(gsmpl->chain);
325
- }
326
-
327
- // helpers
328
-
329
- llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
330
- return &gsmpl->cur_p;
331
- }
332
-
333
- llama_token common_sampler_last(const struct common_sampler * gsmpl) {
334
- return gsmpl->prev.rat(0);
335
- }
336
-
337
- std::string common_sampler_print(const struct common_sampler * gsmpl) {
338
- std::string result = "logits ";
339
-
340
- for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
341
- const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
342
- result += std::string("-> ") + llama_sampler_name(smpl) + " ";
343
- }
344
-
345
- return result;
346
- }
347
-
348
- std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
349
- n = std::min(n, (int) gsmpl->prev.size());
350
-
351
- if (n <= 0) {
352
- return "";
353
- }
354
-
355
- std::string result;
356
- result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
357
-
358
- for (int i = n - 1; i >= 0; i--) {
359
- const llama_token id = gsmpl->prev.rat(i);
360
-
361
- LM_GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
362
-
363
- result += common_token_to_piece(ctx_main, id);
364
- }
365
-
366
- return result;
367
- }
368
-
369
- char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
370
- switch (cnstr) {
371
- case COMMON_SAMPLER_TYPE_DRY: return 'd';
372
- case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
373
- case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
374
- case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
375
- case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
376
- case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
377
- case COMMON_SAMPLER_TYPE_XTC: return 'x';
378
- case COMMON_SAMPLER_TYPE_INFILL: return 'i';
379
- default : return '?';
380
- }
381
- }
382
-
383
- std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
384
- switch (cnstr) {
385
- case COMMON_SAMPLER_TYPE_DRY: return "dry";
386
- case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
387
- case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
388
- case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
389
- case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
390
- case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
391
- case COMMON_SAMPLER_TYPE_XTC: return "xtc";
392
- case COMMON_SAMPLER_TYPE_INFILL: return "infill";
393
- default : return "";
394
- }
395
- }
396
-
397
- std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
398
- std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
399
- { "dry", COMMON_SAMPLER_TYPE_DRY },
400
- { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
401
- { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
402
- { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
403
- { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
404
- { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
405
- { "xtc", COMMON_SAMPLER_TYPE_XTC },
406
- { "infill", COMMON_SAMPLER_TYPE_INFILL },
407
- };
408
-
409
- // since samplers names are written multiple ways
410
- // make it ready for both system names and input names
411
- std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
412
- { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
413
- { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
414
- { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
415
- { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
416
- { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
417
- { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
418
- { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
419
- { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
420
- { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
421
- };
422
-
423
- std::vector<common_sampler_type> samplers;
424
- samplers.reserve(names.size());
425
-
426
- for (const auto & name : names) {
427
- auto sampler = sampler_canonical_name_map.find(name);
428
- if (sampler != sampler_canonical_name_map.end()) {
429
- samplers.push_back(sampler->second);
430
- } else {
431
- if (allow_alt_names) {
432
- sampler = sampler_alt_name_map.find(name);
433
- if (sampler != sampler_alt_name_map.end()) {
434
- samplers.push_back(sampler->second);
435
- }
436
- }
437
- }
438
- }
439
-
440
- return samplers;
441
- }
442
-
443
- std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
444
- std::unordered_map<char, common_sampler_type> sampler_name_map = {
445
- { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
446
- { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
447
- { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
448
- { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
449
- { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
450
- { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
451
- { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
452
- { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
453
- };
454
-
455
- std::vector<common_sampler_type> samplers;
456
- samplers.reserve(chars.size());
457
-
458
- for (const auto & c : chars) {
459
- const auto sampler = sampler_name_map.find(c);
460
- if (sampler != sampler_name_map.end()) {
461
- samplers.push_back(sampler->second);
462
- }
463
- }
464
-
465
- return samplers;
466
- }
1
+ #include "sampling.h"
2
+
3
+ #include "common.h"
4
+
5
+ #include <cmath>
6
+ #include <unordered_map>
7
+
8
+ // the ring buffer works similarly to std::deque, but with a fixed capacity
9
+ // TODO: deduplicate with llama-impl.h
10
// Fixed-capacity circular buffer of T.
//
// Elements are appended with push_back(). Once `capacity` elements are stored,
// every further push_back() overwrites the oldest element. Accessors called on
// an empty buffer, and rat() with an out-of-range index, throw
// std::runtime_error.
template<typename T>
struct ring_buffer {
    // Allocates storage for `cap` default-constructed elements.
    ring_buffer(size_t cap) : capacity(cap), data(cap) {}

    // Oldest stored element.
    T & front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    const T & front() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[first];
    }

    // Most recently pushed element.
    // `pos` is the slot the NEXT push_back() will write, so the newest element
    // lives one slot before it (with wrap-around).
    T & back() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[last_index()];
    }

    const T & back() const {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        return data[last_index()];
    }

    // Appends `value`, dropping the oldest element when the buffer is full.
    // Throws std::runtime_error if the buffer was created with capacity 0
    // (the index arithmetic below would otherwise divide by zero).
    void push_back(const T & value) {
        if (capacity == 0) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }

        if (sz == capacity) {
            // advance the start when buffer is full
            first = (first + 1) % capacity;
        } else {
            sz++;
        }
        data[pos] = value;
        pos = (pos + 1) % capacity;
    }

    // Removes and returns the oldest element.
    T pop_front() {
        if (sz == 0) {
            throw std::runtime_error("ring buffer is empty");
        }
        T value = data[first];
        first = (first + 1) % capacity;
        sz--;
        return value;
    }

    // Reverse access: rat(0) is the newest element, rat(size() - 1) the oldest.
    const T & rat(size_t i) const {
        if (i >= sz) {
            throw std::runtime_error("ring buffer: index out of bounds");
        }
        return data[(first + sz - i - 1) % capacity];
    }

    // Copies the stored elements, oldest first, into a new vector.
    std::vector<T> to_vector() const {
        std::vector<T> result;
        result.reserve(sz);
        for (size_t i = 0; i < sz; i++) {
            result.push_back(data[(first + i) % capacity]);
        }
        return result;
    }

    void clear() {
        // here only reset the status of the buffer
        sz = 0;
        first = 0;
        pos = 0;
    }

    bool empty() const {
        return sz == 0;
    }

    size_t size() const {
        return sz;
    }

    // Index of the newest element. Only meaningful when sz > 0, which also
    // implies capacity > 0.
    size_t last_index() const {
        return (pos + capacity - 1) % capacity;
    }

    size_t capacity = 0; // maximum number of stored elements
    size_t sz = 0;       // current number of stored elements
    size_t first = 0;    // index of the oldest element
    size_t pos = 0;      // index the next push_back() writes to
    std::vector<T> data;
};
100
+
101
+ struct common_sampler {
102
+ common_params_sampling params;
103
+
104
+ struct llama_sampler * grmr;
105
+ struct llama_sampler * chain;
106
+
107
+ ring_buffer<llama_token> prev;
108
+
109
+ std::vector<llama_token_data> cur;
110
+
111
+ llama_token_data_array cur_p;
112
+
113
+ void set_logits(struct llama_context * ctx, int idx) {
114
+ const auto * logits = llama_get_logits_ith(ctx, idx);
115
+
116
+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));
117
+
118
+ cur.resize(n_vocab);
119
+
120
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
121
+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
122
+ }
123
+
124
+ cur_p = { cur.data(), cur.size(), -1, false };
125
+ }
126
+ };
127
+
128
+ std::string common_params_sampling::print() const {
129
+ char result[1024];
130
+
131
+ snprintf(result, sizeof(result),
132
+ "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
133
+ "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
134
+ "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
135
+ "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
136
+ penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
137
+ dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
138
+ top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
139
+ mirostat, mirostat_eta, mirostat_tau);
140
+
141
+ return std::string(result);
142
+ }
143
+
144
+ struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
145
+ llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
146
+
147
+ lparams.no_perf = params.no_perf;
148
+
149
+ auto * result = new common_sampler {
150
+ /* .params = */ params,
151
+ /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
152
+ /* .chain = */ llama_sampler_chain_init(lparams),
153
+ /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
154
+ /* .cur = */ {},
155
+ /* .cur_p = */ {},
156
+ };
157
+
158
+ llama_sampler_chain_add(result->chain,
159
+ llama_sampler_init_logit_bias(
160
+ llama_n_vocab(model),
161
+ params.logit_bias.size(),
162
+ params.logit_bias.data()));
163
+
164
+ llama_sampler_chain_add(result->chain,
165
+ llama_sampler_init_penalties(
166
+ llama_n_vocab (model),
167
+ llama_token_eos(model),
168
+ llama_token_nl (model),
169
+ params.penalty_last_n,
170
+ params.penalty_repeat,
171
+ params.penalty_freq,
172
+ params.penalty_present,
173
+ params.penalize_nl,
174
+ params.ignore_eos));
175
+
176
+ if (params.mirostat == 0) {
177
+ for (const auto & cnstr : params.samplers) {
178
+ switch (cnstr) {
179
+ case COMMON_SAMPLER_TYPE_DRY:
180
+ {
181
+ std::vector<const char*> c_breakers;
182
+ c_breakers.reserve(params.dry_sequence_breakers.size());
183
+ for (const auto& str : params.dry_sequence_breakers) {
184
+ c_breakers.push_back(str.c_str());
185
+ }
186
+
187
+ llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
188
+ }
189
+ break;
190
+ case COMMON_SAMPLER_TYPE_TOP_K:
191
+ llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
192
+ break;
193
+ case COMMON_SAMPLER_TYPE_TOP_P:
194
+ llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
195
+ break;
196
+ case COMMON_SAMPLER_TYPE_MIN_P:
197
+ llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
198
+ break;
199
+ case COMMON_SAMPLER_TYPE_XTC:
200
+ llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
201
+ break;
202
+ case COMMON_SAMPLER_TYPE_TYPICAL_P:
203
+ llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
204
+ break;
205
+ case COMMON_SAMPLER_TYPE_TEMPERATURE:
206
+ llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
207
+ break;
208
+ case COMMON_SAMPLER_TYPE_INFILL:
209
+ llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
210
+ break;
211
+ default:
212
+ LM_GGML_ASSERT(false && "unknown sampler type");
213
+ }
214
+ }
215
+ llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
216
+ } else if (params.mirostat == 1) {
217
+ llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
218
+ llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
219
+ } else if (params.mirostat == 2) {
220
+ llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
221
+ llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
222
+ } else {
223
+ LM_GGML_ASSERT(false && "unknown mirostat version");
224
+ }
225
+
226
+ return result;
227
+ }
228
+
229
+ void common_sampler_free(struct common_sampler * gsmpl) {
230
+ if (gsmpl) {
231
+ llama_sampler_free(gsmpl->grmr);
232
+
233
+ llama_sampler_free(gsmpl->chain);
234
+
235
+ delete gsmpl;
236
+ }
237
+ }
238
+
239
+ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
240
+ if (accept_grammar) {
241
+ llama_sampler_accept(gsmpl->grmr, token);
242
+ }
243
+
244
+ llama_sampler_accept(gsmpl->chain, token);
245
+
246
+ gsmpl->prev.push_back(token);
247
+ }
248
+
249
+ void common_sampler_reset(struct common_sampler * gsmpl) {
250
+ llama_sampler_reset(gsmpl->grmr);
251
+
252
+ llama_sampler_reset(gsmpl->chain);
253
+ }
254
+
255
+ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
256
+ return new common_sampler {
257
+ /* .params = */ gsmpl->params,
258
+ /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
259
+ /* .chain = */ llama_sampler_clone(gsmpl->chain),
260
+ /* .prev = */ gsmpl->prev,
261
+ /* .cur = */ gsmpl->cur,
262
+ /* .cur_p = */ gsmpl->cur_p,
263
+ };
264
+ }
265
+
266
+ void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
267
+ // TODO: measure grammar performance
268
+
269
+ if (gsmpl) {
270
+ llama_perf_sampler_print(gsmpl->chain);
271
+ }
272
+ if (ctx) {
273
+ llama_perf_context_print(ctx);
274
+ }
275
+ }
276
+
277
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
278
+ gsmpl->set_logits(ctx, idx);
279
+
280
+ auto & grmr = gsmpl->grmr;
281
+ auto & chain = gsmpl->chain;
282
+ auto & cur_p = gsmpl->cur_p; // initialized by set_logits
283
+
284
+ if (grammar_first) {
285
+ llama_sampler_apply(grmr, &cur_p);
286
+ }
287
+
288
+ llama_sampler_apply(chain, &cur_p);
289
+
290
+ LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
291
+
292
+ const llama_token id = cur_p.data[cur_p.selected].id;
293
+
294
+ if (grammar_first) {
295
+ return id;
296
+ }
297
+
298
+ // check if it the sampled token fits the grammar
299
+ {
300
+ llama_token_data single_token_data = { id, 1.0f, 0.0f };
301
+ llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
302
+
303
+ llama_sampler_apply(grmr, &single_token_data_array);
304
+
305
+ const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
306
+ if (is_valid) {
307
+ return id;
308
+ }
309
+ }
310
+
311
+ // resampling:
312
+ // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
313
+ gsmpl->set_logits(ctx, idx);
314
+
315
+ llama_sampler_apply(grmr, &cur_p);
316
+ llama_sampler_apply(chain, &cur_p);
317
+
318
+ LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
319
+
320
+ return cur_p.data[cur_p.selected].id;
321
+ }
322
+
323
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
324
+ LM_GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
325
+
326
+ std::vector<llama_token> result;
327
+ result.reserve(idxs.size());
328
+
329
+ size_t i = 0;
330
+ for (; i < draft.size(); i++) {
331
+ const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
332
+
333
+ common_sampler_accept(gsmpl, id, true);
334
+
335
+ result.push_back(id);
336
+
337
+ if (draft[i] != id) {
338
+ break;
339
+ }
340
+ }
341
+
342
+ if (i == draft.size()) {
343
+ const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
344
+
345
+ common_sampler_accept(gsmpl, id, true);
346
+
347
+ result.push_back(id);
348
+ }
349
+
350
+ return result;
351
+ }
352
+
353
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
354
+ std::vector<int> idxs(draft.size() + 1);
355
+ for (size_t i = 0; i < idxs.size(); ++i) {
356
+ idxs[i] = i;
357
+ }
358
+
359
+ return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
360
+ }
361
+
362
+ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
363
+ return llama_sampler_get_seed(gsmpl->chain);
364
+ }
365
+
366
+ // helpers
367
+
368
+ llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
369
+ return &gsmpl->cur_p;
370
+ }
371
+
372
+ llama_token common_sampler_last(const struct common_sampler * gsmpl) {
373
+ return gsmpl->prev.rat(0);
374
+ }
375
+
376
+ std::string common_sampler_print(const struct common_sampler * gsmpl) {
377
+ std::string result = "logits ";
378
+
379
+ for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
380
+ const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
381
+ result += std::string("-> ") + llama_sampler_name(smpl) + " ";
382
+ }
383
+
384
+ return result;
385
+ }
386
+
387
+ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
388
+ n = std::min(n, (int) gsmpl->prev.size());
389
+
390
+ if (n <= 0) {
391
+ return "";
392
+ }
393
+
394
+ std::string result;
395
+ result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
396
+
397
+ for (int i = n - 1; i >= 0; i--) {
398
+ const llama_token id = gsmpl->prev.rat(i);
399
+
400
+ LM_GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
401
+
402
+ result += common_token_to_piece(ctx_main, id);
403
+ }
404
+
405
+ return result;
406
+ }
407
+
408
+ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
409
+ switch (cnstr) {
410
+ case COMMON_SAMPLER_TYPE_DRY: return 'd';
411
+ case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
412
+ case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
413
+ case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
414
+ case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
415
+ case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
416
+ case COMMON_SAMPLER_TYPE_XTC: return 'x';
417
+ case COMMON_SAMPLER_TYPE_INFILL: return 'i';
418
+ default : return '?';
419
+ }
420
+ }
421
+
422
+ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
423
+ switch (cnstr) {
424
+ case COMMON_SAMPLER_TYPE_DRY: return "dry";
425
+ case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
426
+ case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
427
+ case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
428
+ case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
429
+ case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
430
+ case COMMON_SAMPLER_TYPE_XTC: return "xtc";
431
+ case COMMON_SAMPLER_TYPE_INFILL: return "infill";
432
+ default : return "";
433
+ }
434
+ }
435
+
436
+ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
437
+ std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
438
+ { "dry", COMMON_SAMPLER_TYPE_DRY },
439
+ { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
440
+ { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
441
+ { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
442
+ { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
443
+ { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
444
+ { "xtc", COMMON_SAMPLER_TYPE_XTC },
445
+ { "infill", COMMON_SAMPLER_TYPE_INFILL },
446
+ };
447
+
448
+ // since samplers names are written multiple ways
449
+ // make it ready for both system names and input names
450
+ std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
451
+ { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
452
+ { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
453
+ { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
454
+ { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
455
+ { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
456
+ { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
457
+ { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
458
+ { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
459
+ { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
460
+ };
461
+
462
+ std::vector<common_sampler_type> samplers;
463
+ samplers.reserve(names.size());
464
+
465
+ for (const auto & name : names) {
466
+ auto sampler = sampler_canonical_name_map.find(name);
467
+ if (sampler != sampler_canonical_name_map.end()) {
468
+ samplers.push_back(sampler->second);
469
+ } else {
470
+ if (allow_alt_names) {
471
+ sampler = sampler_alt_name_map.find(name);
472
+ if (sampler != sampler_alt_name_map.end()) {
473
+ samplers.push_back(sampler->second);
474
+ }
475
+ }
476
+ }
477
+ }
478
+
479
+ return samplers;
480
+ }
481
+
482
+ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
483
+ std::unordered_map<char, common_sampler_type> sampler_name_map = {
484
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
485
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
486
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
487
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
488
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
489
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
490
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
491
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
492
+ };
493
+
494
+ std::vector<common_sampler_type> samplers;
495
+ samplers.reserve(chars.size());
496
+
497
+ for (const auto & c : chars) {
498
+ const auto sampler = sampler_name_map.find(c);
499
+ if (sampler != sampler_name_map.end()) {
500
+ samplers.push_back(sampler->second);
501
+ }
502
+ }
503
+
504
+ return samplers;
505
+ }