llama-cpp-capacitor 0.0.6 → 0.0.7

This diff shows the content of publicly available package versions as published to their respective public registries, and is provided for informational purposes only.
Files changed (149)
  1. package/cpp/LICENSE +21 -0
  2. package/cpp/README.md +4 -0
  3. package/cpp/anyascii.c +22223 -0
  4. package/cpp/anyascii.h +42 -0
  5. package/cpp/chat-parser.cpp +393 -0
  6. package/cpp/chat-parser.h +120 -0
  7. package/cpp/chat.cpp +2315 -0
  8. package/cpp/chat.h +221 -0
  9. package/cpp/common.cpp +1619 -0
  10. package/cpp/common.h +744 -0
  11. package/cpp/ggml-alloc.c +1028 -0
  12. package/cpp/ggml-alloc.h +76 -0
  13. package/cpp/ggml-backend-impl.h +255 -0
  14. package/cpp/ggml-backend-reg.cpp +600 -0
  15. package/cpp/ggml-backend.cpp +2118 -0
  16. package/cpp/ggml-backend.h +354 -0
  17. package/cpp/ggml-common.h +1878 -0
  18. package/cpp/ggml-cpp.h +39 -0
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2512 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  25. package/cpp/ggml-cpu/arch/arm/quants.c +3650 -0
  26. package/cpp/ggml-cpu/arch/arm/repack.cpp +1891 -0
  27. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  28. package/cpp/ggml-cpu/arch/x86/quants.c +3820 -0
  29. package/cpp/ggml-cpu/arch/x86/repack.cpp +6307 -0
  30. package/cpp/ggml-cpu/arch-fallback.h +215 -0
  31. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  32. package/cpp/ggml-cpu/binary-ops.h +16 -0
  33. package/cpp/ggml-cpu/common.h +73 -0
  34. package/cpp/ggml-cpu/ggml-cpu-impl.h +525 -0
  35. package/cpp/ggml-cpu/ggml-cpu.c +3578 -0
  36. package/cpp/ggml-cpu/ggml-cpu.cpp +672 -0
  37. package/cpp/ggml-cpu/ops.cpp +10587 -0
  38. package/cpp/ggml-cpu/ops.h +114 -0
  39. package/cpp/ggml-cpu/quants.c +1193 -0
  40. package/cpp/ggml-cpu/quants.h +97 -0
  41. package/cpp/ggml-cpu/repack.cpp +1982 -0
  42. package/cpp/ggml-cpu/repack.h +120 -0
  43. package/cpp/ggml-cpu/simd-mappings.h +1184 -0
  44. package/cpp/ggml-cpu/traits.cpp +36 -0
  45. package/cpp/ggml-cpu/traits.h +38 -0
  46. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  47. package/cpp/ggml-cpu/unary-ops.h +28 -0
  48. package/cpp/ggml-cpu/vec.cpp +348 -0
  49. package/cpp/ggml-cpu/vec.h +1121 -0
  50. package/cpp/ggml-cpu.h +145 -0
  51. package/cpp/ggml-impl.h +622 -0
  52. package/cpp/ggml-metal-impl.h +688 -0
  53. package/cpp/ggml-metal.h +66 -0
  54. package/cpp/ggml-metal.m +6833 -0
  55. package/cpp/ggml-opt.cpp +1093 -0
  56. package/cpp/ggml-opt.h +256 -0
  57. package/cpp/ggml-quants.c +5324 -0
  58. package/cpp/ggml-quants.h +106 -0
  59. package/cpp/ggml-threading.cpp +12 -0
  60. package/cpp/ggml-threading.h +14 -0
  61. package/cpp/ggml.c +7108 -0
  62. package/cpp/ggml.h +2492 -0
  63. package/cpp/gguf.cpp +1358 -0
  64. package/cpp/gguf.h +202 -0
  65. package/cpp/json-partial.cpp +256 -0
  66. package/cpp/json-partial.h +38 -0
  67. package/cpp/json-schema-to-grammar.cpp +985 -0
  68. package/cpp/json-schema-to-grammar.h +21 -0
  69. package/cpp/llama-adapter.cpp +388 -0
  70. package/cpp/llama-adapter.h +76 -0
  71. package/cpp/llama-arch.cpp +2355 -0
  72. package/cpp/llama-arch.h +499 -0
  73. package/cpp/llama-batch.cpp +875 -0
  74. package/cpp/llama-batch.h +160 -0
  75. package/cpp/llama-chat.cpp +783 -0
  76. package/cpp/llama-chat.h +65 -0
  77. package/cpp/llama-context.cpp +2748 -0
  78. package/cpp/llama-context.h +306 -0
  79. package/cpp/llama-cparams.cpp +5 -0
  80. package/cpp/llama-cparams.h +41 -0
  81. package/cpp/llama-cpp.h +30 -0
  82. package/cpp/llama-grammar.cpp +1229 -0
  83. package/cpp/llama-grammar.h +173 -0
  84. package/cpp/llama-graph.cpp +1891 -0
  85. package/cpp/llama-graph.h +810 -0
  86. package/cpp/llama-hparams.cpp +180 -0
  87. package/cpp/llama-hparams.h +233 -0
  88. package/cpp/llama-impl.cpp +167 -0
  89. package/cpp/llama-impl.h +61 -0
  90. package/cpp/llama-io.cpp +15 -0
  91. package/cpp/llama-io.h +35 -0
  92. package/cpp/llama-kv-cache-iswa.cpp +318 -0
  93. package/cpp/llama-kv-cache-iswa.h +135 -0
  94. package/cpp/llama-kv-cache.cpp +2059 -0
  95. package/cpp/llama-kv-cache.h +374 -0
  96. package/cpp/llama-kv-cells.h +491 -0
  97. package/cpp/llama-memory-hybrid.cpp +258 -0
  98. package/cpp/llama-memory-hybrid.h +137 -0
  99. package/cpp/llama-memory-recurrent.cpp +1146 -0
  100. package/cpp/llama-memory-recurrent.h +179 -0
  101. package/cpp/llama-memory.cpp +59 -0
  102. package/cpp/llama-memory.h +119 -0
  103. package/cpp/llama-mmap.cpp +600 -0
  104. package/cpp/llama-mmap.h +68 -0
  105. package/cpp/llama-model-loader.cpp +1164 -0
  106. package/cpp/llama-model-loader.h +170 -0
  107. package/cpp/llama-model-saver.cpp +282 -0
  108. package/cpp/llama-model-saver.h +37 -0
  109. package/cpp/llama-model.cpp +19042 -0
  110. package/cpp/llama-model.h +491 -0
  111. package/cpp/llama-sampling.cpp +2575 -0
  112. package/cpp/llama-sampling.h +32 -0
  113. package/cpp/llama-vocab.cpp +3792 -0
  114. package/cpp/llama-vocab.h +176 -0
  115. package/cpp/llama.cpp +358 -0
  116. package/cpp/llama.h +1373 -0
  117. package/cpp/log.cpp +427 -0
  118. package/cpp/log.h +103 -0
  119. package/cpp/minja/chat-template.hpp +550 -0
  120. package/cpp/minja/minja.hpp +3009 -0
  121. package/cpp/nlohmann/json.hpp +25526 -0
  122. package/cpp/nlohmann/json_fwd.hpp +187 -0
  123. package/cpp/regex-partial.cpp +204 -0
  124. package/cpp/regex-partial.h +56 -0
  125. package/cpp/rn-completion.cpp +681 -0
  126. package/cpp/rn-completion.h +116 -0
  127. package/cpp/rn-llama.cpp +345 -0
  128. package/cpp/rn-llama.h +149 -0
  129. package/cpp/rn-mtmd.hpp +602 -0
  130. package/cpp/rn-tts.cpp +591 -0
  131. package/cpp/rn-tts.h +59 -0
  132. package/cpp/sampling.cpp +579 -0
  133. package/cpp/sampling.h +107 -0
  134. package/cpp/tools/mtmd/clip-impl.h +473 -0
  135. package/cpp/tools/mtmd/clip.cpp +4322 -0
  136. package/cpp/tools/mtmd/clip.h +106 -0
  137. package/cpp/tools/mtmd/miniaudio/miniaudio.h +93468 -0
  138. package/cpp/tools/mtmd/mtmd-audio.cpp +769 -0
  139. package/cpp/tools/mtmd/mtmd-audio.h +47 -0
  140. package/cpp/tools/mtmd/mtmd-helper.cpp +460 -0
  141. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  142. package/cpp/tools/mtmd/mtmd.cpp +1066 -0
  143. package/cpp/tools/mtmd/mtmd.h +298 -0
  144. package/cpp/tools/mtmd/stb/stb_image.h +7988 -0
  145. package/cpp/unicode-data.cpp +7034 -0
  146. package/cpp/unicode-data.h +20 -0
  147. package/cpp/unicode.cpp +1061 -0
  148. package/cpp/unicode.h +68 -0
  149. package/package.json +2 -1
package/cpp/sampling.cpp ADDED
@@ -0,0 +1,579 @@
+ #include "sampling.h"
+
+ #include "common.h"
+ #include "log.h"
+
+ #include <cmath>
+ #include <unordered_map>
+ #include <algorithm>
+
+ // the ring buffer works similarly to std::deque, but with a fixed capacity
+ // TODO: deduplicate with llama-impl.h
+ template<typename T>
+ struct ring_buffer {
+     ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+     T & front() {
+         if (sz == 0) {
+             throw std::runtime_error("ring buffer is empty");
+         }
+         return data[first];
+     }
+
+     const T & front() const {
+         if (sz == 0) {
+             throw std::runtime_error("ring buffer is empty");
+         }
+         return data[first];
+     }
+
+     T & back() {
+         if (sz == 0) {
+             throw std::runtime_error("ring buffer is empty");
+         }
+         return data[pos];
+     }
+
+     const T & back() const {
+         if (sz == 0) {
+             throw std::runtime_error("ring buffer is empty");
+         }
+         return data[pos];
+     }
+
+     void push_back(const T & value) {
+         if (sz == capacity) {
+             // advance the start when buffer is full
+             first = (first + 1) % capacity;
+         } else {
+             sz++;
+         }
+         data[pos] = value;
+         pos = (pos + 1) % capacity;
+     }
+
+     T pop_front() {
+         if (sz == 0) {
+             throw std::runtime_error("ring buffer is empty");
+         }
+         T value = data[first];
+         first = (first + 1) % capacity;
+         sz--;
+         return value;
+     }
+
+     const T & rat(size_t i) const {
+         if (i >= sz) {
+             throw std::runtime_error("ring buffer: index out of bounds");
+         }
+         return data[(first + sz - i - 1) % capacity];
+     }
+
+     std::vector<T> to_vector() const {
+         std::vector<T> result;
+         result.reserve(sz);
+         for (size_t i = 0; i < sz; i++) {
+             result.push_back(data[(first + i) % capacity]);
+         }
+         return result;
+     }
+
+     void clear() {
+         // here only reset the status of the buffer
+         sz = 0;
+         first = 0;
+         pos = 0;
+     }
+
+     bool empty() const {
+         return sz == 0;
+     }
+
+     size_t size() const {
+         return sz;
+     }
+
+     size_t capacity = 0;
+     size_t sz = 0;
+     size_t first = 0;
+     size_t pos = 0;
+     std::vector<T> data;
+ };
+
+ struct common_sampler {
+     common_params_sampling params;
+
+     struct llama_sampler * grmr;
+     struct llama_sampler * chain;
+
+     ring_buffer<llama_token> prev;
+
+     std::vector<llama_token_data> cur;
+
+     llama_token_data_array cur_p;
+
+     void set_logits(struct llama_context * ctx, int idx) {
+         const auto * logits = llama_get_logits_ith(ctx, idx);
+
+         const llama_model * model = llama_get_model(ctx);
+         const llama_vocab * vocab = llama_model_get_vocab(model);
+
+         const int n_vocab = llama_vocab_n_tokens(vocab);
+
+         cur.resize(n_vocab);
+
+         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+             cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+         }
+
+         cur_p = { cur.data(), cur.size(), -1, false };
+     }
+ };
+
+ std::string common_params_sampling::print() const {
+     char result[1024];
+
+     snprintf(result, sizeof(result),
+             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
+             "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
+             "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
+             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
+             penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
+             dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
+             top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
+             mirostat, mirostat_eta, mirostat_tau);
+
+     return std::string(result);
+ }
+
+ struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+     const llama_vocab * vocab = llama_model_get_vocab(model);
+
+     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
+
+     lparams.no_perf = params.no_perf;
+
+     struct llama_sampler * grmr;
+     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+ #ifdef LLAMA_USE_LLGUIDANCE
+         grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+ #else
+         LM_GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+ #endif // LLAMA_USE_LLGUIDANCE
+     } else {
+         std::vector<std::string> trigger_patterns;
+         std::vector<std::string> patterns_anywhere;
+         std::vector<llama_token> trigger_tokens;
+         for (const auto & trigger : params.grammar_triggers) {
+             switch (trigger.type) {
+                 case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                 {
+                     const auto & word = trigger.value;
+                     patterns_anywhere.push_back(regex_escape(word));
+                     break;
+                 }
+                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                 {
+                     patterns_anywhere.push_back(trigger.value);
+                     break;
+                 }
+                 case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
+                 {
+                     trigger_patterns.push_back(trigger.value);
+                     break;
+                 }
+                 case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                 {
+                     const auto token = trigger.token;
+                     trigger_tokens.push_back(token);
+                     break;
+                 }
+                 default:
+                     LM_GGML_ASSERT(false && "unknown trigger type");
+             }
+         }
+
+         if (!patterns_anywhere.empty()) {
+             trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+         }
+
+         std::vector<const char *> trigger_patterns_c;
+         trigger_patterns_c.reserve(trigger_patterns.size());
+         for (const auto & regex : trigger_patterns) {
+             trigger_patterns_c.push_back(regex.c_str());
+         }
+
+         grmr = params.grammar_lazy
+              ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                     trigger_patterns_c.data(), trigger_patterns_c.size(),
+                     trigger_tokens.data(), trigger_tokens.size())
+              : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+         if (!grmr) {
+             return nullptr;
+         }
+     }
+
+     auto * result = new common_sampler {
+         /* .params = */ params,
+         /* .grmr   = */ grmr,
+         /* .chain  = */ llama_sampler_chain_init(lparams),
+         /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+         /* .cur    = */ {},
+         /* .cur_p  = */ {},
+     };
+
+     llama_sampler_chain_add(result->chain,
+             llama_sampler_init_logit_bias(
+                 llama_vocab_n_tokens(vocab),
+                 params.logit_bias.size(),
+                 params.logit_bias.data()));
+
+     if (params.mirostat == 0) {
+         for (const auto & cnstr : params.samplers) {
+             switch (cnstr) {
+                 case COMMON_SAMPLER_TYPE_DRY:
+                     {
+                         std::vector<const char *> c_breakers;
+                         c_breakers.reserve(params.dry_sequence_breakers.size());
+                         for (const auto & str : params.dry_sequence_breakers) {
+                             c_breakers.push_back(str.c_str());
+                         }
+
+                         llama_sampler_chain_add(result->chain, llama_sampler_init_dry(vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                     }
+                     break;
+                 case COMMON_SAMPLER_TYPE_TOP_K:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.top_k));
+                     break;
+                 case COMMON_SAMPLER_TYPE_TOP_P:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_p(params.top_p, params.min_keep));
+                     break;
+                 case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma(params.top_n_sigma));
+                     break;
+                 case COMMON_SAMPLER_TYPE_MIN_P:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_min_p(params.min_p, params.min_keep));
+                     break;
+                 case COMMON_SAMPLER_TYPE_XTC:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_xtc(params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                     break;
+                 case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_typical(params.typ_p, params.min_keep));
+                     break;
+                 case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext(params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                     break;
+                 case COMMON_SAMPLER_TYPE_INFILL:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_infill(vocab));
+                     break;
+                 case COMMON_SAMPLER_TYPE_PENALTIES:
+                     llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                     break;
+                 default:
+                     LM_GGML_ASSERT(false && "unknown sampler type");
+             }
+         }
+         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+     } else if (params.mirostat == 1) {
+         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+         llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+     } else if (params.mirostat == 2) {
+         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+         llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+     } else {
+         LM_GGML_ASSERT(false && "unknown mirostat version");
+     }
+
+     return result;
+ }
+
+ void common_sampler_free(struct common_sampler * gsmpl) {
+     if (gsmpl) {
+         llama_sampler_free(gsmpl->grmr);
+
+         llama_sampler_free(gsmpl->chain);
+
+         delete gsmpl;
+     }
+ }
+
+ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+     if (accept_grammar) {
+         llama_sampler_accept(gsmpl->grmr, token);
+     }
+
+     llama_sampler_accept(gsmpl->chain, token);
+
+     gsmpl->prev.push_back(token);
+ }
+
+ void common_sampler_reset(struct common_sampler * gsmpl) {
+     llama_sampler_reset(gsmpl->grmr);
+
+     llama_sampler_reset(gsmpl->chain);
+ }
+
+ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
+     return new common_sampler {
+         /* .params = */ gsmpl->params,
+         /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
+         /* .chain  = */ llama_sampler_clone(gsmpl->chain),
+         /* .prev   = */ gsmpl->prev,
+         /* .cur    = */ gsmpl->cur,
+         /* .cur_p  = */ gsmpl->cur_p,
+     };
+ }
+
+ void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
+     // TODO: measure grammar performance
+
+     if (gsmpl) {
+         llama_perf_sampler_print(gsmpl->chain);
+     }
+     if (ctx) {
+         llama_perf_context_print(ctx);
+     }
+ }
+
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+     gsmpl->set_logits(ctx, idx);
+
+     auto & grmr  = gsmpl->grmr;
+     auto & chain = gsmpl->chain;
+     auto & cur_p = gsmpl->cur_p; // initialized by set_logits
+
+     if (grammar_first) {
+         llama_sampler_apply(grmr, &cur_p);
+     }
+
+     llama_sampler_apply(chain, &cur_p);
+
+     LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
+
+     const llama_token id = cur_p.data[cur_p.selected].id;
+
+     if (grammar_first) {
+         return id;
+     }
+
+     // check if the sampled token fits the grammar
+     {
+         llama_token_data single_token_data = { id, 1.0f, 0.0f };
+         llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+
+         llama_sampler_apply(grmr, &single_token_data_array);
+
+         const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+         if (is_valid) {
+             return id;
+         }
+     }
+
+     // resampling:
+     // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
+     gsmpl->set_logits(ctx, idx);
+
+     llama_sampler_apply(grmr,  &cur_p);
+     llama_sampler_apply(chain, &cur_p);
+
+     LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+
+     return cur_p.data[cur_p.selected].id;
+ }
+
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+     LM_GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+     std::vector<llama_token> result;
+     result.reserve(idxs.size());
+
+     size_t i = 0;
+     for (; i < draft.size(); i++) {
+         const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+         common_sampler_accept(gsmpl, id, true);
+
+         result.push_back(id);
+
+         if (draft[i] != id) {
+             break;
+         }
+     }
+
+     if (i == draft.size()) {
+         const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+         common_sampler_accept(gsmpl, id, true);
+
+         result.push_back(id);
+     }
+
+     return result;
+ }
+
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+     std::vector<int> idxs(draft.size() + 1);
+     for (size_t i = 0; i < idxs.size(); ++i) {
+         idxs[i] = i;
+     }
+
+     return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+ }
+
+ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
+     return llama_sampler_get_seed(gsmpl->chain);
+ }
+
+ // helpers
+
+ llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
+     return &gsmpl->cur_p;
+ }
+
+ llama_token common_sampler_last(const struct common_sampler * gsmpl) {
+     return gsmpl->prev.rat(0);
+ }
+
+ std::string common_sampler_print(const struct common_sampler * gsmpl) {
+     std::string result = "logits ";
+
+     for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
+         const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+         result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+     }
+
+     return result;
+ }
+
+ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
+     n = std::min(n, (int) gsmpl->prev.size());
+
+     if (n <= 0) {
+         return "";
+     }
+
+     std::string result;
+     result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
+
+     for (int i = n - 1; i >= 0; i--) {
+         const llama_token id = gsmpl->prev.rat(i);
+
+         LM_GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
+
+         result += common_token_to_piece(ctx_main, id);
+     }
+
+     return result;
+ }
+
+ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
+     switch (cnstr) {
+         case COMMON_SAMPLER_TYPE_DRY:         return 'd';
+         case COMMON_SAMPLER_TYPE_TOP_K:       return 'k';
+         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+         case COMMON_SAMPLER_TYPE_TOP_P:       return 'p';
+         case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
+         case COMMON_SAMPLER_TYPE_MIN_P:       return 'm';
+         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
+         case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
+         case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
+         default: return '?';
+     }
+ }
+
+ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
+     switch (cnstr) {
+         case COMMON_SAMPLER_TYPE_DRY:         return "dry";
+         case COMMON_SAMPLER_TYPE_TOP_K:       return "top_k";
+         case COMMON_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+         case COMMON_SAMPLER_TYPE_TOP_P:       return "top_p";
+         case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
+         case COMMON_SAMPLER_TYPE_MIN_P:       return "min_p";
+         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
+         case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
+         case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
+         default: return "";
+     }
+ }
+
+ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+     std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+         { "dry",         COMMON_SAMPLER_TYPE_DRY },
+         { "top_k",       COMMON_SAMPLER_TYPE_TOP_K },
+         { "top_p",       COMMON_SAMPLER_TYPE_TOP_P },
+         { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
+         { "typ_p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+         { "min_p",       COMMON_SAMPLER_TYPE_MIN_P },
+         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+         { "xtc",         COMMON_SAMPLER_TYPE_XTC },
+         { "infill",      COMMON_SAMPLER_TYPE_INFILL },
+         { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
+     };
+
+     // since sampler names are written in multiple ways,
+     // accept both the canonical system names and the alternate input names
+     std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+         { "top-k",       COMMON_SAMPLER_TYPE_TOP_K },
+         { "top-p",       COMMON_SAMPLER_TYPE_TOP_P },
+         { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
+         { "nucleus",     COMMON_SAMPLER_TYPE_TOP_P },
+         { "typical-p",   COMMON_SAMPLER_TYPE_TYPICAL_P },
+         { "typical",     COMMON_SAMPLER_TYPE_TYPICAL_P },
+         { "typ-p",       COMMON_SAMPLER_TYPE_TYPICAL_P },
+         { "typ",         COMMON_SAMPLER_TYPE_TYPICAL_P },
+         { "min-p",       COMMON_SAMPLER_TYPE_MIN_P },
+         { "temp",        COMMON_SAMPLER_TYPE_TEMPERATURE },
+     };
+
+     std::vector<common_sampler_type> samplers;
+     samplers.reserve(names.size());
+
+     for (const auto & name : names) {
+         auto sampler = sampler_canonical_name_map.find(name);
+         if (sampler != sampler_canonical_name_map.end()) {
+             samplers.push_back(sampler->second);
+             continue;
+         }
+         if (allow_alt_names) {
+             sampler = sampler_alt_name_map.find(name);
+             if (sampler != sampler_alt_name_map.end()) {
+                 samplers.push_back(sampler->second);
+                 continue;
+             }
+         }
+         LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
+     }
+
+     return samplers;
+ }
+
+ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
+     std::unordered_map<char, common_sampler_type> sampler_name_map = {
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY),         COMMON_SAMPLER_TYPE_DRY },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K),       COMMON_SAMPLER_TYPE_TOP_K },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P),   COMMON_SAMPLER_TYPE_TYPICAL_P },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P),       COMMON_SAMPLER_TYPE_TOP_P },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P),       COMMON_SAMPLER_TYPE_MIN_P },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
+         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
+     };
+
+     std::vector<common_sampler_type> samplers;
+     samplers.reserve(chars.size());
+
+     for (const auto & c : chars) {
+         const auto sampler = sampler_name_map.find(c);
+         if (sampler != sampler_name_map.end()) {
+             samplers.push_back(sampler->second);
+         } else {
+             LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
+         }
+     }
+
+     return samplers;
+ }
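
For orientation, here is a minimal usage sketch of the sampling API added above. It is illustrative only and not part of the package sources: it assumes a model and ctx already created through the standard llama.h loading functions, an n_predict token budget defined by the caller, and a decode loop (llama_decode) that refreshes the logits between samples.

    // hypothetical example - not shipped in this package
    common_params_sampling sparams;
    sparams.temp  = 0.8f; // fields as shown in common_params_sampling::print() above
    sparams.top_k = 40;

    common_sampler * smpl = common_sampler_init(model, sparams);

    for (int i = 0; i < n_predict; i++) {
        // sample from the logits of the last decoded position (idx = -1)
        const llama_token id = common_sampler_sample(smpl, ctx, -1);

        // record the token in the sampler history and the grammar state
        common_sampler_accept(smpl, id, /* accept_grammar = */ true);

        if (llama_vocab_is_eog(llama_model_get_vocab(model), id)) {
            break; // end-of-generation token
        }

        // ... decode id with llama_decode() to produce the next logits ...
    }

    common_sampler_free(smpl);
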
package/cpp/sampling.h ADDED
@@ -0,0 +1,107 @@
+ #pragma once
+
+ #include "llama.h"
+
+ #include "common.h"
+
+ #include <string>
+ #include <vector>
+
+ // common_sampler extends llama_sampler with additional functionality:
+ //
+ // - grammar support
+ // - custom sampler logic based on the parameters
+ // - history of the last accepted tokens
+ // - performance metrics
+ //
+ // The goal is to have a common implementation of the sampling logic shared across the examples.
+ // For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
+ // complex (top-k, top-p, etc).
+ //
+ // Another example is related to the grammar. In general, the grammar constraints applied on the full
+ // vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
+ // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
+ // grammar constraints are applied to the full vocabulary and the token is resampled.
+ //
+ // The common_sampler also maintains a container with the last accepted tokens. In the future, this can
+ // be moved into the core llama library.
+ //
+ // For convenience, the common_sampler also maintains a container with the current candidate tokens.
+ // This can be used to access the probabilities of the rest of the non-sampled tokens.
+ //
+ // TODO: measure grammar performance
+ //
+
+ struct common_sampler;
+
+ // llama_sampler API overloads
+
+ struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+
+ void common_sampler_free(struct common_sampler * gsmpl);
+
+ // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
+ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+ void common_sampler_reset (struct common_sampler * gsmpl);
+ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
+
+ // arguments can be nullptr to skip printing
+ void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
+
+ // extended sampling implementation:
+ //
+ // - set logits
+ // - apply the configured sampler chain
+ // - check if the token fits the grammar (if any)
+ // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
+ //
+ // if grammar_first is true, the grammar is applied before the samplers (slower)
+ // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+ //
+ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+
+ // generalized version of common_sampler_sample
+ //
+ // will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+ // if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+ //
+ //     common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+ //
+ // is equivalent to
+ //
+ //     common_sampler_sample(gsmpl, ctx, idx);
+ //     common_sampler_accept(gsmpl, token, true);
+ //
+ // requires: idxs.size() == draft.size() + 1
+ //
+ // returns at least 1 token, up to idxs.size()
+ //
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+
+ // assume idxs == [ 0, 1, 2, ..., draft.size() ]
+ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+
+ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
+
+ // helpers
+
+ // access the internal list of current candidate tokens
+ llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+
+ // get the last accepted token
+ llama_token common_sampler_last(const struct common_sampler * gsmpl);
+
+ // print the sampler chain into a string
+ std::string common_sampler_print(const struct common_sampler * gsmpl);
+
+ // get a string representation of the last accepted tokens
+ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+
+ char common_sampler_type_to_chr(enum common_sampler_type cnstr);
+ std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
+
+ std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+
+ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+     const char * grammar_kind, const char * grammar_data);
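
A matching sketch of the speculative-decoding overload declared above, again illustrative and not part of the package: llama_tokens is the std::vector<llama_token> alias from common.h, smpl and ctx are as in the previous sketch, get_draft_tokens() is a hypothetical helper standing in for a smaller draft model, and ctx is assumed to have already decoded the draft so that logits exist for positions 0..draft.size().

    // hypothetical example - not shipped in this package
    llama_tokens draft = get_draft_tokens(); // tokens proposed by a draft model

    // samples at indices 0..draft.size(), accepting matches and stopping at the
    // first position where the target sampler disagrees with the draft
    std::vector<llama_token> accepted = common_sampler_sample_and_accept_n(smpl, ctx, draft);

    // at least one token is always returned; all but the last one matched the draft
    const size_t n_matched = accepted.size() - 1;
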