cui-llama.rn 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/README.md +4 -23
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +13 -7
  4. package/android/src/main/java/com/rnllama/LlamaContext.java +27 -20
  5. package/android/src/main/java/com/rnllama/RNLlama.java +5 -1
  6. package/android/src/main/jni.cpp +15 -12
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  15. package/cpp/README.md +1 -1
  16. package/cpp/common.cpp +158 -267
  17. package/cpp/common.h +46 -12
  18. package/cpp/ggml-alloc.c +1042 -1037
  19. package/cpp/ggml-backend-impl.h +255 -256
  20. package/cpp/ggml-backend-reg.cpp +582 -582
  21. package/cpp/ggml-backend.cpp +2002 -2002
  22. package/cpp/ggml-backend.h +354 -352
  23. package/cpp/ggml-common.h +1853 -1853
  24. package/cpp/ggml-cpp.h +39 -39
  25. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  26. package/cpp/ggml-cpu-aarch64.h +8 -8
  27. package/cpp/ggml-cpu-impl.h +386 -386
  28. package/cpp/ggml-cpu-quants.c +10920 -10839
  29. package/cpp/ggml-cpu-traits.cpp +36 -36
  30. package/cpp/ggml-cpu-traits.h +38 -38
  31. package/cpp/ggml-cpu.c +329 -60
  32. package/cpp/ggml-cpu.cpp +10 -2
  33. package/cpp/ggml-cpu.h +135 -135
  34. package/cpp/ggml-impl.h +567 -567
  35. package/cpp/ggml-metal-impl.h +17 -17
  36. package/cpp/ggml-metal.m +4884 -4884
  37. package/cpp/ggml-quants.c +5238 -5238
  38. package/cpp/ggml-threading.h +14 -14
  39. package/cpp/ggml.c +6514 -6448
  40. package/cpp/ggml.h +2194 -2163
  41. package/cpp/gguf.cpp +1329 -1325
  42. package/cpp/gguf.h +202 -202
  43. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  44. package/cpp/json-schema-to-grammar.h +8 -8
  45. package/cpp/json.hpp +24766 -24766
  46. package/cpp/llama-adapter.cpp +347 -346
  47. package/cpp/llama-adapter.h +74 -73
  48. package/cpp/llama-arch.cpp +1487 -1434
  49. package/cpp/llama-arch.h +400 -395
  50. package/cpp/llama-batch.cpp +368 -368
  51. package/cpp/llama-batch.h +88 -88
  52. package/cpp/llama-chat.cpp +578 -567
  53. package/cpp/llama-chat.h +52 -51
  54. package/cpp/llama-context.cpp +1775 -1771
  55. package/cpp/llama-context.h +128 -128
  56. package/cpp/llama-cparams.cpp +1 -1
  57. package/cpp/llama-cparams.h +37 -37
  58. package/cpp/llama-cpp.h +30 -30
  59. package/cpp/llama-grammar.cpp +1139 -1139
  60. package/cpp/llama-grammar.h +143 -143
  61. package/cpp/llama-hparams.cpp +71 -71
  62. package/cpp/llama-hparams.h +139 -140
  63. package/cpp/llama-impl.cpp +167 -167
  64. package/cpp/llama-impl.h +61 -61
  65. package/cpp/llama-kv-cache.cpp +718 -718
  66. package/cpp/llama-kv-cache.h +218 -218
  67. package/cpp/llama-mmap.cpp +2 -1
  68. package/cpp/llama-mmap.h +67 -67
  69. package/cpp/llama-model-loader.cpp +1124 -1011
  70. package/cpp/llama-model-loader.h +167 -158
  71. package/cpp/llama-model.cpp +3997 -2202
  72. package/cpp/llama-model.h +370 -391
  73. package/cpp/llama-sampling.cpp +2408 -2406
  74. package/cpp/llama-sampling.h +32 -48
  75. package/cpp/llama-vocab.cpp +3247 -1982
  76. package/cpp/llama-vocab.h +125 -182
  77. package/cpp/llama.cpp +416 -2886
  78. package/cpp/llama.h +1323 -1285
  79. package/cpp/log.cpp +401 -401
  80. package/cpp/log.h +121 -121
  81. package/cpp/rn-llama.cpp +822 -0
  82. package/cpp/rn-llama.h +123 -0
  83. package/cpp/rn-llama.hpp +18 -12
  84. package/cpp/sampling.cpp +505 -500
  85. package/cpp/sgemm.cpp +2597 -2597
  86. package/cpp/speculative.cpp +277 -274
  87. package/cpp/speculative.h +28 -28
  88. package/cpp/unicode.cpp +2 -3
  89. package/ios/CMakeLists.txt +99 -0
  90. package/ios/RNLlama.h +5 -1
  91. package/ios/RNLlama.mm +2 -2
  92. package/ios/RNLlamaContext.h +8 -1
  93. package/ios/RNLlamaContext.mm +15 -11
  94. package/ios/rnllama.xcframework/Info.plist +74 -0
  95. package/jest/mock.js +3 -2
  96. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  97. package/lib/commonjs/index.js +4 -2
  98. package/lib/commonjs/index.js.map +1 -1
  99. package/lib/module/NativeRNLlama.js.map +1 -1
  100. package/lib/module/index.js +4 -2
  101. package/lib/module/index.js.map +1 -1
  102. package/lib/typescript/NativeRNLlama.d.ts +5 -1
  103. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  104. package/lib/typescript/index.d.ts.map +1 -1
  105. package/llama-rn.podspec +8 -2
  106. package/package.json +5 -2
  107. package/src/NativeRNLlama.ts +5 -1
  108. package/src/index.ts +9 -2
@@ -1,274 +1,277 @@
1
- #include "speculative.h"
2
-
3
- #include "log.h"
4
- #include "common.h"
5
- #include "sampling.h"
6
-
7
- #include <cstring>
8
-
9
- #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
10
- #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
11
-
12
- struct common_speculative {
13
- struct llama_context * ctx;
14
- struct common_sampler * smpl;
15
-
16
- llama_batch batch;
17
- llama_tokens prompt;
18
- };
19
-
20
- struct common_speculative * common_speculative_init(
21
- struct llama_context * ctx_dft) {
22
- auto * result = new common_speculative {
23
- /* .ctx = */ ctx_dft,
24
- /* .smpl = */ nullptr,
25
- /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
26
- /* .prompt = */ {},
27
- };
28
-
29
- // TODO: optimize or pass from outside?
30
- #if 0
31
- {
32
- common_params_sampling params;
33
- params.no_perf = false;
34
-
35
- params.top_k = 40;
36
- params.top_p = 0.9;
37
-
38
- params.samplers = {
39
- COMMON_SAMPLER_TYPE_TOP_K,
40
- COMMON_SAMPLER_TYPE_TOP_P,
41
- COMMON_SAMPLER_TYPE_INFILL,
42
- };
43
-
44
- result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
45
- }
46
- #else
47
- {
48
- common_params_sampling params;
49
- params.no_perf = false;
50
-
51
- params.top_k = 10;
52
-
53
- params.samplers = {
54
- COMMON_SAMPLER_TYPE_TOP_K,
55
- };
56
-
57
- result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
58
- }
59
- #endif
60
-
61
- return result;
62
- }
63
-
64
- void common_speculative_free(struct common_speculative * spec) {
65
- if (spec == nullptr) {
66
- return;
67
- }
68
-
69
- common_sampler_free(spec->smpl);
70
-
71
- llama_batch_free(spec->batch);
72
-
73
- delete spec;
74
- }
75
-
76
- bool common_speculative_are_compatible(
77
- const struct llama_context * ctx_tgt,
78
- const struct llama_context * ctx_dft) {
79
- const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
80
- const struct llama_model * model_dft = llama_get_model(ctx_dft);
81
-
82
- const bool vocab_type_tgt = llama_vocab_type(model_tgt);
83
- LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
84
-
85
- const bool vocab_type_dft = llama_vocab_type(model_dft);
86
- LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
87
-
88
- if (vocab_type_tgt != vocab_type_dft) {
89
- LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
90
- "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
91
- return false;
92
- }
93
-
94
- if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
95
- llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
96
- llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
97
- llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
98
- LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
99
- LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
100
- LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
101
- return false;
102
- }
103
-
104
- {
105
- const int n_vocab_tgt = llama_n_vocab(model_tgt);
106
- const int n_vocab_dft = llama_n_vocab(model_dft);
107
-
108
- const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
109
-
110
- if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
111
- LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
112
- "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
113
- __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
114
- return false;
115
- }
116
-
117
- for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
118
- const char * token_text_tgt = llama_token_get_text(model_tgt, i);
119
- const char * token_text_dft = llama_token_get_text(model_dft, i);
120
- if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
121
- LOG_ERR("%s: draft model vocab must match target model to use speculation but "
122
- "token %d content differs - target '%s', draft '%s'\n", __func__, i,
123
- common_token_to_piece(ctx_tgt, i).c_str(),
124
- common_token_to_piece(ctx_dft, i).c_str());
125
- return false;
126
- }
127
- }
128
- }
129
-
130
- return true;
131
- }
132
-
133
- llama_tokens common_speculative_gen_draft(
134
- struct common_speculative * spec,
135
- struct common_speculative_params params,
136
- const llama_tokens & prompt_tgt,
137
- llama_token id_last) {
138
- auto & batch = spec->batch;
139
- auto & ctx = spec->ctx;
140
- auto & smpl = spec->smpl;
141
- auto & prompt = spec->prompt;
142
-
143
- int reuse_i = 0;
144
- int reuse_n = 0;
145
-
146
- const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
147
-
148
- const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
149
-
150
- // reuse as much as possible from the old draft context
151
- // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
152
- for (int i = 0; i < (int) prompt.size(); ++i) {
153
- int cur = 0;
154
- while (i_start + cur < (int) prompt_tgt.size() &&
155
- i + cur < (int) prompt.size() &&
156
- prompt_tgt[i_start + cur] == prompt[i + cur]) {
157
- cur++;
158
- }
159
-
160
- if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
161
- reuse_i = i;
162
- reuse_n = cur;
163
- }
164
- }
165
-
166
- LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
167
-
168
- llama_tokens result;
169
- result.reserve(params.n_draft);
170
-
171
- if (reuse_n == 0) {
172
- llama_kv_cache_clear(ctx);
173
-
174
- prompt.clear();
175
- } else {
176
- // this happens when a previous draft has been discarded (for example, due to being too small), but the
177
- // target model agreed with it. in this case, we simply pass back the previous results to save compute
178
- if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
179
- for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
180
- result.push_back(prompt[i]);
181
-
182
- if (params.n_draft <= (int) result.size()) {
183
- break;
184
- }
185
- }
186
-
187
- return result;
188
- }
189
-
190
- if (reuse_i > 0) {
191
- llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
192
- llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
193
-
194
- prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
195
- }
196
-
197
- if (reuse_n < (int) prompt.size()) {
198
- llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
199
-
200
- prompt.erase(prompt.begin() + reuse_n, prompt.end());
201
- }
202
- }
203
-
204
- // prepare a batch to evaluate any new tokens in the prompt
205
- common_batch_clear(batch);
206
-
207
- for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
208
- //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
209
- common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
210
-
211
- prompt.push_back(prompt_tgt[i]);
212
- }
213
-
214
- // we should rarely end-up here during normal decoding
215
- if (batch.n_tokens > 0) {
216
- //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
217
-
218
- llama_decode(ctx, batch);
219
- }
220
-
221
- const llama_pos n_past = prompt.size();
222
-
223
- LOG_DBG("%s: n_past = %d\n", __func__, n_past);
224
-
225
- common_batch_clear(batch);
226
- common_batch_add (batch, id_last, n_past, { 0 }, true);
227
-
228
- prompt.push_back(id_last);
229
-
230
- //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
231
-
232
- llama_decode(ctx, batch);
233
-
234
- common_sampler_reset(smpl);
235
-
236
- // sample n_draft tokens from the draft model
237
- for (int i = 0; i < params.n_draft; ++i) {
238
- common_batch_clear(batch);
239
-
240
- common_sampler_sample(smpl, ctx, 0, true);
241
-
242
- const auto * cur_p = common_sampler_get_candidates(smpl);
243
-
244
- for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
245
- LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
246
- k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
247
- }
248
-
249
- // add drafted token for each sequence
250
- const llama_token id = cur_p->data[0].id;
251
-
252
- // only collect very high-confidence draft tokens
253
- if (cur_p->data[0].p < params.p_min) {
254
- break;
255
- }
256
-
257
- common_sampler_accept(smpl, id, true);
258
-
259
- result.push_back(id);
260
-
261
- if (params.n_draft <= (int) result.size()) {
262
- break;
263
- }
264
-
265
- common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
266
-
267
- // evaluate the drafted tokens on the draft model
268
- llama_decode(ctx, batch);
269
-
270
- prompt.push_back(id);
271
- }
272
-
273
- return result;
274
- }
1
+ #include "speculative.h"
2
+
3
+ #include "log.h"
4
+ #include "common.h"
5
+ #include "sampling.h"
6
+
7
+ #include <cstring>
8
+
9
+ #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
10
+ #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
11
+
12
+ struct common_speculative {
13
+ struct llama_context * ctx;
14
+ struct common_sampler * smpl;
15
+
16
+ llama_batch batch;
17
+ llama_tokens prompt;
18
+ };
19
+
20
+ struct common_speculative * common_speculative_init(
21
+ struct llama_context * ctx_dft) {
22
+ auto * result = new common_speculative {
23
+ /* .ctx = */ ctx_dft,
24
+ /* .smpl = */ nullptr,
25
+ /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
26
+ /* .prompt = */ {},
27
+ };
28
+
29
+ // TODO: optimize or pass from outside?
30
+ #if 0
31
+ {
32
+ common_params_sampling params;
33
+ params.no_perf = false;
34
+
35
+ params.top_k = 40;
36
+ params.top_p = 0.9;
37
+
38
+ params.samplers = {
39
+ COMMON_SAMPLER_TYPE_TOP_K,
40
+ COMMON_SAMPLER_TYPE_TOP_P,
41
+ COMMON_SAMPLER_TYPE_INFILL,
42
+ };
43
+
44
+ result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
45
+ }
46
+ #else
47
+ {
48
+ common_params_sampling params;
49
+ params.no_perf = false;
50
+
51
+ params.top_k = 10;
52
+
53
+ params.samplers = {
54
+ COMMON_SAMPLER_TYPE_TOP_K,
55
+ };
56
+
57
+ result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
58
+ }
59
+ #endif
60
+
61
+ return result;
62
+ }
63
+
64
+ void common_speculative_free(struct common_speculative * spec) {
65
+ if (spec == nullptr) {
66
+ return;
67
+ }
68
+
69
+ common_sampler_free(spec->smpl);
70
+
71
+ llama_batch_free(spec->batch);
72
+
73
+ delete spec;
74
+ }
75
+
76
+ bool common_speculative_are_compatible(
77
+ const struct llama_context * ctx_tgt,
78
+ const struct llama_context * ctx_dft) {
79
+ const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
80
+ const struct llama_model * model_dft = llama_get_model(ctx_dft);
81
+
82
+ const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
83
+ const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
84
+
85
+ const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
86
+ LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
87
+
88
+ const bool vocab_type_dft = llama_vocab_type(vocab_dft);
89
+ LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
90
+
91
+ if (vocab_type_tgt != vocab_type_dft) {
92
+ LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
93
+ "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
94
+ return false;
95
+ }
96
+
97
+ if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
98
+ llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
99
+ llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
100
+ llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
101
+ LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
102
+ LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
103
+ LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
104
+ return false;
105
+ }
106
+
107
+ {
108
+ const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
109
+ const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
110
+
111
+ const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
112
+
113
+ if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
114
+ LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
115
+ "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
116
+ __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
117
+ return false;
118
+ }
119
+
120
+ for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
121
+ const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
122
+ const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
123
+ if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
124
+ LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
125
+ "token %d content differs - target '%s', draft '%s'\n", __func__, i,
126
+ common_token_to_piece(ctx_tgt, i).c_str(),
127
+ common_token_to_piece(ctx_dft, i).c_str());
128
+ return false;
129
+ }
130
+ }
131
+ }
132
+
133
+ return true;
134
+ }
135
+
136
+ llama_tokens common_speculative_gen_draft(
137
+ struct common_speculative * spec,
138
+ struct common_speculative_params params,
139
+ const llama_tokens & prompt_tgt,
140
+ llama_token id_last) {
141
+ auto & batch = spec->batch;
142
+ auto & ctx = spec->ctx;
143
+ auto & smpl = spec->smpl;
144
+ auto & prompt = spec->prompt;
145
+
146
+ int reuse_i = 0;
147
+ int reuse_n = 0;
148
+
149
+ const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
150
+
151
+ const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
152
+
153
+ // reuse as much as possible from the old draft context
154
+ // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
155
+ for (int i = 0; i < (int) prompt.size(); ++i) {
156
+ int cur = 0;
157
+ while (i_start + cur < (int) prompt_tgt.size() &&
158
+ i + cur < (int) prompt.size() &&
159
+ prompt_tgt[i_start + cur] == prompt[i + cur]) {
160
+ cur++;
161
+ }
162
+
163
+ if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
164
+ reuse_i = i;
165
+ reuse_n = cur;
166
+ }
167
+ }
168
+
169
+ LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
170
+
171
+ llama_tokens result;
172
+ result.reserve(params.n_draft);
173
+
174
+ if (reuse_n == 0) {
175
+ llama_kv_cache_clear(ctx);
176
+
177
+ prompt.clear();
178
+ } else {
179
+ // this happens when a previous draft has been discarded (for example, due to being too small), but the
180
+ // target model agreed with it. in this case, we simply pass back the previous results to save compute
181
+ if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
182
+ for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
183
+ result.push_back(prompt[i]);
184
+
185
+ if (params.n_draft <= (int) result.size()) {
186
+ break;
187
+ }
188
+ }
189
+
190
+ return result;
191
+ }
192
+
193
+ if (reuse_i > 0) {
194
+ llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
195
+ llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
196
+
197
+ prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
198
+ }
199
+
200
+ if (reuse_n < (int) prompt.size()) {
201
+ llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
202
+
203
+ prompt.erase(prompt.begin() + reuse_n, prompt.end());
204
+ }
205
+ }
206
+
207
+ // prepare a batch to evaluate any new tokens in the prompt
208
+ common_batch_clear(batch);
209
+
210
+ for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
211
+ //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
212
+ common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
213
+
214
+ prompt.push_back(prompt_tgt[i]);
215
+ }
216
+
217
+ // we should rarely end-up here during normal decoding
218
+ if (batch.n_tokens > 0) {
219
+ //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
220
+
221
+ llama_decode(ctx, batch);
222
+ }
223
+
224
+ const llama_pos n_past = prompt.size();
225
+
226
+ LOG_DBG("%s: n_past = %d\n", __func__, n_past);
227
+
228
+ common_batch_clear(batch);
229
+ common_batch_add (batch, id_last, n_past, { 0 }, true);
230
+
231
+ prompt.push_back(id_last);
232
+
233
+ //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
234
+
235
+ llama_decode(ctx, batch);
236
+
237
+ common_sampler_reset(smpl);
238
+
239
+ // sample n_draft tokens from the draft model
240
+ for (int i = 0; i < params.n_draft; ++i) {
241
+ common_batch_clear(batch);
242
+
243
+ common_sampler_sample(smpl, ctx, 0, true);
244
+
245
+ const auto * cur_p = common_sampler_get_candidates(smpl);
246
+
247
+ for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
248
+ LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
249
+ k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
250
+ }
251
+
252
+ // add drafted token for each sequence
253
+ const llama_token id = cur_p->data[0].id;
254
+
255
+ // only collect very high-confidence draft tokens
256
+ if (cur_p->data[0].p < params.p_min) {
257
+ break;
258
+ }
259
+
260
+ common_sampler_accept(smpl, id, true);
261
+
262
+ result.push_back(id);
263
+
264
+ if (params.n_draft <= (int) result.size()) {
265
+ break;
266
+ }
267
+
268
+ common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
269
+
270
+ // evaluate the drafted tokens on the draft model
271
+ llama_decode(ctx, batch);
272
+
273
+ prompt.push_back(id);
274
+ }
275
+
276
+ return result;
277
+ }
package/cpp/speculative.h CHANGED
@@ -1,28 +1,28 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
- #include "common.h"
5
-
6
- struct common_speculative;
7
-
8
- struct common_speculative_params {
9
- int n_draft = 16; // max drafted tokens
10
- int n_reuse = 256;
11
-
12
- float p_min = 0.9f; // min probabiliy required to accept a token in the draft
13
- };
14
-
15
- struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
16
-
17
- void common_speculative_free(struct common_speculative * spec);
18
-
19
- bool common_speculative_are_compatible(
20
- const struct llama_context * ctx_tgt,
21
- const struct llama_context * ctx_dft);
22
-
23
- // sample up to n_draft tokens and add them to the batch using the draft model
24
- llama_tokens common_speculative_gen_draft(
25
- struct common_speculative * spec,
26
- struct common_speculative_params params,
27
- const llama_tokens & prompt,
28
- llama_token id_last);
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+ #include "common.h"
5
+
6
+ struct common_speculative;
7
+
8
+ struct common_speculative_params {
9
+ int n_draft = 16; // max drafted tokens
10
+ int n_reuse = 256;
11
+
12
+ float p_min = 0.9f; // min probabiliy required to accept a token in the draft
13
+ };
14
+
15
+ struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
16
+
17
+ void common_speculative_free(struct common_speculative * spec);
18
+
19
+ bool common_speculative_are_compatible(
20
+ const struct llama_context * ctx_tgt,
21
+ const struct llama_context * ctx_dft);
22
+
23
+ // sample up to n_draft tokens and add them to the batch using the draft model
24
+ llama_tokens common_speculative_gen_draft(
25
+ struct common_speculative * spec,
26
+ struct common_speculative_params params,
27
+ const llama_tokens & prompt,
28
+ llama_token id_last);
package/cpp/unicode.cpp CHANGED
@@ -7,18 +7,17 @@
7
7
 
8
8
  #include <algorithm>
9
9
  #include <cassert>
10
+ #include <codecvt>
10
11
  #include <cstddef>
11
12
  #include <cstdint>
13
+ #include <locale>
12
14
  #include <map>
13
15
  #include <regex>
14
16
  #include <stdexcept>
15
17
  #include <string>
16
18
  #include <unordered_map>
17
- #include <unordered_set>
18
19
  #include <utility>
19
20
  #include <vector>
20
- #include <locale>
21
- #include <codecvt>
22
21
 
23
22
  size_t unicode_len_utf8(char src) {
24
23
  const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };