cui-llama.rn 1.3.0 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +9 -6
- package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
- package/android/src/main/jni.cpp +15 -15
- package/cpp/common.cpp +1962 -1682
- package/cpp/common.h +645 -600
- package/cpp/ggml-alloc.c +1038 -1040
- package/cpp/ggml-alloc.h +76 -76
- package/cpp/ggml-backend-impl.h +256 -216
- package/cpp/ggml-backend-reg.cpp +552 -195
- package/cpp/ggml-backend.cpp +1999 -1997
- package/cpp/ggml-backend.h +352 -328
- package/cpp/ggml-common.h +1853 -1853
- package/cpp/ggml-cpp.h +38 -38
- package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +4262 -3560
- package/cpp/ggml-cpu-aarch64.h +8 -30
- package/cpp/ggml-cpu-impl.h +386 -371
- package/cpp/ggml-cpu-quants.c +10835 -10822
- package/cpp/ggml-cpu-quants.h +63 -63
- package/cpp/ggml-cpu-traits.cpp +36 -0
- package/cpp/ggml-cpu-traits.h +38 -0
- package/cpp/ggml-cpu.c +14122 -13975
- package/cpp/ggml-cpu.cpp +618 -663
- package/cpp/ggml-cpu.h +135 -177
- package/cpp/ggml-impl.h +556 -550
- package/cpp/ggml-metal.h +66 -66
- package/cpp/ggml-metal.m +4884 -4294
- package/cpp/ggml-quants.c +5238 -5247
- package/cpp/ggml-quants.h +100 -100
- package/cpp/ggml-threading.cpp +12 -12
- package/cpp/ggml-threading.h +14 -12
- package/cpp/ggml.c +7707 -8180
- package/cpp/ggml.h +2286 -2411
- package/cpp/json-schema-to-grammar.cpp +1045 -0
- package/cpp/json-schema-to-grammar.h +8 -0
- package/cpp/json.hpp +24766 -0
- package/cpp/llama-grammar.cpp +1138 -1138
- package/cpp/llama-grammar.h +144 -144
- package/cpp/llama-impl.h +181 -181
- package/cpp/llama-sampling.cpp +2293 -2348
- package/cpp/llama-sampling.h +48 -48
- package/cpp/llama-vocab.cpp +1985 -1984
- package/cpp/llama-vocab.h +170 -170
- package/cpp/llama.cpp +22836 -22132
- package/cpp/llama.h +1263 -1253
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.hpp +6 -6
- package/cpp/sampling.cpp +500 -466
- package/cpp/sampling.h +22 -1
- package/cpp/sgemm.cpp +1884 -1884
- package/cpp/speculative.cpp +274 -0
- package/cpp/speculative.h +28 -0
- package/cpp/unicode.cpp +62 -51
- package/cpp/unicode.h +9 -10
- package/ios/RNLlamaContext.mm +13 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +4 -2
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +38 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +2 -1
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +36 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +95 -6
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts +5 -6
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +40 -4
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/NativeRNLlama.ts +99 -12
- package/src/grammar.ts +10 -8
- package/src/index.ts +68 -3
- package/cpp/ggml-aarch64.c +0 -129
- package/cpp/ggml-aarch64.h +0 -19
@@ -0,0 +1,274 @@
|
|
1
|
+
#include "speculative.h"
|
2
|
+
|
3
|
+
#include "log.h"
|
4
|
+
#include "common.h"
|
5
|
+
#include "sampling.h"
|
6
|
+
|
7
|
+
#include <cstring>
|
8
|
+
|
9
|
+
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
10
|
+
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
11
|
+
|
12
|
+
struct common_speculative {
|
13
|
+
struct llama_context * ctx;
|
14
|
+
struct common_sampler * smpl;
|
15
|
+
|
16
|
+
llama_batch batch;
|
17
|
+
llama_tokens prompt;
|
18
|
+
};
|
19
|
+
|
20
|
+
struct common_speculative * common_speculative_init(
|
21
|
+
struct llama_context * ctx_dft) {
|
22
|
+
auto * result = new common_speculative {
|
23
|
+
/* .ctx = */ ctx_dft,
|
24
|
+
/* .smpl = */ nullptr,
|
25
|
+
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
|
26
|
+
/* .prompt = */ {},
|
27
|
+
};
|
28
|
+
|
29
|
+
// TODO: optimize or pass from outside?
|
30
|
+
#if 0
|
31
|
+
{
|
32
|
+
common_params_sampling params;
|
33
|
+
params.no_perf = false;
|
34
|
+
|
35
|
+
params.top_k = 40;
|
36
|
+
params.top_p = 0.9;
|
37
|
+
|
38
|
+
params.samplers = {
|
39
|
+
COMMON_SAMPLER_TYPE_TOP_K,
|
40
|
+
COMMON_SAMPLER_TYPE_TOP_P,
|
41
|
+
COMMON_SAMPLER_TYPE_INFILL,
|
42
|
+
};
|
43
|
+
|
44
|
+
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
45
|
+
}
|
46
|
+
#else
|
47
|
+
{
|
48
|
+
common_params_sampling params;
|
49
|
+
params.no_perf = false;
|
50
|
+
|
51
|
+
params.top_k = 10;
|
52
|
+
|
53
|
+
params.samplers = {
|
54
|
+
COMMON_SAMPLER_TYPE_TOP_K,
|
55
|
+
};
|
56
|
+
|
57
|
+
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
58
|
+
}
|
59
|
+
#endif
|
60
|
+
|
61
|
+
return result;
|
62
|
+
}
|
63
|
+
|
64
|
+
// Destroy a state created by common_speculative_init(): frees its sampler and
// batch, then the object itself. Passing nullptr is a no-op.
void common_speculative_free(struct common_speculative * spec) {
    if (spec != nullptr) {
        common_sampler_free(spec->smpl);
        llama_batch_free(spec->batch);

        delete spec;
    }
}
|
75
|
+
|
76
|
+
bool common_speculative_are_compatible(
|
77
|
+
const struct llama_context * ctx_tgt,
|
78
|
+
const struct llama_context * ctx_dft) {
|
79
|
+
const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
|
80
|
+
const struct llama_model * model_dft = llama_get_model(ctx_dft);
|
81
|
+
|
82
|
+
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
|
83
|
+
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
|
84
|
+
|
85
|
+
const bool vocab_type_dft = llama_vocab_type(model_dft);
|
86
|
+
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
|
87
|
+
|
88
|
+
if (vocab_type_tgt != vocab_type_dft) {
|
89
|
+
LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
|
90
|
+
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
|
91
|
+
return false;
|
92
|
+
}
|
93
|
+
|
94
|
+
if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
|
95
|
+
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
|
96
|
+
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
|
97
|
+
llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
|
98
|
+
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
99
|
+
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
|
100
|
+
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
|
101
|
+
return false;
|
102
|
+
}
|
103
|
+
|
104
|
+
{
|
105
|
+
const int n_vocab_tgt = llama_n_vocab(model_tgt);
|
106
|
+
const int n_vocab_dft = llama_n_vocab(model_dft);
|
107
|
+
|
108
|
+
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
|
109
|
+
|
110
|
+
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
111
|
+
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
|
112
|
+
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
113
|
+
__func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
114
|
+
return false;
|
115
|
+
}
|
116
|
+
|
117
|
+
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
|
118
|
+
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
119
|
+
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
120
|
+
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
121
|
+
LOG_ERR("%s: draft model vocab must match target model to use speculation but "
|
122
|
+
"token %d content differs - target '%s', draft '%s'\n", __func__, i,
|
123
|
+
common_token_to_piece(ctx_tgt, i).c_str(),
|
124
|
+
common_token_to_piece(ctx_dft, i).c_str());
|
125
|
+
return false;
|
126
|
+
}
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
130
|
+
return true;
|
131
|
+
}
|
132
|
+
|
133
|
+
// Generate up to params.n_draft draft tokens that continue `prompt_tgt` + `id_last`.
//
//   spec       - state from common_speculative_init(); its prompt cache, batch and the
//                KV cache of its draft context are modified
//   params     - n_draft (max tokens to draft), n_reuse (min matching run to reuse the
//                previous draft context), p_min (drafting stops at the first token whose
//                leading candidate probability is below this)
//   prompt_tgt - tokens already accepted by the target model
//   id_last    - last sampled token, not yet part of prompt_tgt
//
// Returns the drafted tokens (possibly empty). If llama_decode reports a non-zero status
// the draft KV cache and cached prompt are cleared, so the next call starts from scratch,
// and whatever was drafted up to that point is returned.
llama_tokens common_speculative_gen_draft(
        struct common_speculative * spec,
        struct common_speculative_params params,
        const llama_tokens & prompt_tgt,
        llama_token id_last) {
    auto & batch  = spec->batch;
    auto & ctx    = spec->ctx;
    auto & smpl   = spec->smpl;
    auto & prompt = spec->prompt;

    // __func__ inside the lambda below would name the lambda, so capture it here
    const char * fn = __func__;

    // decode the current batch; on any non-zero status drop all cached draft state so that
    // `prompt` never claims tokens the KV cache may not hold
    const auto decode_batch = [&](const char * what) -> bool {
        const int ret = llama_decode(ctx, batch);
        if (ret == 0) {
            return true;
        }

        LOG_DBG("%s: llama_decode failed on %s (ret = %d), resetting draft state\n", fn, what, ret);

        llama_kv_cache_clear(ctx);
        prompt.clear();

        return false;
    };

    int reuse_i = 0;
    int reuse_n = 0;

    // leave room in the draft context for the tokens that will be drafted
    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;

    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);

    // reuse as much as possible from the old draft context
    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
    for (int i = 0; i < (int) prompt.size(); ++i) {
        int cur = 0;
        while (i_start + cur < (int) prompt_tgt.size() &&
               i       + cur < (int) prompt.size() &&
               prompt_tgt[i_start + cur] == prompt[i + cur]) {
            cur++;
        }

        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
            reuse_i = i;
            reuse_n = cur;
        }
    }

    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());

    llama_tokens result;
    // a negative n_draft must not turn into a huge unsigned reserve request
    result.reserve(std::max(0, params.n_draft));

    if (reuse_n == 0) {
        llama_kv_cache_clear(ctx);

        prompt.clear();
    } else {
        // this happens when a previous draft has been discarded (for example, due to being too small), but the
        // target model agreed with it. in this case, we simply pass back the previous results to save compute
        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
                result.push_back(prompt[i]);

                if (params.n_draft <= (int) result.size()) {
                    break;
                }
            }

            return result;
        }

        if (reuse_i > 0) {
            // drop the unused head and shift the remaining positions down to start at 0
            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);

            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
        }

        if (reuse_n < (int) prompt.size()) {
            // drop everything after the reused run
            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);

            prompt.erase(prompt.begin() + reuse_n, prompt.end());
        }
    }

    // prepare a batch to evaluate any new tokens in the prompt
    common_batch_clear(batch);

    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);

        prompt.push_back(prompt_tgt[i]);
    }

    // we should rarely end-up here during normal decoding
    if (batch.n_tokens > 0) {
        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());

        if (!decode_batch("the prompt")) {
            return result;
        }
    }

    const llama_pos n_past = prompt.size();

    LOG_DBG("%s: n_past = %d\n", __func__, n_past);

    common_batch_clear(batch);
    common_batch_add  (batch, id_last, n_past, { 0 }, true);

    prompt.push_back(id_last);

    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());

    if (!decode_batch("the last sampled token")) {
        return result;
    }

    common_sampler_reset(smpl);

    // sample n_draft tokens from the draft model
    for (int i = 0; i < params.n_draft; ++i) {
        common_batch_clear(batch);

        // the returned token is not used; the candidate list is read instead
        common_sampler_sample(smpl, ctx, 0, true);

        const auto * cur_p = common_sampler_get_candidates(smpl);

        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
        }

        // add drafted token for each sequence
        // NOTE(review): takes the first candidate, presumably the most probable one - confirm sampler ordering
        const llama_token id = cur_p->data[0].id;

        // only collect very high-confidence draft tokens
        if (cur_p->data[0].p < params.p_min) {
            break;
        }

        common_sampler_accept(smpl, id, true);

        result.push_back(id);

        if (params.n_draft <= (int) result.size()) {
            break;
        }

        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

        // evaluate the drafted tokens on the draft model
        if (!decode_batch("a drafted token")) {
            break;
        }

        prompt.push_back(id);
    }

    return result;
}
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include "llama.h"
|
4
|
+
#include "common.h"
|
5
|
+
|
6
|
+
struct common_speculative;
|
7
|
+
|
8
|
+
// Tunables for common_speculative_gen_draft().
struct common_speculative_params {
    // max drafted tokens
    int n_draft = 16;

    // min length of a matching token run required to reuse the previous draft context
    // (not enforced when the whole target prompt fits in the draft context)
    int n_reuse = 256;

    // min probability required to accept a token in the draft
    float p_min = 0.9f;
};
|
14
|
+
|
15
|
+
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
|
16
|
+
|
17
|
+
void common_speculative_free(struct common_speculative * spec);
|
18
|
+
|
19
|
+
bool common_speculative_are_compatible(
|
20
|
+
const struct llama_context * ctx_tgt,
|
21
|
+
const struct llama_context * ctx_dft);
|
22
|
+
|
23
|
+
// sample up to n_draft tokens and add them to the batch using the draft model
|
24
|
+
llama_tokens common_speculative_gen_draft(
|
25
|
+
struct common_speculative * spec,
|
26
|
+
struct common_speculative_params params,
|
27
|
+
const llama_tokens & prompt,
|
28
|
+
llama_token id_last);
|
package/cpp/unicode.cpp
CHANGED
@@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
|
71
71
|
throw std::invalid_argument("failed to convert utf8 to codepoint");
|
72
72
|
}
|
73
73
|
|
74
|
-
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t
|
74
|
+
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cpt) {
|
75
75
|
// std::vector<uint16_t> result;
|
76
|
-
// if (/* 0x0000 <=
|
77
|
-
// result.emplace_back(
|
76
|
+
// if (/* 0x0000 <= cpt && */ cpt <= 0xffff) {
|
77
|
+
// result.emplace_back(cpt);
|
78
78
|
// return result;
|
79
79
|
// }
|
80
|
-
// if (0x10000 <=
|
81
|
-
// result.emplace_back(0xd800 | ((
|
82
|
-
// result.emplace_back(0xdc00 | ((
|
80
|
+
// if (0x10000 <= cpt && cpt <= 0x10ffff) {
|
81
|
+
// result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10));
|
82
|
+
// result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff));
|
83
83
|
// return result;
|
84
84
|
// }
|
85
85
|
// throw std::invalid_argument("failed to convert codepoint to utf16");
|
@@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
|
120
120
|
// return result;
|
121
121
|
//}
|
122
122
|
|
123
|
-
static std::vector<
|
124
|
-
std::vector<
|
123
|
+
static std::vector<unicode_cpt_flags> unicode_cpt_flags_array() {
|
124
|
+
std::vector<unicode_cpt_flags> cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED);
|
125
125
|
|
126
126
|
assert (unicode_ranges_flags.begin()[0].first == 0);
|
127
127
|
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
|
@@ -201,7 +201,18 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
|
|
201
201
|
}
|
202
202
|
|
203
203
|
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
|
204
|
+
#if defined(__clang__)
|
205
|
+
// disable C++17 deprecation warning for std::codecvt_utf8
|
206
|
+
# pragma clang diagnostic push
|
207
|
+
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
208
|
+
#endif
|
209
|
+
|
204
210
|
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
211
|
+
|
212
|
+
#if defined(__clang__)
|
213
|
+
# pragma clang diagnostic pop
|
214
|
+
#endif
|
215
|
+
|
205
216
|
return conv.from_bytes(s);
|
206
217
|
}
|
207
218
|
|
@@ -242,8 +253,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|
242
253
|
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
243
254
|
};
|
244
255
|
|
245
|
-
auto _get_flags = [&] (const size_t pos) ->
|
246
|
-
return (offset_ini <= pos && pos < offset_end) ?
|
256
|
+
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
257
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
247
258
|
};
|
248
259
|
|
249
260
|
size_t _prev_end = offset_ini;
|
@@ -360,8 +371,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|
360
371
|
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
361
372
|
};
|
362
373
|
|
363
|
-
auto _get_flags = [&] (const size_t pos) ->
|
364
|
-
return (offset_ini <= pos && pos < offset_end) ?
|
374
|
+
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
375
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
365
376
|
};
|
366
377
|
|
367
378
|
size_t _prev_end = offset_ini;
|
@@ -561,29 +572,29 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
|
561
572
|
// interface
|
562
573
|
//
|
563
574
|
|
564
|
-
std::string unicode_cpt_to_utf8(uint32_t
|
575
|
+
std::string unicode_cpt_to_utf8(uint32_t cpt) {
|
565
576
|
std::string result;
|
566
577
|
|
567
|
-
if (/* 0x00 <=
|
568
|
-
result.push_back(
|
578
|
+
if (/* 0x00 <= cpt && */ cpt <= 0x7f) {
|
579
|
+
result.push_back(cpt);
|
569
580
|
return result;
|
570
581
|
}
|
571
|
-
if (0x80 <=
|
572
|
-
result.push_back(0xc0 | ((
|
573
|
-
result.push_back(0x80 | (
|
582
|
+
if (0x80 <= cpt && cpt <= 0x7ff) {
|
583
|
+
result.push_back(0xc0 | ((cpt >> 6) & 0x1f));
|
584
|
+
result.push_back(0x80 | (cpt & 0x3f));
|
574
585
|
return result;
|
575
586
|
}
|
576
|
-
if (0x800 <=
|
577
|
-
result.push_back(0xe0 | ((
|
578
|
-
result.push_back(0x80 | ((
|
579
|
-
result.push_back(0x80 | (
|
587
|
+
if (0x800 <= cpt && cpt <= 0xffff) {
|
588
|
+
result.push_back(0xe0 | ((cpt >> 12) & 0x0f));
|
589
|
+
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
|
590
|
+
result.push_back(0x80 | (cpt & 0x3f));
|
580
591
|
return result;
|
581
592
|
}
|
582
|
-
if (0x10000 <=
|
583
|
-
result.push_back(0xf0 | ((
|
584
|
-
result.push_back(0x80 | ((
|
585
|
-
result.push_back(0x80 | ((
|
586
|
-
result.push_back(0x80 | (
|
593
|
+
if (0x10000 <= cpt && cpt <= 0x10ffff) {
|
594
|
+
result.push_back(0xf0 | ((cpt >> 18) & 0x07));
|
595
|
+
result.push_back(0x80 | ((cpt >> 12) & 0x3f));
|
596
|
+
result.push_back(0x80 | ((cpt >> 6) & 0x3f));
|
597
|
+
result.push_back(0x80 | (cpt & 0x3f));
|
587
598
|
return result;
|
588
599
|
}
|
589
600
|
|
@@ -613,19 +624,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
|
613
624
|
return result;
|
614
625
|
}
|
615
626
|
|
616
|
-
|
617
|
-
static const
|
627
|
+
unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) {
|
628
|
+
static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
|
618
629
|
static const auto cpt_flags = unicode_cpt_flags_array();
|
619
|
-
return
|
630
|
+
return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef;
|
620
631
|
}
|
621
632
|
|
622
|
-
|
623
|
-
static const
|
633
|
+
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) {
|
634
|
+
static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED);
|
624
635
|
if (utf8.empty()) {
|
625
636
|
return undef; // undefined
|
626
637
|
}
|
627
638
|
size_t offset = 0;
|
628
|
-
return
|
639
|
+
return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset));
|
629
640
|
}
|
630
641
|
|
631
642
|
std::string unicode_byte_to_utf8(uint8_t byte) {
|
@@ -638,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
|
|
638
649
|
return map.at(utf8);
|
639
650
|
}
|
640
651
|
|
641
|
-
uint32_t unicode_tolower(uint32_t
|
652
|
+
uint32_t unicode_tolower(uint32_t cpt) {
|
642
653
|
// binary search
|
643
|
-
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(),
|
654
|
+
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt,
|
644
655
|
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
|
645
656
|
return pair.first < value;
|
646
657
|
});
|
647
|
-
if (it != unicode_map_lowercase.end() && it->first ==
|
658
|
+
if (it != unicode_map_lowercase.end() && it->first == cpt) {
|
648
659
|
return it->second;
|
649
660
|
}
|
650
|
-
return
|
661
|
+
return cpt; // Return the original code point if no lowercase mapping is found
|
651
662
|
}
|
652
663
|
|
653
664
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
654
665
|
// unicode categories
|
655
666
|
static const std::map<std::string, int> k_ucat_enum = {
|
656
|
-
{ "\\p{N}",
|
657
|
-
{ "\\p{L}",
|
658
|
-
{ "\\p{P}",
|
667
|
+
{ "\\p{N}", unicode_cpt_flags::NUMBER },
|
668
|
+
{ "\\p{L}", unicode_cpt_flags::LETTER },
|
669
|
+
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
|
659
670
|
};
|
660
671
|
|
661
672
|
static const std::map<int, int> k_ucat_cpt = {
|
662
|
-
{
|
663
|
-
{
|
664
|
-
{
|
673
|
+
{ unicode_cpt_flags::NUMBER, 0xD1 },
|
674
|
+
{ unicode_cpt_flags::LETTER, 0xD2 },
|
675
|
+
{ unicode_cpt_flags::PUNCTUATION, 0xD3 },
|
665
676
|
};
|
666
677
|
|
667
678
|
static const std::map<int, std::string> k_ucat_map = {
|
668
|
-
{
|
669
|
-
{
|
670
|
-
{
|
679
|
+
{ unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
|
680
|
+
{ unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
681
|
+
{ unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
671
682
|
};
|
672
683
|
|
673
684
|
// compute collapsed codepoints only if needed by at least one regex
|
674
685
|
bool need_collapse = false;
|
675
|
-
for (auto & regex_expr : regex_exprs) {
|
686
|
+
for (const auto & regex_expr : regex_exprs) {
|
676
687
|
// search for unicode categories
|
677
688
|
for (const auto & ucat : k_ucat_enum) {
|
678
689
|
if (std::string::npos != regex_expr.find(ucat.first)) {
|
@@ -698,7 +709,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|
698
709
|
continue;
|
699
710
|
}
|
700
711
|
|
701
|
-
const auto flags =
|
712
|
+
const auto flags = unicode_cpt_flags_from_cpt(cpts[i]);
|
702
713
|
|
703
714
|
if (flags.is_whitespace) {
|
704
715
|
//NOTE: C++ std::regex \s does not match 0x85, Rust and Python regex does.
|
@@ -714,7 +725,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|
714
725
|
|
715
726
|
std::vector<size_t> bpe_offsets = { cpts.size() };
|
716
727
|
|
717
|
-
for (auto & regex_expr : regex_exprs) {
|
728
|
+
for (const auto & regex_expr : regex_exprs) {
|
718
729
|
// first, see if we have an efficient custom regex implementation
|
719
730
|
auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
|
720
731
|
|
@@ -728,7 +739,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|
728
739
|
// if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
|
729
740
|
// with the corresponding collapsed representation
|
730
741
|
bool use_collapsed = false;
|
731
|
-
for (auto & ucat : k_ucat_enum) {
|
742
|
+
for (const auto & ucat : k_ucat_enum) {
|
732
743
|
if (std::string::npos != regex_expr.find(ucat.first)) {
|
733
744
|
use_collapsed = true;
|
734
745
|
break;
|
@@ -794,7 +805,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|
794
805
|
// std::wregex \s does not match non-ASCII whitespaces, using 0x0B as fallback
|
795
806
|
std::wstring wtext(cpts.begin(), cpts.end());
|
796
807
|
for (size_t i = 0; i < wtext.size(); ++i) {
|
797
|
-
if (wtext[i] > 0x7F &&
|
808
|
+
if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) {
|
798
809
|
wtext[i] = 0x0B;
|
799
810
|
}
|
800
811
|
}
|
package/cpp/unicode.h
CHANGED
@@ -4,9 +4,7 @@
|
|
4
4
|
#include <string>
|
5
5
|
#include <vector>
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
struct codepoint_flags {
|
7
|
+
struct unicode_cpt_flags {
|
10
8
|
enum {
|
11
9
|
UNDEFINED = 0x0001,
|
12
10
|
NUMBER = 0x0002, // regex: \p{N}
|
@@ -35,7 +33,7 @@ struct codepoint_flags {
|
|
35
33
|
uint16_t is_nfd : 1;
|
36
34
|
|
37
35
|
// decode from uint16
|
38
|
-
inline
|
36
|
+
inline unicode_cpt_flags(const uint16_t flags = 0) {
|
39
37
|
*reinterpret_cast<uint16_t*>(this) = flags;
|
40
38
|
}
|
41
39
|
|
@@ -50,18 +48,19 @@ struct codepoint_flags {
|
|
50
48
|
|
51
49
|
size_t unicode_len_utf8(char src);
|
52
50
|
|
53
|
-
std::string unicode_cpt_to_utf8(uint32_t
|
54
|
-
uint32_t
|
51
|
+
std::string unicode_cpt_to_utf8 (uint32_t cpt);
|
52
|
+
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
53
|
+
|
55
54
|
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
56
55
|
|
57
56
|
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
58
57
|
|
59
|
-
|
60
|
-
|
58
|
+
unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
|
59
|
+
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
|
61
60
|
|
62
61
|
std::string unicode_byte_to_utf8(uint8_t byte);
|
63
|
-
uint8_t
|
62
|
+
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
64
63
|
|
65
|
-
uint32_t unicode_tolower(uint32_t
|
64
|
+
uint32_t unicode_tolower(uint32_t cpt);
|
66
65
|
|
67
66
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|
package/ios/RNLlamaContext.mm
CHANGED
@@ -292,6 +292,19 @@
|
|
292
292
|
if (params[@"xtc_probability"]) sparams.xtc_probability = [params[@"xtc_probability"] doubleValue];
|
293
293
|
if (params[@"typical_p"]) sparams.typ_p = [params[@"typical_p"] doubleValue];
|
294
294
|
|
295
|
+
if (params[@"dry_multiplier"]) sparams.dry_multiplier = [params[@"dry_multiplier"] doubleValue];
|
296
|
+
if (params[@"dry_base"]) sparams.dry_base = [params[@"dry_base"] doubleValue];
|
297
|
+
if (params[@"dry_allowed_length"]) sparams.dry_allowed_length = [params[@"dry_allowed_length"] intValue];
|
298
|
+
if (params[@"dry_penalty_last_n"]) sparams.dry_penalty_last_n = [params[@"dry_penalty_last_n"] intValue];
|
299
|
+
|
300
|
+
// dry break seq
|
301
|
+
if (params[@"dry_sequence_breakers"] && [params[@"dry_sequence_breakers"] isKindOfClass:[NSArray class]]) {
|
302
|
+
NSArray *dry_sequence_breakers = params[@"dry_sequence_breakers"];
|
303
|
+
for (NSString *s in dry_sequence_breakers) {
|
304
|
+
sparams.dry_sequence_breakers.push_back([s UTF8String]);
|
305
|
+
}
|
306
|
+
}
|
307
|
+
|
295
308
|
if (params[@"grammar"]) {
|
296
309
|
sparams.grammar = [params[@"grammar"] UTF8String];
|
297
310
|
}
|
@@ -1 +1 @@
|
|
1
|
-
{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,
|
1
|
+
{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GA+RnCC,gCAAmB,CAACC,GAAG,CAAO,SAAS,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
|
package/lib/commonjs/grammar.js
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
Object.defineProperty(exports, "__esModule", {
|
4
4
|
value: true
|
5
5
|
});
|
6
|
-
exports.convertJsonSchemaToGrammar = exports.SchemaGrammarConverter = void 0;
|
6
|
+
exports.convertJsonSchemaToGrammar = exports.SchemaGrammarConverterBuiltinRule = exports.SchemaGrammarConverter = void 0;
|
7
7
|
/* eslint-disable no-restricted-syntax */
|
8
8
|
/* eslint-disable no-underscore-dangle */
|
9
9
|
const SPACE_RULE = '" "?';
|
@@ -60,12 +60,14 @@ function buildRepetition(itemRule, minItems, maxItems) {
|
|
60
60
|
}
|
61
61
|
return result;
|
62
62
|
}
|
63
|
-
class
|
63
|
+
class SchemaGrammarConverterBuiltinRule {
|
64
64
|
constructor(content, deps) {
|
65
65
|
this.content = content;
|
66
66
|
this.deps = deps || [];
|
67
67
|
}
|
68
68
|
}
|
69
|
+
exports.SchemaGrammarConverterBuiltinRule = SchemaGrammarConverterBuiltinRule;
|
70
|
+
const BuiltinRule = SchemaGrammarConverterBuiltinRule;
|
69
71
|
const UP_TO_15_DIGITS = buildRepetition('[0-9]', 0, 15);
|
70
72
|
const PRIMITIVE_RULES = {
|
71
73
|
boolean: new BuiltinRule('("true" | "false") space', []),
|