llama_cpp 0.15.0 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +3 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +289 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +77 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +23 -8
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1 -1
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +18 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +11 -9
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +950 -267
- data/vendor/tmp/llama.cpp/ggml.c +1090 -89
- data/vendor/tmp/llama.cpp/ggml.h +15 -7
- data/vendor/tmp/llama.cpp/llama.cpp +57 -17
- data/vendor/tmp/llama.cpp/llama.h +7 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1187 -655
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -1
- data/vendor/tmp/llama.cpp/unicode.cpp +254 -122
- data/vendor/tmp/llama.cpp/unicode.h +4 -2
- metadata +2 -2
@@ -5,8 +5,9 @@
|
|
5
5
|
#include <utility>
|
6
6
|
#include <vector>
|
7
7
|
|
8
|
-
extern const std::vector<std::pair<uint32_t, uint32_t>>
|
8
|
+
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
|
9
9
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
10
|
+
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
|
10
11
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
|
11
12
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
|
12
13
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
|
@@ -9,6 +9,7 @@
|
|
9
9
|
#include <stdexcept>
|
10
10
|
#include <string>
|
11
11
|
#include <unordered_map>
|
12
|
+
#include <unordered_set>
|
12
13
|
#include <utility>
|
13
14
|
#include <vector>
|
14
15
|
#include <locale>
|
@@ -110,28 +111,28 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
|
|
110
111
|
|
111
112
|
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
112
113
|
std::unordered_map<uint32_t, int> cpt_types;
|
113
|
-
for (auto p :
|
114
|
-
for (auto i = p.first; i <= p.second; ++
|
115
|
-
cpt_types[i] =
|
114
|
+
for (auto p : unicode_ranges_number) {
|
115
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
116
|
+
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
|
116
117
|
}
|
117
118
|
}
|
118
119
|
for (auto p : unicode_ranges_letter) {
|
119
|
-
for (auto i = p.first; i <= p.second; ++
|
120
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
120
121
|
cpt_types[i] = CODEPOINT_TYPE_LETTER;
|
121
122
|
}
|
122
123
|
}
|
123
|
-
for (auto p :
|
124
|
-
for (auto i = p.first; i <= p.second; ++
|
125
|
-
cpt_types[i] =
|
124
|
+
for (auto p : unicode_ranges_separator) {
|
125
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
126
|
+
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
|
126
127
|
}
|
127
128
|
}
|
128
129
|
for (auto p : unicode_ranges_accent_mark) {
|
129
|
-
for (auto i = p.first; i <= p.second; ++
|
130
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
130
131
|
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
131
132
|
}
|
132
133
|
}
|
133
134
|
for (auto p : unicode_ranges_punctuation) {
|
134
|
-
for (auto i = p.first; i <= p.second; ++
|
135
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
135
136
|
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
136
137
|
}
|
137
138
|
}
|
@@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
|
141
142
|
}
|
142
143
|
}
|
143
144
|
for (auto p : unicode_ranges_control) {
|
144
|
-
for (auto i = p.first; i <= p.second; ++
|
145
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
145
146
|
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
|
146
147
|
}
|
147
148
|
}
|
@@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|
224
225
|
std::vector<size_t> bpe_offsets; // store the offset of each word
|
225
226
|
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
226
227
|
|
227
|
-
size_t start = 0;
|
228
|
-
|
229
228
|
const auto cpts = unicode_cpts_from_utf8(text);
|
230
229
|
|
230
|
+
size_t start = 0;
|
231
231
|
for (auto offset : offsets) {
|
232
|
-
|
232
|
+
const size_t offset_ini = start;
|
233
|
+
const size_t offset_end = start + offset;
|
234
|
+
assert(offset_end <= cpts.size());
|
235
|
+
start = offset_end;
|
236
|
+
|
237
|
+
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
238
|
+
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
239
|
+
};
|
240
|
+
|
241
|
+
auto _get_cpt_type = [&] (const size_t pos) -> int {
|
242
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
243
|
+
};
|
244
|
+
|
245
|
+
size_t _prev_end = offset_ini;
|
246
|
+
auto _add_token = [&] (const size_t end) -> size_t {
|
247
|
+
assert(_prev_end <= end && end <= offset_end);
|
248
|
+
size_t len = end - _prev_end;
|
249
|
+
if (len > 0) {
|
250
|
+
bpe_offsets.push_back(len);
|
251
|
+
}
|
252
|
+
_prev_end = end;
|
253
|
+
//if (len > 0) {
|
254
|
+
// std::string s = "";
|
255
|
+
// for(size_t p = end-len; p < end; p++)
|
256
|
+
// s += unicode_cpt_to_utf8(cpts[p]);
|
257
|
+
// printf(">>> '%s'\n", s.c_str());
|
258
|
+
//}
|
259
|
+
return len;
|
260
|
+
};
|
261
|
+
|
262
|
+
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
263
|
+
const char32_t cpt = _get_cpt(pos);
|
264
|
+
const int cpt_type = _get_cpt_type(pos);
|
265
|
+
|
266
|
+
// regex: 's|'t|'re|'ve|'m|'ll|'d
|
267
|
+
if (cpt == '\'' && pos+1 < offset_end) {
|
268
|
+
char32_t cpt_next = _get_cpt(pos+1);
|
269
|
+
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
270
|
+
pos += _add_token(pos+2);
|
271
|
+
continue;
|
272
|
+
}
|
273
|
+
if (pos+2 < offset_end) {
|
274
|
+
char32_t cpt_next_next = _get_cpt(pos+2);
|
275
|
+
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
276
|
+
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
277
|
+
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
278
|
+
pos += _add_token(pos+3);
|
279
|
+
continue;
|
280
|
+
}
|
281
|
+
}
|
282
|
+
}
|
233
283
|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
284
|
+
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
285
|
+
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
286
|
+
// regex: <space>?\p{L}+
|
287
|
+
if (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
288
|
+
pos += (cpt == ' ');
|
289
|
+
while (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
290
|
+
cpt2_type = _get_cpt_type(++pos);
|
291
|
+
}
|
292
|
+
_add_token(pos);
|
293
|
+
continue;
|
294
|
+
}
|
295
|
+
// regex: <space>?\p{N}+
|
296
|
+
if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
297
|
+
pos += (cpt == ' ');
|
298
|
+
while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
299
|
+
cpt2_type = _get_cpt_type(++pos);
|
300
|
+
}
|
301
|
+
_add_token(pos);
|
302
|
+
continue;
|
303
|
+
}
|
304
|
+
// regex: <space>?[^\s\p{L}\p{N}]+
|
305
|
+
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
306
|
+
pos += (cpt == ' ');
|
307
|
+
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
308
|
+
cpt2_type = _get_cpt_type(++pos);
|
309
|
+
cpt2 = _get_cpt(pos);
|
310
|
+
}
|
311
|
+
_add_token(pos);
|
312
|
+
continue;
|
313
|
+
}
|
239
314
|
|
240
|
-
|
241
|
-
|
315
|
+
size_t num_whitespaces = 0;
|
316
|
+
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
317
|
+
num_whitespaces++;
|
318
|
+
}
|
242
319
|
|
243
|
-
|
244
|
-
|
320
|
+
// regex: \s+(?!\S)
|
321
|
+
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
322
|
+
pos += num_whitespaces - 1;
|
323
|
+
_add_token(pos);
|
324
|
+
continue;
|
325
|
+
}
|
326
|
+
|
327
|
+
// regex: \s+
|
328
|
+
if (num_whitespaces > 0) {
|
329
|
+
pos += num_whitespaces;
|
330
|
+
_add_token(pos);
|
331
|
+
continue;
|
332
|
+
}
|
333
|
+
|
334
|
+
// no matches
|
335
|
+
_add_token(++pos);
|
245
336
|
}
|
337
|
+
}
|
338
|
+
|
339
|
+
return bpe_offsets;
|
340
|
+
}
|
246
341
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
342
|
+
// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
|
343
|
+
static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
|
344
|
+
std::vector<size_t> bpe_offsets; // store the offset of each word
|
345
|
+
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
251
346
|
|
252
|
-
|
253
|
-
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
254
|
-
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
347
|
+
const auto cpts = unicode_cpts_from_utf8(text);
|
255
348
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
349
|
+
size_t start = 0;
|
350
|
+
for (auto offset : offsets) {
|
351
|
+
const size_t offset_ini = start;
|
352
|
+
const size_t offset_end = start + offset;
|
353
|
+
assert(offset_end <= cpts.size());
|
354
|
+
start = offset_end;
|
355
|
+
|
356
|
+
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
357
|
+
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
358
|
+
};
|
359
|
+
|
360
|
+
auto _get_cpt_type = [&] (const size_t pos) -> int {
|
361
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
362
|
+
};
|
363
|
+
|
364
|
+
size_t _prev_end = offset_ini;
|
365
|
+
auto _add_token = [&] (const size_t end) -> size_t {
|
366
|
+
assert(_prev_end <= end && end <= offset_end);
|
367
|
+
size_t len = end - _prev_end;
|
368
|
+
if (len > 0) {
|
369
|
+
bpe_offsets.push_back(len);
|
370
|
+
}
|
371
|
+
_prev_end = end;
|
372
|
+
//if (len > 0) {
|
373
|
+
// std::string s = "";
|
374
|
+
// for(size_t p = end-len; p < end; p++)
|
375
|
+
// s += unicode_cpt_to_utf8(cpts[p]);
|
376
|
+
// printf(">>> '%s'\n", s.c_str());
|
377
|
+
//}
|
378
|
+
return len;
|
379
|
+
};
|
380
|
+
|
381
|
+
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
382
|
+
const char32_t cpt = _get_cpt(pos);
|
383
|
+
const int cpt_type = _get_cpt_type(pos);
|
384
|
+
|
385
|
+
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
386
|
+
if (cpt == '\'' && pos+1 < offset_end) {
|
387
|
+
char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
388
|
+
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
389
|
+
pos += _add_token(pos+2);
|
390
|
+
continue;
|
261
391
|
}
|
262
|
-
if (
|
263
|
-
|
264
|
-
|
392
|
+
if (pos+2 < offset_end) {
|
393
|
+
char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
394
|
+
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
395
|
+
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
396
|
+
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
397
|
+
pos += _add_token(pos+3);
|
398
|
+
continue;
|
265
399
|
}
|
266
|
-
token = utf_char + utf_char_next;
|
267
|
-
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
268
|
-
token = "";
|
269
|
-
i++;
|
270
|
-
continue;
|
271
400
|
}
|
272
401
|
}
|
273
|
-
if (!split_condition && bytes_remain >= 3) {
|
274
|
-
// 're|'ve|'ll
|
275
|
-
if (utf_char == "\'" && (
|
276
|
-
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
277
|
-
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
278
|
-
(utf_char_next == "l" && utf_char_next_next == "l"))
|
279
|
-
) {
|
280
|
-
split_condition = true;
|
281
|
-
}
|
282
|
-
if (split_condition) {
|
283
|
-
// current token + next token can be defined
|
284
|
-
if (token.size()) {
|
285
|
-
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
286
|
-
}
|
287
|
-
token = utf_char;
|
288
|
-
token += utf_char_next;
|
289
|
-
token += utf_char_next_next;
|
290
402
|
|
291
|
-
|
292
|
-
|
293
|
-
|
403
|
+
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
404
|
+
if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
|
405
|
+
if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
|
406
|
+
pos++;
|
407
|
+
while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
|
408
|
+
pos++;
|
409
|
+
}
|
410
|
+
_add_token(pos);
|
294
411
|
continue;
|
295
412
|
}
|
296
413
|
}
|
297
414
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
}
|
307
|
-
else if (
|
308
|
-
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
|
309
|
-
(token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
|
310
|
-
) {
|
311
|
-
collecting_special = true;
|
312
|
-
collecting = true;
|
313
|
-
}
|
314
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
315
|
-
collecting_whitespace_lookahead = true;
|
316
|
-
collecting = true;
|
317
|
-
}
|
318
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
319
|
-
split_condition = true;
|
415
|
+
// regex: \p{N}{1,3}
|
416
|
+
if (cpt_type == CODEPOINT_TYPE_NUMBER) {
|
417
|
+
size_t ini = pos;
|
418
|
+
while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
|
419
|
+
if (++pos - ini >= 3 ) {
|
420
|
+
_add_token(pos);
|
421
|
+
ini = pos;
|
422
|
+
}
|
320
423
|
}
|
424
|
+
_add_token(pos);
|
425
|
+
continue;
|
321
426
|
}
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
427
|
+
|
428
|
+
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
429
|
+
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
430
|
+
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
431
|
+
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
432
|
+
pos += (cpt == ' ');
|
433
|
+
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
434
|
+
cpt2_type = _get_cpt_type(++pos);
|
435
|
+
cpt2 = _get_cpt(pos);
|
328
436
|
}
|
329
|
-
|
330
|
-
|
437
|
+
while (cpt2 == '\r' || cpt2 == '\n') {
|
438
|
+
cpt2 = _get_cpt(++pos);
|
331
439
|
}
|
332
|
-
|
333
|
-
|
440
|
+
_add_token(pos);
|
441
|
+
continue;
|
442
|
+
}
|
443
|
+
|
444
|
+
size_t num_whitespaces = 0;
|
445
|
+
size_t last_end_r_or_n = 0;
|
446
|
+
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
447
|
+
char32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
448
|
+
if (cpt2 == '\r' || cpt2 == '\n') {
|
449
|
+
last_end_r_or_n = pos + num_whitespaces + 1;
|
334
450
|
}
|
451
|
+
num_whitespaces++;
|
335
452
|
}
|
336
453
|
|
337
|
-
|
338
|
-
|
339
|
-
|
454
|
+
// regex: \s*[\r\n]+
|
455
|
+
if (last_end_r_or_n > 0) {
|
456
|
+
pos = last_end_r_or_n;
|
457
|
+
_add_token(pos);
|
458
|
+
continue;
|
340
459
|
}
|
341
460
|
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
collecting = false;
|
348
|
-
collecting_letter = false;
|
349
|
-
collecting_numeric = false;
|
350
|
-
collecting_special = false;
|
351
|
-
collecting_whitespace_lookahead = false;
|
461
|
+
// regex: \s+(?!\S)
|
462
|
+
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
463
|
+
pos += num_whitespaces - 1;
|
464
|
+
_add_token(pos);
|
465
|
+
continue;
|
352
466
|
}
|
353
|
-
|
354
|
-
|
467
|
+
|
468
|
+
// regex: \s+
|
469
|
+
if (num_whitespaces > 0) {
|
470
|
+
pos += num_whitespaces;
|
471
|
+
_add_token(pos);
|
472
|
+
continue;
|
355
473
|
}
|
356
|
-
}
|
357
474
|
|
358
|
-
|
475
|
+
// no matches
|
476
|
+
_add_token(++pos);
|
477
|
+
}
|
359
478
|
}
|
360
479
|
|
361
480
|
return bpe_offsets;
|
@@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
|
|
424
543
|
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
425
544
|
std::vector<size_t> bpe_offsets;
|
426
545
|
|
427
|
-
(
|
428
|
-
|
429
|
-
(
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
546
|
+
if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
547
|
+
bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
|
548
|
+
} else if (
|
549
|
+
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
|
550
|
+
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
551
|
+
|
552
|
+
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
|
553
|
+
}
|
435
554
|
|
436
555
|
return bpe_offsets;
|
437
556
|
}
|
@@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
|
|
506
625
|
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
|
507
626
|
}
|
508
627
|
|
628
|
+
bool unicode_cpt_is_whitespace(uint32_t cp) {
|
629
|
+
static const std::unordered_set<uint32_t> is_whitespace = [] {
|
630
|
+
std::unordered_set<uint32_t> is_whitespace;
|
631
|
+
for (auto p : unicode_ranges_whitespace) {
|
632
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
633
|
+
is_whitespace.insert(i);
|
634
|
+
}
|
635
|
+
}
|
636
|
+
return is_whitespace;
|
637
|
+
}();
|
638
|
+
return (bool)is_whitespace.count(cp);
|
639
|
+
}
|
640
|
+
|
509
641
|
std::string unicode_byte_to_utf8(uint8_t byte) {
|
510
642
|
static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
|
511
643
|
return map.at(byte);
|
@@ -524,19 +656,19 @@ char32_t unicode_tolower(char32_t cp) {
|
|
524
656
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
525
657
|
// unicode categories
|
526
658
|
static const std::map<std::string, int> k_ucat_enum = {
|
527
|
-
{ "\\p{N}", CODEPOINT_TYPE_DIGIT },
|
659
|
+
{ "\\p{N}", CODEPOINT_TYPE_NUMBER },
|
528
660
|
{ "\\p{L}", CODEPOINT_TYPE_LETTER },
|
529
661
|
{ "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
|
530
662
|
};
|
531
663
|
|
532
664
|
static const std::map<int, int> k_ucat_cpt = {
|
533
|
-
{ CODEPOINT_TYPE_DIGIT, 0xD1 },
|
665
|
+
{ CODEPOINT_TYPE_NUMBER, 0xD1 },
|
534
666
|
{ CODEPOINT_TYPE_LETTER, 0xD2 },
|
535
667
|
{ CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
|
536
668
|
};
|
537
669
|
|
538
670
|
static const std::map<int, std::string> k_ucat_map = {
|
539
|
-
{ CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9
|
671
|
+
{ CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9
|
540
672
|
{ CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
541
673
|
{ CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
542
674
|
};
|
@@ -5,9 +5,9 @@
|
|
5
5
|
#include <vector>
|
6
6
|
|
7
7
|
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
8
|
-
#define CODEPOINT_TYPE_DIGIT 1
|
8
|
+
#define CODEPOINT_TYPE_NUMBER 1
|
9
9
|
#define CODEPOINT_TYPE_LETTER 2
|
10
|
-
#define CODEPOINT_TYPE_WHITESPACE 3
|
10
|
+
#define CODEPOINT_TYPE_SEPARATOR 3
|
11
11
|
#define CODEPOINT_TYPE_ACCENT_MARK 4
|
12
12
|
#define CODEPOINT_TYPE_PUNCTUATION 5
|
13
13
|
#define CODEPOINT_TYPE_SYMBOL 6
|
@@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
|
|
21
21
|
int unicode_cpt_type(uint32_t cp);
|
22
22
|
int unicode_cpt_type(const std::string & utf8);
|
23
23
|
|
24
|
+
bool unicode_cpt_is_whitespace(uint32_t cp);
|
25
|
+
|
24
26
|
std::string unicode_byte_to_utf8(uint8_t byte);
|
25
27
|
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
26
28
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.0
|
4
|
+
version: 0.15.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-
|
11
|
+
date: 2024-05-11 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|