llama_cpp 0.15.0 → 0.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -0
- data/vendor/tmp/llama.cpp/Makefile +3 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +289 -17
- data/vendor/tmp/llama.cpp/ggml-impl.h +77 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +23 -8
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1 -1
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +18 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +11 -9
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +950 -267
- data/vendor/tmp/llama.cpp/ggml.c +1090 -89
- data/vendor/tmp/llama.cpp/ggml.h +15 -7
- data/vendor/tmp/llama.cpp/llama.cpp +57 -17
- data/vendor/tmp/llama.cpp/llama.h +7 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +56 -21
- data/vendor/tmp/llama.cpp/unicode-data.cpp +1187 -655
- data/vendor/tmp/llama.cpp/unicode-data.h +2 -1
- data/vendor/tmp/llama.cpp/unicode.cpp +254 -122
- data/vendor/tmp/llama.cpp/unicode.h +4 -2
- metadata +2 -2
@@ -5,8 +5,9 @@
|
|
5
5
|
#include <utility>
|
6
6
|
#include <vector>
|
7
7
|
|
8
|
-
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
|
8
|
+
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
|
9
9
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
|
10
|
+
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
|
10
11
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
|
11
12
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
|
12
13
|
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
|
@@ -9,6 +9,7 @@
|
|
9
9
|
#include <stdexcept>
|
10
10
|
#include <string>
|
11
11
|
#include <unordered_map>
|
12
|
+
#include <unordered_set>
|
12
13
|
#include <utility>
|
13
14
|
#include <vector>
|
14
15
|
#include <locale>
|
@@ -110,28 +111,28 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
|
|
110
111
|
|
111
112
|
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
112
113
|
std::unordered_map<uint32_t, int> cpt_types;
|
113
|
-
for (auto p : unicode_ranges_digit) {
|
114
|
-
for (auto i = p.first; i <= p.second; ++ i) {
|
115
|
-
cpt_types[i] = CODEPOINT_TYPE_DIGIT;
|
114
|
+
for (auto p : unicode_ranges_number) {
|
115
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
116
|
+
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
|
116
117
|
}
|
117
118
|
}
|
118
119
|
for (auto p : unicode_ranges_letter) {
|
119
|
-
for (auto i = p.first; i <= p.second; ++ i) {
|
120
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
120
121
|
cpt_types[i] = CODEPOINT_TYPE_LETTER;
|
121
122
|
}
|
122
123
|
}
|
123
|
-
for (auto p : unicode_ranges_whitespace) {
|
124
|
-
for (auto i = p.first; i <= p.second; ++ i) {
|
125
|
-
cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
|
124
|
+
for (auto p : unicode_ranges_separator) {
|
125
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
126
|
+
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
|
126
127
|
}
|
127
128
|
}
|
128
129
|
for (auto p : unicode_ranges_accent_mark) {
|
129
|
-
for (auto i = p.first; i <= p.second; ++ i) {
|
130
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
130
131
|
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
|
131
132
|
}
|
132
133
|
}
|
133
134
|
for (auto p : unicode_ranges_punctuation) {
|
134
|
-
for (auto i = p.first; i <= p.second; ++ i) {
|
135
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
135
136
|
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
|
136
137
|
}
|
137
138
|
}
|
@@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
|
|
141
142
|
}
|
142
143
|
}
|
143
144
|
for (auto p : unicode_ranges_control) {
|
144
|
-
for (auto i = p.first; i <= p.second; ++ i) {
|
145
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
145
146
|
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
|
146
147
|
}
|
147
148
|
}
|
@@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|
224
225
|
std::vector<size_t> bpe_offsets; // store the offset of each word
|
225
226
|
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
226
227
|
|
227
|
-
size_t start = 0;
|
228
|
-
|
229
228
|
const auto cpts = unicode_cpts_from_utf8(text);
|
230
229
|
|
230
|
+
size_t start = 0;
|
231
231
|
for (auto offset : offsets) {
|
232
|
-
|
232
|
+
const size_t offset_ini = start;
|
233
|
+
const size_t offset_end = start + offset;
|
234
|
+
assert(offset_end <= cpts.size());
|
235
|
+
start = offset_end;
|
236
|
+
|
237
|
+
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
238
|
+
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
239
|
+
};
|
240
|
+
|
241
|
+
auto _get_cpt_type = [&] (const size_t pos) -> int {
|
242
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
243
|
+
};
|
244
|
+
|
245
|
+
size_t _prev_end = offset_ini;
|
246
|
+
auto _add_token = [&] (const size_t end) -> size_t {
|
247
|
+
assert(_prev_end <= end && end <= offset_end);
|
248
|
+
size_t len = end - _prev_end;
|
249
|
+
if (len > 0) {
|
250
|
+
bpe_offsets.push_back(len);
|
251
|
+
}
|
252
|
+
_prev_end = end;
|
253
|
+
//if (len > 0) {
|
254
|
+
// std::string s = "";
|
255
|
+
// for(size_t p = end-len; p < end; p++)
|
256
|
+
// s += unicode_cpt_to_utf8(cpts[p]);
|
257
|
+
// printf(">>> '%s'\n", s.c_str());
|
258
|
+
//}
|
259
|
+
return len;
|
260
|
+
};
|
261
|
+
|
262
|
+
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
263
|
+
const char32_t cpt = _get_cpt(pos);
|
264
|
+
const int cpt_type = _get_cpt_type(pos);
|
265
|
+
|
266
|
+
// regex: 's|'t|'re|'ve|'m|'ll|'d
|
267
|
+
if (cpt == '\'' && pos+1 < offset_end) {
|
268
|
+
char32_t cpt_next = _get_cpt(pos+1);
|
269
|
+
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
270
|
+
pos += _add_token(pos+2);
|
271
|
+
continue;
|
272
|
+
}
|
273
|
+
if (pos+2 < offset_end) {
|
274
|
+
char32_t cpt_next_next = _get_cpt(pos+2);
|
275
|
+
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
276
|
+
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
277
|
+
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
278
|
+
pos += _add_token(pos+3);
|
279
|
+
continue;
|
280
|
+
}
|
281
|
+
}
|
282
|
+
}
|
233
283
|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
284
|
+
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
285
|
+
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
286
|
+
// regex: <space>?\p{L}+
|
287
|
+
if (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
288
|
+
pos += (cpt == ' ');
|
289
|
+
while (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
290
|
+
cpt2_type = _get_cpt_type(++pos);
|
291
|
+
}
|
292
|
+
_add_token(pos);
|
293
|
+
continue;
|
294
|
+
}
|
295
|
+
// regex: <space>?\p{N}+
|
296
|
+
if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
297
|
+
pos += (cpt == ' ');
|
298
|
+
while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
|
299
|
+
cpt2_type = _get_cpt_type(++pos);
|
300
|
+
}
|
301
|
+
_add_token(pos);
|
302
|
+
continue;
|
303
|
+
}
|
304
|
+
// regex: <space>?[^\s\p{L}\p{N}]+
|
305
|
+
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
306
|
+
pos += (cpt == ' ');
|
307
|
+
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
308
|
+
cpt2_type = _get_cpt_type(++pos);
|
309
|
+
cpt2 = _get_cpt(pos);
|
310
|
+
}
|
311
|
+
_add_token(pos);
|
312
|
+
continue;
|
313
|
+
}
|
239
314
|
|
240
|
-
|
241
|
-
|
315
|
+
size_t num_whitespaces = 0;
|
316
|
+
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
317
|
+
num_whitespaces++;
|
318
|
+
}
|
242
319
|
|
243
|
-
|
244
|
-
|
320
|
+
// regex: \s+(?!\S)
|
321
|
+
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
322
|
+
pos += num_whitespaces - 1;
|
323
|
+
_add_token(pos);
|
324
|
+
continue;
|
325
|
+
}
|
326
|
+
|
327
|
+
// regex: \s+
|
328
|
+
if (num_whitespaces > 0) {
|
329
|
+
pos += num_whitespaces;
|
330
|
+
_add_token(pos);
|
331
|
+
continue;
|
332
|
+
}
|
333
|
+
|
334
|
+
// no matches
|
335
|
+
_add_token(++pos);
|
245
336
|
}
|
337
|
+
}
|
338
|
+
|
339
|
+
return bpe_offsets;
|
340
|
+
}
|
246
341
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
342
|
+
// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
|
343
|
+
static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
|
344
|
+
std::vector<size_t> bpe_offsets; // store the offset of each word
|
345
|
+
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
251
346
|
|
252
|
-
|
253
|
-
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
254
|
-
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
|
347
|
+
const auto cpts = unicode_cpts_from_utf8(text);
|
255
348
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
349
|
+
size_t start = 0;
|
350
|
+
for (auto offset : offsets) {
|
351
|
+
const size_t offset_ini = start;
|
352
|
+
const size_t offset_end = start + offset;
|
353
|
+
assert(offset_end <= cpts.size());
|
354
|
+
start = offset_end;
|
355
|
+
|
356
|
+
auto _get_cpt = [&] (const size_t pos) -> char32_t {
|
357
|
+
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
358
|
+
};
|
359
|
+
|
360
|
+
auto _get_cpt_type = [&] (const size_t pos) -> int {
|
361
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
|
362
|
+
};
|
363
|
+
|
364
|
+
size_t _prev_end = offset_ini;
|
365
|
+
auto _add_token = [&] (const size_t end) -> size_t {
|
366
|
+
assert(_prev_end <= end && end <= offset_end);
|
367
|
+
size_t len = end - _prev_end;
|
368
|
+
if (len > 0) {
|
369
|
+
bpe_offsets.push_back(len);
|
370
|
+
}
|
371
|
+
_prev_end = end;
|
372
|
+
//if (len > 0) {
|
373
|
+
// std::string s = "";
|
374
|
+
// for(size_t p = end-len; p < end; p++)
|
375
|
+
// s += unicode_cpt_to_utf8(cpts[p]);
|
376
|
+
// printf(">>> '%s'\n", s.c_str());
|
377
|
+
//}
|
378
|
+
return len;
|
379
|
+
};
|
380
|
+
|
381
|
+
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
382
|
+
const char32_t cpt = _get_cpt(pos);
|
383
|
+
const int cpt_type = _get_cpt_type(pos);
|
384
|
+
|
385
|
+
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
386
|
+
if (cpt == '\'' && pos+1 < offset_end) {
|
387
|
+
char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
388
|
+
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
389
|
+
pos += _add_token(pos+2);
|
390
|
+
continue;
|
261
391
|
}
|
262
|
-
if (
|
263
|
-
|
264
|
-
|
392
|
+
if (pos+2 < offset_end) {
|
393
|
+
char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
394
|
+
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
395
|
+
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
396
|
+
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
397
|
+
pos += _add_token(pos+3);
|
398
|
+
continue;
|
265
399
|
}
|
266
|
-
token = utf_char + utf_char_next;
|
267
|
-
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
268
|
-
token = "";
|
269
|
-
i++;
|
270
|
-
continue;
|
271
400
|
}
|
272
401
|
}
|
273
|
-
if (!split_condition && bytes_remain >= 3) {
|
274
|
-
// 're|'ve|'ll
|
275
|
-
if (utf_char == "\'" && (
|
276
|
-
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
277
|
-
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
278
|
-
(utf_char_next == "l" && utf_char_next_next == "l"))
|
279
|
-
) {
|
280
|
-
split_condition = true;
|
281
|
-
}
|
282
|
-
if (split_condition) {
|
283
|
-
// current token + next token can be defined
|
284
|
-
if (token.size()) {
|
285
|
-
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
|
286
|
-
}
|
287
|
-
token = utf_char;
|
288
|
-
token += utf_char_next;
|
289
|
-
token += utf_char_next_next;
|
290
402
|
|
291
|
-
|
292
|
-
|
293
|
-
|
403
|
+
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
404
|
+
if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
|
405
|
+
if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
|
406
|
+
pos++;
|
407
|
+
while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
|
408
|
+
pos++;
|
409
|
+
}
|
410
|
+
_add_token(pos);
|
294
411
|
continue;
|
295
412
|
}
|
296
413
|
}
|
297
414
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
}
|
307
|
-
else if (
|
308
|
-
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
|
309
|
-
(token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
|
310
|
-
) {
|
311
|
-
collecting_special = true;
|
312
|
-
collecting = true;
|
313
|
-
}
|
314
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
|
315
|
-
collecting_whitespace_lookahead = true;
|
316
|
-
collecting = true;
|
317
|
-
}
|
318
|
-
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
|
319
|
-
split_condition = true;
|
415
|
+
// regex: \p{N}{1,3}
|
416
|
+
if (cpt_type == CODEPOINT_TYPE_NUMBER) {
|
417
|
+
size_t ini = pos;
|
418
|
+
while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
|
419
|
+
if (++pos - ini >= 3 ) {
|
420
|
+
_add_token(pos);
|
421
|
+
ini = pos;
|
422
|
+
}
|
320
423
|
}
|
424
|
+
_add_token(pos);
|
425
|
+
continue;
|
321
426
|
}
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
427
|
+
|
428
|
+
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
429
|
+
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
430
|
+
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
431
|
+
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
432
|
+
pos += (cpt == ' ');
|
433
|
+
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
434
|
+
cpt2_type = _get_cpt_type(++pos);
|
435
|
+
cpt2 = _get_cpt(pos);
|
328
436
|
}
|
329
|
-
|
330
|
-
|
437
|
+
while (cpt2 == '\r' || cpt2 == '\n') {
|
438
|
+
cpt2 = _get_cpt(++pos);
|
331
439
|
}
|
332
|
-
|
333
|
-
|
440
|
+
_add_token(pos);
|
441
|
+
continue;
|
442
|
+
}
|
443
|
+
|
444
|
+
size_t num_whitespaces = 0;
|
445
|
+
size_t last_end_r_or_n = 0;
|
446
|
+
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
447
|
+
char32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
448
|
+
if (cpt2 == '\r' || cpt2 == '\n') {
|
449
|
+
last_end_r_or_n = pos + num_whitespaces + 1;
|
334
450
|
}
|
451
|
+
num_whitespaces++;
|
335
452
|
}
|
336
453
|
|
337
|
-
|
338
|
-
|
339
|
-
|
454
|
+
// regex: \s*[\r\n]+
|
455
|
+
if (last_end_r_or_n > 0) {
|
456
|
+
pos = last_end_r_or_n;
|
457
|
+
_add_token(pos);
|
458
|
+
continue;
|
340
459
|
}
|
341
460
|
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
collecting = false;
|
348
|
-
collecting_letter = false;
|
349
|
-
collecting_numeric = false;
|
350
|
-
collecting_special = false;
|
351
|
-
collecting_whitespace_lookahead = false;
|
461
|
+
// regex: \s+(?!\S)
|
462
|
+
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
|
463
|
+
pos += num_whitespaces - 1;
|
464
|
+
_add_token(pos);
|
465
|
+
continue;
|
352
466
|
}
|
353
|
-
|
354
|
-
|
467
|
+
|
468
|
+
// regex: \s+
|
469
|
+
if (num_whitespaces > 0) {
|
470
|
+
pos += num_whitespaces;
|
471
|
+
_add_token(pos);
|
472
|
+
continue;
|
355
473
|
}
|
356
|
-
}
|
357
474
|
|
358
|
-
|
475
|
+
// no matches
|
476
|
+
_add_token(++pos);
|
477
|
+
}
|
359
478
|
}
|
360
479
|
|
361
480
|
return bpe_offsets;
|
@@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
|
|
424
543
|
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
425
544
|
std::vector<size_t> bpe_offsets;
|
426
545
|
|
427
|
-
(
|
428
|
-
|
429
|
-
(
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
546
|
+
if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
547
|
+
bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
|
548
|
+
} else if (
|
549
|
+
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
|
550
|
+
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
551
|
+
|
552
|
+
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
|
553
|
+
}
|
435
554
|
|
436
555
|
return bpe_offsets;
|
437
556
|
}
|
@@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
|
|
506
625
|
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
|
507
626
|
}
|
508
627
|
|
628
|
+
bool unicode_cpt_is_whitespace(uint32_t cp) {
|
629
|
+
static const std::unordered_set<uint32_t> is_whitespace = [] {
|
630
|
+
std::unordered_set<uint32_t> is_whitespace;
|
631
|
+
for (auto p : unicode_ranges_whitespace) {
|
632
|
+
for (auto i = p.first; i <= p.second; ++i) {
|
633
|
+
is_whitespace.insert(i);
|
634
|
+
}
|
635
|
+
}
|
636
|
+
return is_whitespace;
|
637
|
+
}();
|
638
|
+
return (bool)is_whitespace.count(cp);
|
639
|
+
}
|
640
|
+
|
509
641
|
std::string unicode_byte_to_utf8(uint8_t byte) {
|
510
642
|
static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
|
511
643
|
return map.at(byte);
|
@@ -524,19 +656,19 @@ char32_t unicode_tolower(char32_t cp) {
|
|
524
656
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
525
657
|
// unicode categories
|
526
658
|
static const std::map<std::string, int> k_ucat_enum = {
|
527
|
-
{ "\\p{N}", CODEPOINT_TYPE_DIGIT },
|
659
|
+
{ "\\p{N}", CODEPOINT_TYPE_NUMBER },
|
528
660
|
{ "\\p{L}", CODEPOINT_TYPE_LETTER },
|
529
661
|
{ "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
|
530
662
|
};
|
531
663
|
|
532
664
|
static const std::map<int, int> k_ucat_cpt = {
|
533
|
-
{ CODEPOINT_TYPE_DIGIT, 0xD1 },
|
665
|
+
{ CODEPOINT_TYPE_NUMBER, 0xD1 },
|
534
666
|
{ CODEPOINT_TYPE_LETTER, 0xD2 },
|
535
667
|
{ CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
|
536
668
|
};
|
537
669
|
|
538
670
|
static const std::map<int, std::string> k_ucat_map = {
|
539
|
-
{ CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9
|
671
|
+
{ CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9
|
540
672
|
{ CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
541
673
|
{ CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
542
674
|
};
|
@@ -5,9 +5,9 @@
|
|
5
5
|
#include <vector>
|
6
6
|
|
7
7
|
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
8
|
-
#define CODEPOINT_TYPE_DIGIT 1
|
8
|
+
#define CODEPOINT_TYPE_NUMBER 1
|
9
9
|
#define CODEPOINT_TYPE_LETTER 2
|
10
|
-
#define CODEPOINT_TYPE_WHITESPACE 3
|
10
|
+
#define CODEPOINT_TYPE_SEPARATOR 3
|
11
11
|
#define CODEPOINT_TYPE_ACCENT_MARK 4
|
12
12
|
#define CODEPOINT_TYPE_PUNCTUATION 5
|
13
13
|
#define CODEPOINT_TYPE_SYMBOL 6
|
@@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
|
|
21
21
|
int unicode_cpt_type(uint32_t cp);
|
22
22
|
int unicode_cpt_type(const std::string & utf8);
|
23
23
|
|
24
|
+
bool unicode_cpt_is_whitespace(uint32_t cp);
|
25
|
+
|
24
26
|
std::string unicode_byte_to_utf8(uint8_t byte);
|
25
27
|
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
26
28
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.0
|
4
|
+
version: 0.15.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-
|
11
|
+
date: 2024-05-11 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|