llama_cpp 0.14.7 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,11 +5,14 @@
5
5
  #include <cstddef>
6
6
  #include <cstdint>
7
7
  #include <map>
8
+ #include <regex>
8
9
  #include <stdexcept>
9
10
  #include <string>
10
11
  #include <unordered_map>
11
12
  #include <utility>
12
13
  #include <vector>
14
+ #include <locale>
15
+ #include <codecvt>
13
16
 
14
17
  static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
15
18
  std::string result;
@@ -53,23 +56,22 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
53
56
  offset += 4;
54
57
  return result;
55
58
  }
56
- throw std::invalid_argument("invalid string");
59
+ throw std::invalid_argument("failed to convert utf8 to codepoint");
57
60
  }
58
61
 
59
- static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
60
- std::vector<uint16_t> result;
61
- if (/* 0x0000 <= cp && */ cp <= 0xffff) {
62
- result.emplace_back(cp);
63
- }
64
- else if (0x10000 <= cp && cp <= 0x10ffff) {
65
- result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
66
- result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
67
- }
68
- else {
69
- throw std::invalid_argument("invalid cpt");
70
- }
71
- return result;
72
- }
62
+ //static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
63
+ // std::vector<uint16_t> result;
64
+ // if (/* 0x0000 <= cp && */ cp <= 0xffff) {
65
+ // result.emplace_back(cp);
66
+ // return result;
67
+ // }
68
+ // if (0x10000 <= cp && cp <= 0x10ffff) {
69
+ // result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
70
+ // result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
71
+ // return result;
72
+ // }
73
+ // throw std::invalid_argument("failed to convert codepoint to utf16");
74
+ //}
73
75
 
74
76
  //static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
75
77
  // std::vector<uint16_t> result;
@@ -80,28 +82,28 @@ static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
80
82
  // return result;
81
83
  //}
82
84
 
83
- static uint32_t cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
84
- assert(offset < utf16.size());
85
- if (((utf16[0] >> 10) << 10) != 0xd800) {
86
- auto result = utf16[offset + 0];
87
- offset += 1;
88
- return result;
89
- }
90
-
91
- if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
92
- throw std::invalid_argument("invalid character");
93
- }
94
-
95
- auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
96
- offset += 2;
97
- return result;
98
- }
85
+ //static uint32_t unicode_cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
86
+ // assert(offset < utf16.size());
87
+ // if (((utf16[0] >> 10) << 10) != 0xd800) {
88
+ // auto result = utf16[offset + 0];
89
+ // offset += 1;
90
+ // return result;
91
+ // }
92
+ //
93
+ // if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
94
+ // throw std::invalid_argument("invalid character");
95
+ // }
96
+ //
97
+ // auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
98
+ // offset += 2;
99
+ // return result;
100
+ //}
99
101
 
100
102
  //static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
101
103
  // std::vector<uint32_t> result;
102
104
  // size_t offset = 0;
103
105
  // while (offset < utf16.size()) {
104
- // result.push_back(cpt_from_utf16(utf16, offset));
106
+ // result.push_back(unicode_cpt_from_utf16(utf16, offset));
105
107
  // }
106
108
  // return result;
107
109
  //}
@@ -194,34 +196,277 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
194
196
  return map;
195
197
  }
196
198
 
199
+ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
200
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
201
+ return conv.from_bytes(s);
202
+ }
203
+
204
+ static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
205
+ std::vector<std::string> bpe_encoded_words;
206
+ for (const auto & word : bpe_words) {
207
+ std::string text_utf;
208
+ auto utf_word = unicode_cpts_from_utf8(word);
209
+ for (size_t i = 0; i < utf_word.size(); ++i) {
210
+ text_utf += unicode_cpt_to_utf8(utf_word[i]);
211
+ }
212
+
213
+ std::string encoded_token;
214
+ for (char & c : text_utf) {
215
+ encoded_token += unicode_byte_to_utf8(c);
216
+ }
217
+ bpe_encoded_words.emplace_back(encoded_token);
218
+ }
219
+ return bpe_encoded_words;
220
+ }
221
+
222
+ // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
223
+ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
224
+ std::vector<size_t> bpe_offsets; // store the offset of each word
225
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
226
+
227
+ size_t start = 0;
228
+
229
+ const auto cpts = unicode_cpts_from_utf8(text);
230
+
231
+ for (auto offset : offsets) {
232
+ std::string token;
233
+
234
+ bool collecting_numeric = false;
235
+ bool collecting_letter = false;
236
+ bool collecting_special = false;
237
+ bool collecting_whitespace_lookahead = false;
238
+ bool collecting = false;
239
+
240
+ std::vector<std::string> text_utf;
241
+ text_utf.reserve(offset);
242
+
243
+ for (size_t i = start; i < start + offset; ++i) {
244
+ text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
245
+ }
246
+
247
+ for (int i = 0; i < (int)text_utf.size(); i++) {
248
+ const std::string & utf_char = text_utf[i];
249
+ bool split_condition = false;
250
+ int bytes_remain = text_utf.size() - i;
251
+
252
+ // forward backward lookups
253
+ const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
254
+ const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
255
+
256
+ // handling contractions
257
+ if (!split_condition && bytes_remain >= 2) {
258
+ // 's|'t|'m|'d
259
+ if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
260
+ split_condition = true;
261
+ }
262
+ if (split_condition) {
263
+ if (token.size()) {
264
+ bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
265
+ }
266
+ token = utf_char + utf_char_next;
267
+ bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
268
+ token = "";
269
+ i++;
270
+ continue;
271
+ }
272
+ }
273
+ if (!split_condition && bytes_remain >= 3) {
274
+ // 're|'ve|'ll
275
+ if (utf_char == "\'" && (
276
+ (utf_char_next == "r" && utf_char_next_next == "e") ||
277
+ (utf_char_next == "v" && utf_char_next_next == "e") ||
278
+ (utf_char_next == "l" && utf_char_next_next == "l"))
279
+ ) {
280
+ split_condition = true;
281
+ }
282
+ if (split_condition) {
283
+ // current token + next token can be defined
284
+ if (token.size()) {
285
+ bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
286
+ }
287
+ token = utf_char;
288
+ token += utf_char_next;
289
+ token += utf_char_next_next;
290
+
291
+ bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
292
+ token = "";
293
+ i += 2;
294
+ continue;
295
+ }
296
+ }
297
+
298
+ if (!split_condition && !collecting) {
299
+ if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
300
+ collecting_letter = true;
301
+ collecting = true;
302
+ }
303
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
304
+ collecting_numeric = true;
305
+ collecting = true;
306
+ }
307
+ else if (
308
+ ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
309
+ (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
310
+ ) {
311
+ collecting_special = true;
312
+ collecting = true;
313
+ }
314
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
315
+ collecting_whitespace_lookahead = true;
316
+ collecting = true;
317
+ }
318
+ else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
319
+ split_condition = true;
320
+ }
321
+ }
322
+ else if (!split_condition && collecting) {
323
+ if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
324
+ split_condition = true;
325
+ }
326
+ else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
327
+ split_condition = true;
328
+ }
329
+ else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
330
+ split_condition = true;
331
+ }
332
+ else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
333
+ split_condition = true;
334
+ }
335
+ }
336
+
337
+ if (utf_char_next == "") {
338
+ split_condition = true; // final
339
+ token += utf_char;
340
+ }
341
+
342
+ if (split_condition) {
343
+ if (token.size()) {
344
+ bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
345
+ }
346
+ token = utf_char;
347
+ collecting = false;
348
+ collecting_letter = false;
349
+ collecting_numeric = false;
350
+ collecting_special = false;
351
+ collecting_whitespace_lookahead = false;
352
+ }
353
+ else {
354
+ token += utf_char;
355
+ }
356
+ }
357
+
358
+ start += offset;
359
+ }
360
+
361
+ return bpe_offsets;
362
+ }
363
+
364
+ // use std::wregex to split the text
365
+ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
366
+ std::wregex expr(regex_expr);
367
+ std::vector<size_t> bpe_offsets; // store the offset of each word
368
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
369
+ size_t start = 0;
370
+ for (auto offset : offsets) {
371
+ std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
372
+ std::wcregex_iterator end;
373
+
374
+ int64_t start_idx = 0;
375
+ while (it != end) {
376
+ std::wcmatch match = *it;
377
+ if (match.position() > start_idx) {
378
+ bpe_offsets.emplace_back(match.position() - start_idx);
379
+ }
380
+ bpe_offsets.emplace_back(match.length());
381
+ start_idx = match.position() + match.length();
382
+ ++it;
383
+ }
384
+
385
+ if (start_idx < (int64_t) offset) {
386
+ bpe_offsets.emplace_back(offset - start_idx);
387
+ }
388
+ start += offset;
389
+ }
390
+
391
+ return bpe_offsets;
392
+ }
393
+
394
+ // use std::regex to split the text
395
+ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
396
+ std::regex expr(regex_expr);
397
+ std::vector<size_t> bpe_offsets; // store the offset of each word
398
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
399
+ size_t start = 0;
400
+ for (auto offset : offsets) {
401
+ std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
402
+ std::cregex_iterator end;
403
+
404
+ int64_t start_idx = 0;
405
+ while (it != end) {
406
+ std::cmatch match = *it;
407
+ if (match.position() > start_idx) {
408
+ bpe_offsets.emplace_back(match.position() - start_idx);
409
+ }
410
+ bpe_offsets.emplace_back(match.length());
411
+ start_idx = match.position() + match.length();
412
+ ++it;
413
+ }
414
+
415
+ if (start_idx < (int64_t) offset) {
416
+ bpe_offsets.emplace_back(offset - start_idx);
417
+ }
418
+ start += offset;
419
+ }
420
+
421
+ return bpe_offsets;
422
+ }
423
+
424
+ static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
425
+ std::vector<size_t> bpe_offsets;
426
+
427
+ (void)(text);
428
+ (void)(regex_expr);
429
+ (void)(offsets);
430
+ // TODO: this implementation is actually wrong, uncomment and run:
431
+ // make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
432
+ //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
433
+ // bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
434
+ //}
435
+
436
+ return bpe_offsets;
437
+ }
438
+
197
439
  //
198
440
  // interface
199
441
  //
200
442
 
201
443
  std::string unicode_cpt_to_utf8(uint32_t cp) {
202
444
  std::string result;
445
+
203
446
  if (/* 0x00 <= cp && */ cp <= 0x7f) {
204
447
  result.push_back(cp);
448
+ return result;
205
449
  }
206
- else if (0x80 <= cp && cp <= 0x7ff) {
450
+ if (0x80 <= cp && cp <= 0x7ff) {
207
451
  result.push_back(0xc0 | ((cp >> 6) & 0x1f));
208
452
  result.push_back(0x80 | (cp & 0x3f));
453
+ return result;
209
454
  }
210
- else if (0x800 <= cp && cp <= 0xffff) {
455
+ if (0x800 <= cp && cp <= 0xffff) {
211
456
  result.push_back(0xe0 | ((cp >> 12) & 0x0f));
212
457
  result.push_back(0x80 | ((cp >> 6) & 0x3f));
213
458
  result.push_back(0x80 | (cp & 0x3f));
459
+ return result;
214
460
  }
215
- else if (0x10000 <= cp && cp <= 0x10ffff) {
461
+ if (0x10000 <= cp && cp <= 0x10ffff) {
216
462
  result.push_back(0xf0 | ((cp >> 18) & 0x07));
217
463
  result.push_back(0x80 | ((cp >> 12) & 0x3f));
218
464
  result.push_back(0x80 | ((cp >> 6) & 0x3f));
219
465
  result.push_back(0x80 | (cp & 0x3f));
466
+ return result;
220
467
  }
221
- else {
222
- throw std::invalid_argument("invalid codepoint");
223
- }
224
- return result;
468
+
469
+ throw std::invalid_argument("invalid codepoint");
225
470
  }
226
471
 
227
472
  std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
@@ -275,3 +520,167 @@ char32_t unicode_tolower(char32_t cp) {
275
520
  auto it = unicode_map_lowercase.find(cp);
276
521
  return it == unicode_map_lowercase.end() ? cp : it->second;
277
522
  }
523
+
524
+ std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
525
+ // unicode categories
526
+ static const std::map<std::string, int> k_ucat_enum = {
527
+ { "\\p{N}", CODEPOINT_TYPE_DIGIT },
528
+ { "\\p{L}", CODEPOINT_TYPE_LETTER },
529
+ { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
530
+ };
531
+
532
+ static const std::map<int, int> k_ucat_cpt = {
533
+ { CODEPOINT_TYPE_DIGIT, 0xD1 },
534
+ { CODEPOINT_TYPE_LETTER, 0xD2 },
535
+ { CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
536
+ };
537
+
538
+ static const std::map<int, std::string> k_ucat_map = {
539
+ { CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9
540
+ { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
541
+ { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
542
+ };
543
+
544
+ // compute collapsed codepoints only if needed by at least one regex
545
+ bool need_collapse = false;
546
+ for (auto & regex_expr : regex_exprs) {
547
+ // search for unicode categories
548
+ for (const auto & ucat : k_ucat_enum) {
549
+ if (std::string::npos != regex_expr.find(ucat.first)) {
550
+ need_collapse = true;
551
+ break;
552
+ }
553
+ }
554
+ }
555
+
556
+ const auto cpts = unicode_cpts_from_utf8(text);
557
+
558
+ // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
559
+ // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
560
+ std::string text_collapsed;
561
+ if (need_collapse) {
562
+ // collapse all unicode categories
563
+ text_collapsed.resize(cpts.size());
564
+
565
+ for (size_t i = 0; i < cpts.size(); ++i) {
566
+ // keep single-byte codepoints as is
567
+ if (cpts[i] < 128) {
568
+ text_collapsed[i] = cpts[i];
569
+ continue;
570
+ }
571
+
572
+ const int cpt_type = unicode_cpt_type(cpts[i]);
573
+
574
+ if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) {
575
+ text_collapsed[i] = k_ucat_cpt.at(cpt_type);
576
+ } else {
577
+ text_collapsed[i] = (char) 0xD0; // fallback
578
+ }
579
+ }
580
+ }
581
+
582
+ std::vector<size_t> bpe_offsets = { cpts.size() };
583
+
584
+ for (auto & regex_expr : regex_exprs) {
585
+ // first, see if we have an efficient custom regex implementation
586
+ auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
587
+
588
+ if (!tmp.empty()) {
589
+ bpe_offsets = std::move(tmp);
590
+ continue;
591
+ }
592
+
593
+ // fallback to general-purpose std::regex / std::wregex
594
+ try {
595
+ // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
596
+ // with the corresponding collapsed representation
597
+ bool use_collapsed = false;
598
+ for (auto & ucat : k_ucat_enum) {
599
+ if (std::string::npos != regex_expr.find(ucat.first)) {
600
+ use_collapsed = true;
601
+ break;
602
+ }
603
+ }
604
+
605
+ if (use_collapsed) {
606
+ // sanity-check that the original regex does not contain any non-ASCII characters
607
+ const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
608
+ for (size_t i = 0; i < cpts_regex.size(); ++i) {
609
+ if (cpts_regex[i] >= 128) {
610
+ throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
611
+ }
612
+ }
613
+
614
+ // generate a collapsed representation of the regex
615
+ std::string regex_expr_collapsed;
616
+
617
+ // track if we are inside [], because nested [] are not allowed
618
+ bool inside = false;
619
+ for (size_t i = 0; i < regex_expr.size(); ++i) {
620
+ if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
621
+ regex_expr_collapsed += '[';
622
+ inside = true;
623
+ continue;
624
+ }
625
+
626
+ if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
627
+ regex_expr_collapsed += ']';
628
+ inside = false;
629
+ continue;
630
+ }
631
+
632
+ if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
633
+ regex_expr[i + 1] == 'p' &&
634
+ regex_expr[i + 2] == '{' &&
635
+ regex_expr[i + 4] == '}') {
636
+ const std::string pat = regex_expr.substr(i, 5);
637
+ if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
638
+ if (!inside) {
639
+ regex_expr_collapsed += '[';
640
+ }
641
+ regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
642
+ regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
643
+ if (!inside) {
644
+ regex_expr_collapsed += ']';
645
+ }
646
+ i += 4;
647
+ continue;
648
+ }
649
+ }
650
+
651
+ regex_expr_collapsed += regex_expr[i];
652
+ }
653
+
654
+ //printf("text_collapsed: %s\n", text_collapsed.c_str());
655
+ //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
656
+ bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
657
+ } else {
658
+ // no unicode category used, we can use std::wregex directly
659
+ const std::wstring wtext = unicode_wstring_from_utf8(text);
660
+ const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
661
+
662
+ //printf("text: %s\n", text.c_str());
663
+ //printf("regex_expr: %s\n", regex_expr.c_str());
664
+ bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
665
+ }
666
+ } catch (std::regex_error & e) {
667
+ fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
668
+ fprintf(stderr, "Regex error: %s\n", e.what());
669
+ throw std::runtime_error("Failed to process regex");
670
+ }
671
+ }
672
+
673
+ std::vector<std::string> bpe_words;
674
+ bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size
675
+
676
+ size_t start = 0;
677
+ for (size_t & offset : bpe_offsets) {
678
+ bpe_words.emplace_back();
679
+ for (size_t i = start; i < start + offset; ++i) {
680
+ bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
681
+ }
682
+ start += offset;
683
+ }
684
+
685
+ return unicode_byte_encoding_process(bpe_words);
686
+ }
@@ -24,5 +24,6 @@ int unicode_cpt_type(const std::string & utf8);
24
24
  std::string unicode_byte_to_utf8(uint8_t byte);
25
25
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
26
26
 
27
- // simple tolower that only implements one-to-one mapping, not one-to-many
28
27
  char32_t unicode_tolower(char32_t cp);
28
+
29
+ std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.7
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-04-27 00:00:00.000000000 Z
11
+ date: 2024-05-04 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email:
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
97
  - !ruby/object:Gem::Version
98
98
  version: '0'
99
99
  requirements: []
100
- rubygems_version: 3.5.7
100
+ rubygems_version: 3.5.9
101
101
  signing_key:
102
102
  specification_version: 4
103
103
  summary: Ruby bindings for the llama.cpp.