llama_cpp 0.15.0 → 0.15.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,8 +5,9 @@
5
5
  #include <utility>
6
6
  #include <vector>
7
7
 
8
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
8
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
9
9
  extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
10
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
10
11
  extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
11
12
  extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
12
13
  extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
@@ -9,6 +9,7 @@
9
9
  #include <stdexcept>
10
10
  #include <string>
11
11
  #include <unordered_map>
12
+ #include <unordered_set>
12
13
  #include <utility>
13
14
  #include <vector>
14
15
  #include <locale>
@@ -110,28 +111,28 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
110
111
 
111
112
  static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
112
113
  std::unordered_map<uint32_t, int> cpt_types;
113
- for (auto p : unicode_ranges_digit) {
114
- for (auto i = p.first; i <= p.second; ++ i) {
115
- cpt_types[i] = CODEPOINT_TYPE_DIGIT;
114
+ for (auto p : unicode_ranges_number) {
115
+ for (auto i = p.first; i <= p.second; ++i) {
116
+ cpt_types[i] = CODEPOINT_TYPE_NUMBER;
116
117
  }
117
118
  }
118
119
  for (auto p : unicode_ranges_letter) {
119
- for (auto i = p.first; i <= p.second; ++ i) {
120
+ for (auto i = p.first; i <= p.second; ++i) {
120
121
  cpt_types[i] = CODEPOINT_TYPE_LETTER;
121
122
  }
122
123
  }
123
- for (auto p : unicode_ranges_whitespace) {
124
- for (auto i = p.first; i <= p.second; ++ i) {
125
- cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
124
+ for (auto p : unicode_ranges_separator) {
125
+ for (auto i = p.first; i <= p.second; ++i) {
126
+ cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
126
127
  }
127
128
  }
128
129
  for (auto p : unicode_ranges_accent_mark) {
129
- for (auto i = p.first; i <= p.second; ++ i) {
130
+ for (auto i = p.first; i <= p.second; ++i) {
130
131
  cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
131
132
  }
132
133
  }
133
134
  for (auto p : unicode_ranges_punctuation) {
134
- for (auto i = p.first; i <= p.second; ++ i) {
135
+ for (auto i = p.first; i <= p.second; ++i) {
135
136
  cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
136
137
  }
137
138
  }
@@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
141
142
  }
142
143
  }
143
144
  for (auto p : unicode_ranges_control) {
144
- for (auto i = p.first; i <= p.second; ++ i) {
145
+ for (auto i = p.first; i <= p.second; ++i) {
145
146
  cpt_types[i] = CODEPOINT_TYPE_CONTROL;
146
147
  }
147
148
  }
@@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
224
225
  std::vector<size_t> bpe_offsets; // store the offset of each word
225
226
  bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
226
227
 
227
- size_t start = 0;
228
-
229
228
  const auto cpts = unicode_cpts_from_utf8(text);
230
229
 
230
+ size_t start = 0;
231
231
  for (auto offset : offsets) {
232
- std::string token;
232
+ const size_t offset_ini = start;
233
+ const size_t offset_end = start + offset;
234
+ assert(offset_end <= cpts.size());
235
+ start = offset_end;
236
+
237
+ auto _get_cpt = [&] (const size_t pos) -> char32_t {
238
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
239
+ };
240
+
241
+ auto _get_cpt_type = [&] (const size_t pos) -> int {
242
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
243
+ };
244
+
245
+ size_t _prev_end = offset_ini;
246
+ auto _add_token = [&] (const size_t end) -> size_t {
247
+ assert(_prev_end <= end && end <= offset_end);
248
+ size_t len = end - _prev_end;
249
+ if (len > 0) {
250
+ bpe_offsets.push_back(len);
251
+ }
252
+ _prev_end = end;
253
+ //if (len > 0) {
254
+ // std::string s = "";
255
+ // for(size_t p = end-len; p < end; p++)
256
+ // s += unicode_cpt_to_utf8(cpts[p]);
257
+ // printf(">>> '%s'\n", s.c_str());
258
+ //}
259
+ return len;
260
+ };
261
+
262
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
263
+ const char32_t cpt = _get_cpt(pos);
264
+ const int cpt_type = _get_cpt_type(pos);
265
+
266
+ // regex: 's|'t|'re|'ve|'m|'ll|'d
267
+ if (cpt == '\'' && pos+1 < offset_end) {
268
+ char32_t cpt_next = _get_cpt(pos+1);
269
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
270
+ pos += _add_token(pos+2);
271
+ continue;
272
+ }
273
+ if (pos+2 < offset_end) {
274
+ char32_t cpt_next_next = _get_cpt(pos+2);
275
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
276
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
277
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
278
+ pos += _add_token(pos+3);
279
+ continue;
280
+ }
281
+ }
282
+ }
233
283
 
234
- bool collecting_numeric = false;
235
- bool collecting_letter = false;
236
- bool collecting_special = false;
237
- bool collecting_whitespace_lookahead = false;
238
- bool collecting = false;
284
+ char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
285
+ int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
286
+ // regex: <space>?\p{L}+
287
+ if (cpt2_type == CODEPOINT_TYPE_LETTER) {
288
+ pos += (cpt == ' ');
289
+ while (cpt2_type == CODEPOINT_TYPE_LETTER) {
290
+ cpt2_type = _get_cpt_type(++pos);
291
+ }
292
+ _add_token(pos);
293
+ continue;
294
+ }
295
+ // regex: <space>?\p{N}+
296
+ if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
297
+ pos += (cpt == ' ');
298
+ while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
299
+ cpt2_type = _get_cpt_type(++pos);
300
+ }
301
+ _add_token(pos);
302
+ continue;
303
+ }
304
+ // regex: <space>?[^\s\p{L}\p{N}]+
305
+ if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
306
+ pos += (cpt == ' ');
307
+ while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
308
+ cpt2_type = _get_cpt_type(++pos);
309
+ cpt2 = _get_cpt(pos);
310
+ }
311
+ _add_token(pos);
312
+ continue;
313
+ }
239
314
 
240
- std::vector<std::string> text_utf;
241
- text_utf.reserve(offset);
315
+ size_t num_whitespaces = 0;
316
+ while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
317
+ num_whitespaces++;
318
+ }
242
319
 
243
- for (size_t i = start; i < start + offset; ++i) {
244
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
320
+ // regex: \s+(?!\S)
321
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
322
+ pos += num_whitespaces - 1;
323
+ _add_token(pos);
324
+ continue;
325
+ }
326
+
327
+ // regex: \s+
328
+ if (num_whitespaces > 0) {
329
+ pos += num_whitespaces;
330
+ _add_token(pos);
331
+ continue;
332
+ }
333
+
334
+ // no matches
335
+ _add_token(++pos);
245
336
  }
337
+ }
338
+
339
+ return bpe_offsets;
340
+ }
246
341
 
247
- for (int i = 0; i < (int)text_utf.size(); i++) {
248
- const std::string & utf_char = text_utf[i];
249
- bool split_condition = false;
250
- int bytes_remain = text_utf.size() - i;
342
+ // LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
343
+ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
344
+ std::vector<size_t> bpe_offsets; // store the offset of each word
345
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
251
346
 
252
- // forward backward lookups
253
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
254
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
347
+ const auto cpts = unicode_cpts_from_utf8(text);
255
348
 
256
- // handling contractions
257
- if (!split_condition && bytes_remain >= 2) {
258
- // 's|'t|'m|'d
259
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
260
- split_condition = true;
349
+ size_t start = 0;
350
+ for (auto offset : offsets) {
351
+ const size_t offset_ini = start;
352
+ const size_t offset_end = start + offset;
353
+ assert(offset_end <= cpts.size());
354
+ start = offset_end;
355
+
356
+ auto _get_cpt = [&] (const size_t pos) -> char32_t {
357
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
358
+ };
359
+
360
+ auto _get_cpt_type = [&] (const size_t pos) -> int {
361
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
362
+ };
363
+
364
+ size_t _prev_end = offset_ini;
365
+ auto _add_token = [&] (const size_t end) -> size_t {
366
+ assert(_prev_end <= end && end <= offset_end);
367
+ size_t len = end - _prev_end;
368
+ if (len > 0) {
369
+ bpe_offsets.push_back(len);
370
+ }
371
+ _prev_end = end;
372
+ //if (len > 0) {
373
+ // std::string s = "";
374
+ // for(size_t p = end-len; p < end; p++)
375
+ // s += unicode_cpt_to_utf8(cpts[p]);
376
+ // printf(">>> '%s'\n", s.c_str());
377
+ //}
378
+ return len;
379
+ };
380
+
381
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
382
+ const char32_t cpt = _get_cpt(pos);
383
+ const int cpt_type = _get_cpt_type(pos);
384
+
385
+ // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
386
+ if (cpt == '\'' && pos+1 < offset_end) {
387
+ char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
388
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
389
+ pos += _add_token(pos+2);
390
+ continue;
261
391
  }
262
- if (split_condition) {
263
- if (token.size()) {
264
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
392
+ if (pos+2 < offset_end) {
393
+ char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
394
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
395
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
396
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
397
+ pos += _add_token(pos+3);
398
+ continue;
265
399
  }
266
- token = utf_char + utf_char_next;
267
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
268
- token = "";
269
- i++;
270
- continue;
271
400
  }
272
401
  }
273
- if (!split_condition && bytes_remain >= 3) {
274
- // 're|'ve|'ll
275
- if (utf_char == "\'" && (
276
- (utf_char_next == "r" && utf_char_next_next == "e") ||
277
- (utf_char_next == "v" && utf_char_next_next == "e") ||
278
- (utf_char_next == "l" && utf_char_next_next == "l"))
279
- ) {
280
- split_condition = true;
281
- }
282
- if (split_condition) {
283
- // current token + next token can be defined
284
- if (token.size()) {
285
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
286
- }
287
- token = utf_char;
288
- token += utf_char_next;
289
- token += utf_char_next_next;
290
402
 
291
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
292
- token = "";
293
- i += 2;
403
+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
404
+ if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
405
+ if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
406
+ pos++;
407
+ while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
408
+ pos++;
409
+ }
410
+ _add_token(pos);
294
411
  continue;
295
412
  }
296
413
  }
297
414
 
298
- if (!split_condition && !collecting) {
299
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
300
- collecting_letter = true;
301
- collecting = true;
302
- }
303
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
304
- collecting_numeric = true;
305
- collecting = true;
306
- }
307
- else if (
308
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
309
- (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
310
- ) {
311
- collecting_special = true;
312
- collecting = true;
313
- }
314
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
315
- collecting_whitespace_lookahead = true;
316
- collecting = true;
317
- }
318
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
319
- split_condition = true;
415
+ // regex: \p{N}{1,3}
416
+ if (cpt_type == CODEPOINT_TYPE_NUMBER) {
417
+ size_t ini = pos;
418
+ while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
419
+ if (++pos - ini >= 3 ) {
420
+ _add_token(pos);
421
+ ini = pos;
422
+ }
320
423
  }
424
+ _add_token(pos);
425
+ continue;
321
426
  }
322
- else if (!split_condition && collecting) {
323
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
324
- split_condition = true;
325
- }
326
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
327
- split_condition = true;
427
+
428
+ // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
429
+ char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
430
+ int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
431
+ if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
432
+ pos += (cpt == ' ');
433
+ while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
434
+ cpt2_type = _get_cpt_type(++pos);
435
+ cpt2 = _get_cpt(pos);
328
436
  }
329
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
330
- split_condition = true;
437
+ while (cpt2 == '\r' || cpt2 == '\n') {
438
+ cpt2 = _get_cpt(++pos);
331
439
  }
332
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
333
- split_condition = true;
440
+ _add_token(pos);
441
+ continue;
442
+ }
443
+
444
+ size_t num_whitespaces = 0;
445
+ size_t last_end_r_or_n = 0;
446
+ while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
447
+ char32_t cpt2 = _get_cpt(pos+num_whitespaces);
448
+ if (cpt2 == '\r' || cpt2 == '\n') {
449
+ last_end_r_or_n = pos + num_whitespaces + 1;
334
450
  }
451
+ num_whitespaces++;
335
452
  }
336
453
 
337
- if (utf_char_next == "") {
338
- split_condition = true; // final
339
- token += utf_char;
454
+ // regex: \s*[\r\n]+
455
+ if (last_end_r_or_n > 0) {
456
+ pos = last_end_r_or_n;
457
+ _add_token(pos);
458
+ continue;
340
459
  }
341
460
 
342
- if (split_condition) {
343
- if (token.size()) {
344
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
345
- }
346
- token = utf_char;
347
- collecting = false;
348
- collecting_letter = false;
349
- collecting_numeric = false;
350
- collecting_special = false;
351
- collecting_whitespace_lookahead = false;
461
+ // regex: \s+(?!\S)
462
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
463
+ pos += num_whitespaces - 1;
464
+ _add_token(pos);
465
+ continue;
352
466
  }
353
- else {
354
- token += utf_char;
467
+
468
+ // regex: \s+
469
+ if (num_whitespaces > 0) {
470
+ pos += num_whitespaces;
471
+ _add_token(pos);
472
+ continue;
355
473
  }
356
- }
357
474
 
358
- start += offset;
475
+ // no matches
476
+ _add_token(++pos);
477
+ }
359
478
  }
360
479
 
361
480
  return bpe_offsets;
@@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
424
543
  static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
425
544
  std::vector<size_t> bpe_offsets;
426
545
 
427
- (void)(text);
428
- (void)(regex_expr);
429
- (void)(offsets);
430
- // TODO: this implementation is actually wrong, uncomment and run:
431
- // make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
432
- //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
433
- // bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
434
- //}
546
+ if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
547
+ bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
548
+ } else if (
549
+ regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
550
+ regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
551
+
552
+ bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
553
+ }
435
554
 
436
555
  return bpe_offsets;
437
556
  }
@@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
506
625
  return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
507
626
  }
508
627
 
628
+ bool unicode_cpt_is_whitespace(uint32_t cp) {
629
+ static const std::unordered_set<uint32_t> is_whitespace = [] {
630
+ std::unordered_set<uint32_t> is_whitespace;
631
+ for (auto p : unicode_ranges_whitespace) {
632
+ for (auto i = p.first; i <= p.second; ++i) {
633
+ is_whitespace.insert(i);
634
+ }
635
+ }
636
+ return is_whitespace;
637
+ }();
638
+ return (bool)is_whitespace.count(cp);
639
+ }
640
+
509
641
  std::string unicode_byte_to_utf8(uint8_t byte) {
510
642
  static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
511
643
  return map.at(byte);
@@ -524,19 +656,19 @@ char32_t unicode_tolower(char32_t cp) {
524
656
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
525
657
  // unicode categories
526
658
  static const std::map<std::string, int> k_ucat_enum = {
527
- { "\\p{N}", CODEPOINT_TYPE_DIGIT },
659
+ { "\\p{N}", CODEPOINT_TYPE_NUMBER },
528
660
  { "\\p{L}", CODEPOINT_TYPE_LETTER },
529
661
  { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
530
662
  };
531
663
 
532
664
  static const std::map<int, int> k_ucat_cpt = {
533
- { CODEPOINT_TYPE_DIGIT, 0xD1 },
665
+ { CODEPOINT_TYPE_NUMBER, 0xD1 },
534
666
  { CODEPOINT_TYPE_LETTER, 0xD2 },
535
667
  { CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
536
668
  };
537
669
 
538
670
  static const std::map<int, std::string> k_ucat_map = {
539
- { CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9
671
+ { CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9
540
672
  { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
541
673
  { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
542
674
  };
@@ -5,9 +5,9 @@
5
5
  #include <vector>
6
6
 
7
7
  #define CODEPOINT_TYPE_UNIDENTIFIED 0
8
- #define CODEPOINT_TYPE_DIGIT 1
8
+ #define CODEPOINT_TYPE_NUMBER 1
9
9
  #define CODEPOINT_TYPE_LETTER 2
10
- #define CODEPOINT_TYPE_WHITESPACE 3
10
+ #define CODEPOINT_TYPE_SEPARATOR 3
11
11
  #define CODEPOINT_TYPE_ACCENT_MARK 4
12
12
  #define CODEPOINT_TYPE_PUNCTUATION 5
13
13
  #define CODEPOINT_TYPE_SYMBOL 6
@@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
21
21
  int unicode_cpt_type(uint32_t cp);
22
22
  int unicode_cpt_type(const std::string & utf8);
23
23
 
24
+ bool unicode_cpt_is_whitespace(uint32_t cp);
25
+
24
26
  std::string unicode_byte_to_utf8(uint8_t byte);
25
27
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
26
28
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.0
4
+ version: 0.15.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-04 00:00:00.000000000 Z
11
+ date: 2024-05-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email: