llama_cpp 0.15.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,8 +5,9 @@
5
5
  #include <utility>
6
6
  #include <vector>
7
7
 
8
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit;
8
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
9
9
  extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
10
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
10
11
  extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
11
12
  extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
12
13
  extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
@@ -9,6 +9,7 @@
9
9
  #include <stdexcept>
10
10
  #include <string>
11
11
  #include <unordered_map>
12
+ #include <unordered_set>
12
13
  #include <utility>
13
14
  #include <vector>
14
15
  #include <locale>
@@ -110,28 +111,28 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
110
111
 
111
112
  static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
112
113
  std::unordered_map<uint32_t, int> cpt_types;
113
- for (auto p : unicode_ranges_digit) {
114
- for (auto i = p.first; i <= p.second; ++ i) {
115
- cpt_types[i] = CODEPOINT_TYPE_DIGIT;
114
+ for (auto p : unicode_ranges_number) {
115
+ for (auto i = p.first; i <= p.second; ++i) {
116
+ cpt_types[i] = CODEPOINT_TYPE_NUMBER;
116
117
  }
117
118
  }
118
119
  for (auto p : unicode_ranges_letter) {
119
- for (auto i = p.first; i <= p.second; ++ i) {
120
+ for (auto i = p.first; i <= p.second; ++i) {
120
121
  cpt_types[i] = CODEPOINT_TYPE_LETTER;
121
122
  }
122
123
  }
123
- for (auto p : unicode_ranges_whitespace) {
124
- for (auto i = p.first; i <= p.second; ++ i) {
125
- cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
124
+ for (auto p : unicode_ranges_separator) {
125
+ for (auto i = p.first; i <= p.second; ++i) {
126
+ cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
126
127
  }
127
128
  }
128
129
  for (auto p : unicode_ranges_accent_mark) {
129
- for (auto i = p.first; i <= p.second; ++ i) {
130
+ for (auto i = p.first; i <= p.second; ++i) {
130
131
  cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
131
132
  }
132
133
  }
133
134
  for (auto p : unicode_ranges_punctuation) {
134
- for (auto i = p.first; i <= p.second; ++ i) {
135
+ for (auto i = p.first; i <= p.second; ++i) {
135
136
  cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
136
137
  }
137
138
  }
@@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
141
142
  }
142
143
  }
143
144
  for (auto p : unicode_ranges_control) {
144
- for (auto i = p.first; i <= p.second; ++ i) {
145
+ for (auto i = p.first; i <= p.second; ++i) {
145
146
  cpt_types[i] = CODEPOINT_TYPE_CONTROL;
146
147
  }
147
148
  }
@@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
224
225
  std::vector<size_t> bpe_offsets; // store the offset of each word
225
226
  bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
226
227
 
227
- size_t start = 0;
228
-
229
228
  const auto cpts = unicode_cpts_from_utf8(text);
230
229
 
230
+ size_t start = 0;
231
231
  for (auto offset : offsets) {
232
- std::string token;
232
+ const size_t offset_ini = start;
233
+ const size_t offset_end = start + offset;
234
+ assert(offset_end <= cpts.size());
235
+ start = offset_end;
236
+
237
+ auto _get_cpt = [&] (const size_t pos) -> char32_t {
238
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
239
+ };
240
+
241
+ auto _get_cpt_type = [&] (const size_t pos) -> int {
242
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
243
+ };
244
+
245
+ size_t _prev_end = offset_ini;
246
+ auto _add_token = [&] (const size_t end) -> size_t {
247
+ assert(_prev_end <= end && end <= offset_end);
248
+ size_t len = end - _prev_end;
249
+ if (len > 0) {
250
+ bpe_offsets.push_back(len);
251
+ }
252
+ _prev_end = end;
253
+ //if (len > 0) {
254
+ // std::string s = "";
255
+ // for(size_t p = end-len; p < end; p++)
256
+ // s += unicode_cpt_to_utf8(cpts[p]);
257
+ // printf(">>> '%s'\n", s.c_str());
258
+ //}
259
+ return len;
260
+ };
261
+
262
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
263
+ const char32_t cpt = _get_cpt(pos);
264
+ const int cpt_type = _get_cpt_type(pos);
265
+
266
+ // regex: 's|'t|'re|'ve|'m|'ll|'d
267
+ if (cpt == '\'' && pos+1 < offset_end) {
268
+ char32_t cpt_next = _get_cpt(pos+1);
269
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
270
+ pos += _add_token(pos+2);
271
+ continue;
272
+ }
273
+ if (pos+2 < offset_end) {
274
+ char32_t cpt_next_next = _get_cpt(pos+2);
275
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
276
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
277
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
278
+ pos += _add_token(pos+3);
279
+ continue;
280
+ }
281
+ }
282
+ }
233
283
 
234
- bool collecting_numeric = false;
235
- bool collecting_letter = false;
236
- bool collecting_special = false;
237
- bool collecting_whitespace_lookahead = false;
238
- bool collecting = false;
284
+ char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
285
+ int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
286
+ // regex: <space>?\p{L}+
287
+ if (cpt2_type == CODEPOINT_TYPE_LETTER) {
288
+ pos += (cpt == ' ');
289
+ while (cpt2_type == CODEPOINT_TYPE_LETTER) {
290
+ cpt2_type = _get_cpt_type(++pos);
291
+ }
292
+ _add_token(pos);
293
+ continue;
294
+ }
295
+ // regex: <space>?\p{N}+
296
+ if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
297
+ pos += (cpt == ' ');
298
+ while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
299
+ cpt2_type = _get_cpt_type(++pos);
300
+ }
301
+ _add_token(pos);
302
+ continue;
303
+ }
304
+ // regex: <space>?[^\s\p{L}\p{N}]+
305
+ if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
306
+ pos += (cpt == ' ');
307
+ while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
308
+ cpt2_type = _get_cpt_type(++pos);
309
+ cpt2 = _get_cpt(pos);
310
+ }
311
+ _add_token(pos);
312
+ continue;
313
+ }
239
314
 
240
- std::vector<std::string> text_utf;
241
- text_utf.reserve(offset);
315
+ size_t num_whitespaces = 0;
316
+ while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
317
+ num_whitespaces++;
318
+ }
242
319
 
243
- for (size_t i = start; i < start + offset; ++i) {
244
- text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
320
+ // regex: \s+(?!\S)
321
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
322
+ pos += num_whitespaces - 1;
323
+ _add_token(pos);
324
+ continue;
325
+ }
326
+
327
+ // regex: \s+
328
+ if (num_whitespaces > 0) {
329
+ pos += num_whitespaces;
330
+ _add_token(pos);
331
+ continue;
332
+ }
333
+
334
+ // no matches
335
+ _add_token(++pos);
245
336
  }
337
+ }
338
+
339
+ return bpe_offsets;
340
+ }
246
341
 
247
- for (int i = 0; i < (int)text_utf.size(); i++) {
248
- const std::string & utf_char = text_utf[i];
249
- bool split_condition = false;
250
- int bytes_remain = text_utf.size() - i;
342
+ // LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
343
+ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
344
+ std::vector<size_t> bpe_offsets; // store the offset of each word
345
+ bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
251
346
 
252
- // forward backward lookups
253
- const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
254
- const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
347
+ const auto cpts = unicode_cpts_from_utf8(text);
255
348
 
256
- // handling contractions
257
- if (!split_condition && bytes_remain >= 2) {
258
- // 's|'t|'m|'d
259
- if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
260
- split_condition = true;
349
+ size_t start = 0;
350
+ for (auto offset : offsets) {
351
+ const size_t offset_ini = start;
352
+ const size_t offset_end = start + offset;
353
+ assert(offset_end <= cpts.size());
354
+ start = offset_end;
355
+
356
+ auto _get_cpt = [&] (const size_t pos) -> char32_t {
357
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
358
+ };
359
+
360
+ auto _get_cpt_type = [&] (const size_t pos) -> int {
361
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
362
+ };
363
+
364
+ size_t _prev_end = offset_ini;
365
+ auto _add_token = [&] (const size_t end) -> size_t {
366
+ assert(_prev_end <= end && end <= offset_end);
367
+ size_t len = end - _prev_end;
368
+ if (len > 0) {
369
+ bpe_offsets.push_back(len);
370
+ }
371
+ _prev_end = end;
372
+ //if (len > 0) {
373
+ // std::string s = "";
374
+ // for(size_t p = end-len; p < end; p++)
375
+ // s += unicode_cpt_to_utf8(cpts[p]);
376
+ // printf(">>> '%s'\n", s.c_str());
377
+ //}
378
+ return len;
379
+ };
380
+
381
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
382
+ const char32_t cpt = _get_cpt(pos);
383
+ const int cpt_type = _get_cpt_type(pos);
384
+
385
+ // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
386
+ if (cpt == '\'' && pos+1 < offset_end) {
387
+ char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
388
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
389
+ pos += _add_token(pos+2);
390
+ continue;
261
391
  }
262
- if (split_condition) {
263
- if (token.size()) {
264
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
392
+ if (pos+2 < offset_end) {
393
+ char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
394
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
395
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
396
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
397
+ pos += _add_token(pos+3);
398
+ continue;
265
399
  }
266
- token = utf_char + utf_char_next;
267
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
268
- token = "";
269
- i++;
270
- continue;
271
400
  }
272
401
  }
273
- if (!split_condition && bytes_remain >= 3) {
274
- // 're|'ve|'ll
275
- if (utf_char == "\'" && (
276
- (utf_char_next == "r" && utf_char_next_next == "e") ||
277
- (utf_char_next == "v" && utf_char_next_next == "e") ||
278
- (utf_char_next == "l" && utf_char_next_next == "l"))
279
- ) {
280
- split_condition = true;
281
- }
282
- if (split_condition) {
283
- // current token + next token can be defined
284
- if (token.size()) {
285
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
286
- }
287
- token = utf_char;
288
- token += utf_char_next;
289
- token += utf_char_next_next;
290
402
 
291
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
292
- token = "";
293
- i += 2;
403
+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
404
+ if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
405
+ if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
406
+ pos++;
407
+ while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
408
+ pos++;
409
+ }
410
+ _add_token(pos);
294
411
  continue;
295
412
  }
296
413
  }
297
414
 
298
- if (!split_condition && !collecting) {
299
- if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
300
- collecting_letter = true;
301
- collecting = true;
302
- }
303
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
304
- collecting_numeric = true;
305
- collecting = true;
306
- }
307
- else if (
308
- ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
309
- (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
310
- ) {
311
- collecting_special = true;
312
- collecting = true;
313
- }
314
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
315
- collecting_whitespace_lookahead = true;
316
- collecting = true;
317
- }
318
- else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
319
- split_condition = true;
415
+ // regex: \p{N}{1,3}
416
+ if (cpt_type == CODEPOINT_TYPE_NUMBER) {
417
+ size_t ini = pos;
418
+ while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
419
+ if (++pos - ini >= 3 ) {
420
+ _add_token(pos);
421
+ ini = pos;
422
+ }
320
423
  }
424
+ _add_token(pos);
425
+ continue;
321
426
  }
322
- else if (!split_condition && collecting) {
323
- if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
324
- split_condition = true;
325
- }
326
- else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
327
- split_condition = true;
427
+
428
+ // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
429
+ char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
430
+ int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
431
+ if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
432
+ pos += (cpt == ' ');
433
+ while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
434
+ cpt2_type = _get_cpt_type(++pos);
435
+ cpt2 = _get_cpt(pos);
328
436
  }
329
- else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
330
- split_condition = true;
437
+ while (cpt2 == '\r' || cpt2 == '\n') {
438
+ cpt2 = _get_cpt(++pos);
331
439
  }
332
- else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
333
- split_condition = true;
440
+ _add_token(pos);
441
+ continue;
442
+ }
443
+
444
+ size_t num_whitespaces = 0;
445
+ size_t last_end_r_or_n = 0;
446
+ while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
447
+ char32_t cpt2 = _get_cpt(pos+num_whitespaces);
448
+ if (cpt2 == '\r' || cpt2 == '\n') {
449
+ last_end_r_or_n = pos + num_whitespaces + 1;
334
450
  }
451
+ num_whitespaces++;
335
452
  }
336
453
 
337
- if (utf_char_next == "") {
338
- split_condition = true; // final
339
- token += utf_char;
454
+ // regex: \s*[\r\n]+
455
+ if (last_end_r_or_n > 0) {
456
+ pos = last_end_r_or_n;
457
+ _add_token(pos);
458
+ continue;
340
459
  }
341
460
 
342
- if (split_condition) {
343
- if (token.size()) {
344
- bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
345
- }
346
- token = utf_char;
347
- collecting = false;
348
- collecting_letter = false;
349
- collecting_numeric = false;
350
- collecting_special = false;
351
- collecting_whitespace_lookahead = false;
461
+ // regex: \s+(?!\S)
462
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
463
+ pos += num_whitespaces - 1;
464
+ _add_token(pos);
465
+ continue;
352
466
  }
353
- else {
354
- token += utf_char;
467
+
468
+ // regex: \s+
469
+ if (num_whitespaces > 0) {
470
+ pos += num_whitespaces;
471
+ _add_token(pos);
472
+ continue;
355
473
  }
356
- }
357
474
 
358
- start += offset;
475
+ // no matches
476
+ _add_token(++pos);
477
+ }
359
478
  }
360
479
 
361
480
  return bpe_offsets;
@@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
424
543
  static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
425
544
  std::vector<size_t> bpe_offsets;
426
545
 
427
- (void)(text);
428
- (void)(regex_expr);
429
- (void)(offsets);
430
- // TODO: this implementation is actually wrong, uncomment and run:
431
- // make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
432
- //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
433
- // bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
434
- //}
546
+ if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
547
+ bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
548
+ } else if (
549
+ regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
550
+ regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
551
+
552
+ bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
553
+ }
435
554
 
436
555
  return bpe_offsets;
437
556
  }
@@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
506
625
  return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
507
626
  }
508
627
 
628
+ bool unicode_cpt_is_whitespace(uint32_t cp) {
629
+ static const std::unordered_set<uint32_t> is_whitespace = [] {
630
+ std::unordered_set<uint32_t> is_whitespace;
631
+ for (auto p : unicode_ranges_whitespace) {
632
+ for (auto i = p.first; i <= p.second; ++i) {
633
+ is_whitespace.insert(i);
634
+ }
635
+ }
636
+ return is_whitespace;
637
+ }();
638
+ return (bool)is_whitespace.count(cp);
639
+ }
640
+
509
641
  std::string unicode_byte_to_utf8(uint8_t byte) {
510
642
  static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
511
643
  return map.at(byte);
@@ -524,19 +656,19 @@ char32_t unicode_tolower(char32_t cp) {
524
656
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
525
657
  // unicode categories
526
658
  static const std::map<std::string, int> k_ucat_enum = {
527
- { "\\p{N}", CODEPOINT_TYPE_DIGIT },
659
+ { "\\p{N}", CODEPOINT_TYPE_NUMBER },
528
660
  { "\\p{L}", CODEPOINT_TYPE_LETTER },
529
661
  { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
530
662
  };
531
663
 
532
664
  static const std::map<int, int> k_ucat_cpt = {
533
- { CODEPOINT_TYPE_DIGIT, 0xD1 },
665
+ { CODEPOINT_TYPE_NUMBER, 0xD1 },
534
666
  { CODEPOINT_TYPE_LETTER, 0xD2 },
535
667
  { CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
536
668
  };
537
669
 
538
670
  static const std::map<int, std::string> k_ucat_map = {
539
- { CODEPOINT_TYPE_DIGIT, "\x30-\x39" }, // 0-9
671
+ { CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9
540
672
  { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
541
673
  { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
542
674
  };
@@ -5,9 +5,9 @@
5
5
  #include <vector>
6
6
 
7
7
  #define CODEPOINT_TYPE_UNIDENTIFIED 0
8
- #define CODEPOINT_TYPE_DIGIT 1
8
+ #define CODEPOINT_TYPE_NUMBER 1
9
9
  #define CODEPOINT_TYPE_LETTER 2
10
- #define CODEPOINT_TYPE_WHITESPACE 3
10
+ #define CODEPOINT_TYPE_SEPARATOR 3
11
11
  #define CODEPOINT_TYPE_ACCENT_MARK 4
12
12
  #define CODEPOINT_TYPE_PUNCTUATION 5
13
13
  #define CODEPOINT_TYPE_SYMBOL 6
@@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
21
21
  int unicode_cpt_type(uint32_t cp);
22
22
  int unicode_cpt_type(const std::string & utf8);
23
23
 
24
+ bool unicode_cpt_is_whitespace(uint32_t cp);
25
+
24
26
  std::string unicode_byte_to_utf8(uint8_t byte);
25
27
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
26
28
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.0
4
+ version: 0.15.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-04 00:00:00.000000000 Z
11
+ date: 2024-05-11 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email: