llama_cpp 0.16.1 → 0.16.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -226,8 +226,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
226
226
  assert(offset_end <= cpts.size());
227
227
  start = offset_end;
228
228
 
229
- auto _get_cpt = [&] (const size_t pos) -> char32_t {
230
- return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
229
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
230
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
231
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
231
232
  };
232
233
 
233
234
  auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@@ -253,18 +254,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253
254
  };
254
255
 
255
256
  for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
256
- const char32_t cpt = _get_cpt(pos);
257
+ const uint32_t cpt = _get_cpt(pos);
257
258
  const auto flags = _get_flags(pos);
258
259
 
259
260
  // regex: 's|'t|'re|'ve|'m|'ll|'d
260
261
  if (cpt == '\'' && pos+1 < offset_end) {
261
- char32_t cpt_next = _get_cpt(pos+1);
262
+ uint32_t cpt_next = _get_cpt(pos+1);
262
263
  if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
263
264
  pos += _add_token(pos+2);
264
265
  continue;
265
266
  }
266
267
  if (pos+2 < offset_end) {
267
- char32_t cpt_next_next = _get_cpt(pos+2);
268
+ uint32_t cpt_next_next = _get_cpt(pos+2);
268
269
  if ((cpt_next == 'r' && cpt_next_next == 'e') ||
269
270
  (cpt_next == 'v' && cpt_next_next == 'e') ||
270
271
  (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -309,7 +310,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
309
310
  }
310
311
 
311
312
  // regex: \s+(?!\S)
312
- if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
313
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
313
314
  pos += num_whitespaces - 1;
314
315
  _add_token(pos);
315
316
  continue;
@@ -344,8 +345,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
344
345
  assert(offset_end <= cpts.size());
345
346
  start = offset_end;
346
347
 
347
- auto _get_cpt = [&] (const size_t pos) -> char32_t {
348
- return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
348
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
349
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
350
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
349
351
  };
350
352
 
351
353
  auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@@ -371,18 +373,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371
373
  };
372
374
 
373
375
  for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
374
- const char32_t cpt = _get_cpt(pos);
376
+ const uint32_t cpt = _get_cpt(pos);
375
377
  const auto flags = _get_flags(pos);
376
378
 
377
379
  // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
378
380
  if (cpt == '\'' && pos+1 < offset_end) {
379
- char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
381
+ uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
380
382
  if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
381
383
  pos += _add_token(pos+2);
382
384
  continue;
383
385
  }
384
386
  if (pos+2 < offset_end) {
385
- char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
387
+ uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
386
388
  if ((cpt_next == 'r' && cpt_next_next == 'e') ||
387
389
  (cpt_next == 'v' && cpt_next_next == 'e') ||
388
390
  (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -424,7 +426,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
424
426
  while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
425
427
  flags2 = _get_flags(++pos);
426
428
  }
427
- char32_t cpt2 = _get_cpt(pos);
429
+ uint32_t cpt2 = _get_cpt(pos);
428
430
  while (cpt2 == '\r' || cpt2 == '\n') {
429
431
  cpt2 = _get_cpt(++pos);
430
432
  }
@@ -435,7 +437,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
435
437
  size_t num_whitespaces = 0;
436
438
  size_t last_end_r_or_n = 0;
437
439
  while (_get_flags(pos+num_whitespaces).is_whitespace) {
438
- char32_t cpt2 = _get_cpt(pos+num_whitespaces);
440
+ uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
439
441
  if (cpt2 == '\r' || cpt2 == '\n') {
440
442
  last_end_r_or_n = pos + num_whitespaces + 1;
441
443
  }
@@ -450,7 +452,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
450
452
  }
451
453
 
452
454
  // regex: \s+(?!\S)
453
- if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
455
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
454
456
  pos += num_whitespaces - 1;
455
457
  _add_token(pos);
456
458
  continue;
@@ -594,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
594
596
 
595
597
  std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
596
598
  std::vector<uint32_t> result;
599
+ result.reserve(utf8.size());
597
600
  size_t offset = 0;
598
601
  while (offset < utf8.size()) {
599
602
  result.push_back(unicode_cpt_from_utf8(utf8, offset));
@@ -626,7 +629,7 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
626
629
  return map.at(utf8);
627
630
  }
628
631
 
629
- char32_t unicode_tolower(char32_t cp) {
632
+ uint32_t unicode_tolower(uint32_t cp) {
630
633
  auto it = unicode_map_lowercase.find(cp);
631
634
  return it == unicode_map_lowercase.end() ? cp : it->second;
632
635
  }
@@ -679,10 +682,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
679
682
  continue;
680
683
  }
681
684
 
682
- const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
685
+ const auto flags = unicode_cpt_flags(cpts[i]);
683
686
 
684
- if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
685
- text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
687
+ if (flags.is_whitespace) {
688
+ //NOTE: C++ std::regex \s does not match 0x85; Rust and Python regex do.
689
+ //text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
690
+ text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback
691
+ } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
692
+ text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
686
693
  } else {
687
694
  text_collapsed[i] = (char) 0xD0; // fallback
688
695
  }
@@ -766,9 +773,16 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
766
773
  bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
767
774
  } else {
768
775
  // no unicode category used, we can use std::wregex directly
769
- const std::wstring wtext = unicode_wstring_from_utf8(text);
770
776
  const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
771
777
 
778
+ // std::wregex \s does not match non-ASCII whitespace, so 0x0B is used as a fallback
779
+ std::wstring wtext(cpts.begin(), cpts.end());
780
+ for (size_t i = 0; i < wtext.size(); ++i) {
781
+ if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
782
+ wtext[i] = 0x0B;
783
+ }
784
+ }
785
+
772
786
  //printf("text: %s\n", text.c_str());
773
787
  //printf("regex_expr: %s\n", regex_expr.c_str());
774
788
  bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
@@ -58,6 +58,6 @@ codepoint_flags unicode_cpt_flags(const std::string & utf8);
58
58
  std::string unicode_byte_to_utf8(uint8_t byte);
59
59
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
60
60
 
61
- char32_t unicode_tolower(char32_t cp);
61
+ uint32_t unicode_tolower(uint32_t cp);
62
62
 
63
63
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.1
4
+ version: 0.16.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-06-15 00:00:00.000000000 Z
11
+ date: 2024-06-22 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email: