llama_cpp 0.16.1 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -226,8 +226,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
226
226
  assert(offset_end <= cpts.size());
227
227
  start = offset_end;
228
228
 
229
- auto _get_cpt = [&] (const size_t pos) -> char32_t {
230
- return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
229
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
230
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
231
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
231
232
  };
232
233
 
233
234
  auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@@ -253,18 +254,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
253
254
  };
254
255
 
255
256
  for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
256
- const char32_t cpt = _get_cpt(pos);
257
+ const uint32_t cpt = _get_cpt(pos);
257
258
  const auto flags = _get_flags(pos);
258
259
 
259
260
  // regex: 's|'t|'re|'ve|'m|'ll|'d
260
261
  if (cpt == '\'' && pos+1 < offset_end) {
261
- char32_t cpt_next = _get_cpt(pos+1);
262
+ uint32_t cpt_next = _get_cpt(pos+1);
262
263
  if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
263
264
  pos += _add_token(pos+2);
264
265
  continue;
265
266
  }
266
267
  if (pos+2 < offset_end) {
267
- char32_t cpt_next_next = _get_cpt(pos+2);
268
+ uint32_t cpt_next_next = _get_cpt(pos+2);
268
269
  if ((cpt_next == 'r' && cpt_next_next == 'e') ||
269
270
  (cpt_next == 'v' && cpt_next_next == 'e') ||
270
271
  (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -309,7 +310,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
309
310
  }
310
311
 
311
312
  // regex: \s+(?!\S)
312
- if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
313
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
313
314
  pos += num_whitespaces - 1;
314
315
  _add_token(pos);
315
316
  continue;
@@ -344,8 +345,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
344
345
  assert(offset_end <= cpts.size());
345
346
  start = offset_end;
346
347
 
347
- auto _get_cpt = [&] (const size_t pos) -> char32_t {
348
- return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
348
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
349
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t {
350
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
349
351
  };
350
352
 
351
353
  auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@@ -371,18 +373,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
371
373
  };
372
374
 
373
375
  for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
374
- const char32_t cpt = _get_cpt(pos);
376
+ const uint32_t cpt = _get_cpt(pos);
375
377
  const auto flags = _get_flags(pos);
376
378
 
377
379
  // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
378
380
  if (cpt == '\'' && pos+1 < offset_end) {
379
- char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
381
+ uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
380
382
  if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
381
383
  pos += _add_token(pos+2);
382
384
  continue;
383
385
  }
384
386
  if (pos+2 < offset_end) {
385
- char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
387
+ uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
386
388
  if ((cpt_next == 'r' && cpt_next_next == 'e') ||
387
389
  (cpt_next == 'v' && cpt_next_next == 'e') ||
388
390
  (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -424,7 +426,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
424
426
  while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
425
427
  flags2 = _get_flags(++pos);
426
428
  }
427
- char32_t cpt2 = _get_cpt(pos);
429
+ uint32_t cpt2 = _get_cpt(pos);
428
430
  while (cpt2 == '\r' || cpt2 == '\n') {
429
431
  cpt2 = _get_cpt(++pos);
430
432
  }
@@ -435,7 +437,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
435
437
  size_t num_whitespaces = 0;
436
438
  size_t last_end_r_or_n = 0;
437
439
  while (_get_flags(pos+num_whitespaces).is_whitespace) {
438
- char32_t cpt2 = _get_cpt(pos+num_whitespaces);
440
+ uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
439
441
  if (cpt2 == '\r' || cpt2 == '\n') {
440
442
  last_end_r_or_n = pos + num_whitespaces + 1;
441
443
  }
@@ -450,7 +452,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
450
452
  }
451
453
 
452
454
  // regex: \s+(?!\S)
453
- if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
455
+ if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
454
456
  pos += num_whitespaces - 1;
455
457
  _add_token(pos);
456
458
  continue;
@@ -594,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
594
596
 
595
597
  std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
596
598
  std::vector<uint32_t> result;
599
+ result.reserve(utf8.size());
597
600
  size_t offset = 0;
598
601
  while (offset < utf8.size()) {
599
602
  result.push_back(unicode_cpt_from_utf8(utf8, offset));
@@ -626,7 +629,7 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
626
629
  return map.at(utf8);
627
630
  }
628
631
 
629
- char32_t unicode_tolower(char32_t cp) {
632
+ uint32_t unicode_tolower(uint32_t cp) {
630
633
  auto it = unicode_map_lowercase.find(cp);
631
634
  return it == unicode_map_lowercase.end() ? cp : it->second;
632
635
  }
@@ -679,10 +682,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
679
682
  continue;
680
683
  }
681
684
 
682
- const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
685
+ const auto flags = unicode_cpt_flags(cpts[i]);
683
686
 
684
- if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
685
- text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
687
+ if (flags.is_whitespace) {
688
+ //NOTE: C++ std::regex \s does not match 0x85; Rust and Python regexes do.
689
+ //text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
690
+ text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback
691
+ } else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
692
+ text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
686
693
  } else {
687
694
  text_collapsed[i] = (char) 0xD0; // fallback
688
695
  }
@@ -766,9 +773,16 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
766
773
  bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
767
774
  } else {
768
775
  // no unicode category used, we can use std::wregex directly
769
- const std::wstring wtext = unicode_wstring_from_utf8(text);
770
776
  const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
771
777
 
778
+ // std::wregex \s does not match non-ASCII whitespace; using 0x0B as fallback
779
+ std::wstring wtext(cpts.begin(), cpts.end());
780
+ for (size_t i = 0; i < wtext.size(); ++i) {
781
+ if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
782
+ wtext[i] = 0x0B;
783
+ }
784
+ }
785
+
772
786
  //printf("text: %s\n", text.c_str());
773
787
  //printf("regex_expr: %s\n", regex_expr.c_str());
774
788
  bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
@@ -58,6 +58,6 @@ codepoint_flags unicode_cpt_flags(const std::string & utf8);
58
58
  std::string unicode_byte_to_utf8(uint8_t byte);
59
59
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
60
60
 
61
- char32_t unicode_tolower(char32_t cp);
61
+ uint32_t unicode_tolower(uint32_t cp);
62
62
 
63
63
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.16.1
4
+ version: 0.16.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-06-15 00:00:00.000000000 Z
11
+ date: 2024-06-22 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email: