llama_cpp 0.15.1 → 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,20 @@
1
1
  #pragma once
2
2
 
3
3
  #include <cstdint>
4
- #include <map>
5
- #include <utility>
6
4
  #include <vector>
5
+ #include <unordered_map>
6
+ #include <unordered_set>
7
7
 
8
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
9
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
10
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
11
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
12
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
13
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
14
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
15
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
16
- extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
17
- extern const std::map<char32_t, char32_t> unicode_map_lowercase;
8
+ struct range_nfd {
9
+ uint32_t first;
10
+ uint32_t last;
11
+ uint32_t nfd;
12
+ };
13
+
14
+ static const uint32_t MAX_CODEPOINTS = 0x110000;
15
+
16
+ extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
17
+ extern const std::unordered_set<uint32_t> unicode_set_whitespace;
18
+ extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
19
+ extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
20
+ extern const std::vector<range_nfd> unicode_ranges_nfd;
@@ -1,4 +1,4 @@
1
- #include "unicode.h"
1
+ #include "unicode.h"
2
2
  #include "unicode-data.h"
3
3
 
4
4
  #include <cassert>
@@ -109,57 +109,49 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
109
109
  // return result;
110
110
  //}
111
111
 
112
- static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
113
- std::unordered_map<uint32_t, int> cpt_types;
114
- for (auto p : unicode_ranges_number) {
115
- for (auto i = p.first; i <= p.second; ++i) {
116
- cpt_types[i] = CODEPOINT_TYPE_NUMBER;
117
- }
118
- }
119
- for (auto p : unicode_ranges_letter) {
120
- for (auto i = p.first; i <= p.second; ++i) {
121
- cpt_types[i] = CODEPOINT_TYPE_LETTER;
122
- }
123
- }
124
- for (auto p : unicode_ranges_separator) {
125
- for (auto i = p.first; i <= p.second; ++i) {
126
- cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
112
+ static std::vector<codepoint_flags> unicode_cpt_flags_array() {
113
+ std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
114
+
115
+ assert (unicode_ranges_flags.front().first == 0);
116
+ assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
117
+ for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
118
+ const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
119
+ const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
120
+ for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
121
+ cpt_flags[cpt] = range_ini.second;
127
122
  }
128
123
  }
129
- for (auto p : unicode_ranges_accent_mark) {
130
- for (auto i = p.first; i <= p.second; ++i) {
131
- cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
132
- }
124
+
125
+ for (auto cpt : unicode_set_whitespace) {
126
+ cpt_flags[cpt].is_whitespace = true;
133
127
  }
134
- for (auto p : unicode_ranges_punctuation) {
135
- for (auto i = p.first; i <= p.second; ++i) {
136
- cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
137
- }
128
+
129
+ for (auto p : unicode_map_lowercase) {
130
+ cpt_flags[p.second].is_lowercase = true;
138
131
  }
139
- for (auto p : unicode_ranges_symbol) {
140
- for (auto i = p.first; i <= p.second; ++i) {
141
- cpt_types[i] = CODEPOINT_TYPE_SYMBOL;
142
- }
132
+
133
+ for (auto p : unicode_map_uppercase) {
134
+ cpt_flags[p.second].is_uppercase = true;
143
135
  }
144
- for (auto p : unicode_ranges_control) {
145
- for (auto i = p.first; i <= p.second; ++i) {
146
- cpt_types[i] = CODEPOINT_TYPE_CONTROL;
147
- }
136
+
137
+ for (auto &range : unicode_ranges_nfd) { // start, last, nfd
138
+ cpt_flags[range.nfd].is_nfd = true;
148
139
  }
149
- return cpt_types;
140
+
141
+ return cpt_flags;
150
142
  }
151
143
 
152
144
  static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
153
145
  std::unordered_map<uint8_t, std::string> map;
154
- for (int ch = u'!'; ch <= u'~'; ++ch) {
146
+ for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
155
147
  assert(0 <= ch && ch < 256);
156
148
  map[ch] = unicode_cpt_to_utf8(ch);
157
149
  }
158
- for (int ch = u'¡'; ch <= u'¬'; ++ch) {
150
+ for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
159
151
  assert(0 <= ch && ch < 256);
160
152
  map[ch] = unicode_cpt_to_utf8(ch);
161
153
  }
162
- for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
154
+ for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
163
155
  assert(0 <= ch && ch < 256);
164
156
  map[ch] = unicode_cpt_to_utf8(ch);
165
157
  }
@@ -175,15 +167,15 @@ static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
175
167
 
176
168
  static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
177
169
  std::unordered_map<std::string, uint8_t> map;
178
- for (int ch = u'!'; ch <= u'~'; ++ch) {
170
+ for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
179
171
  assert(0 <= ch && ch < 256);
180
172
  map[unicode_cpt_to_utf8(ch)] = ch;
181
173
  }
182
- for (int ch = u'¡'; ch <= u'¬'; ++ch) {
174
+ for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
183
175
  assert(0 <= ch && ch < 256);
184
176
  map[unicode_cpt_to_utf8(ch)] = ch;
185
177
  }
186
- for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
178
+ for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
187
179
  assert(0 <= ch && ch < 256);
188
180
  map[unicode_cpt_to_utf8(ch)] = ch;
189
181
  }
@@ -238,8 +230,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
238
230
  return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
239
231
  };
240
232
 
241
- auto _get_cpt_type = [&] (const size_t pos) -> int {
242
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
233
+ auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
234
+ static const codepoint_flags undef(codepoint_flags::UNDEFINED);
235
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
243
236
  };
244
237
 
245
238
  size_t _prev_end = offset_ini;
@@ -261,7 +254,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
261
254
 
262
255
  for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
263
256
  const char32_t cpt = _get_cpt(pos);
264
- const int cpt_type = _get_cpt_type(pos);
257
+ const auto flags = _get_flags(pos);
265
258
 
266
259
  // regex: 's|'t|'re|'ve|'m|'ll|'d
267
260
  if (cpt == '\'' && pos+1 < offset_end) {
@@ -281,39 +274,37 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
281
274
  }
282
275
  }
283
276
 
284
- char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
285
- int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
277
+ auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
286
278
  // regex: <space>?\p{L}+
287
- if (cpt2_type == CODEPOINT_TYPE_LETTER) {
279
+ if (flags2.is_letter) {
288
280
  pos += (cpt == ' ');
289
- while (cpt2_type == CODEPOINT_TYPE_LETTER) {
290
- cpt2_type = _get_cpt_type(++pos);
281
+ while (flags2.is_letter) {
282
+ flags2 = _get_flags(++pos);
291
283
  }
292
284
  _add_token(pos);
293
285
  continue;
294
286
  }
295
287
  // regex: <space>?\p{N}+
296
- if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
288
+ if (flags2.is_number) {
297
289
  pos += (cpt == ' ');
298
- while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
299
- cpt2_type = _get_cpt_type(++pos);
290
+ while (flags2.is_number) {
291
+ flags2 = _get_flags(++pos);
300
292
  }
301
293
  _add_token(pos);
302
294
  continue;
303
295
  }
304
296
  // regex: <space>?[^\s\p{L}\p{N}]+
305
- if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
297
+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
306
298
  pos += (cpt == ' ');
307
- while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
308
- cpt2_type = _get_cpt_type(++pos);
309
- cpt2 = _get_cpt(pos);
299
+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
300
+ flags2 = _get_flags(++pos);
310
301
  }
311
302
  _add_token(pos);
312
303
  continue;
313
304
  }
314
305
 
315
306
  size_t num_whitespaces = 0;
316
- while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
307
+ while (_get_flags(pos+num_whitespaces).is_whitespace) {
317
308
  num_whitespaces++;
318
309
  }
319
310
 
@@ -357,8 +348,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
357
348
  return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
358
349
  };
359
350
 
360
- auto _get_cpt_type = [&] (const size_t pos) -> int {
361
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
351
+ auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
352
+ static const codepoint_flags undef(codepoint_flags::UNDEFINED);
353
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
362
354
  };
363
355
 
364
356
  size_t _prev_end = offset_ini;
@@ -380,7 +372,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
380
372
 
381
373
  for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
382
374
  const char32_t cpt = _get_cpt(pos);
383
- const int cpt_type = _get_cpt_type(pos);
375
+ const auto flags = _get_flags(pos);
384
376
 
385
377
  // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
386
378
  if (cpt == '\'' && pos+1 < offset_end) {
@@ -401,10 +393,10 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
401
393
  }
402
394
 
403
395
  // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
404
- if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
405
- if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
396
+ if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
397
+ if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
406
398
  pos++;
407
- while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
399
+ while (_get_flags(pos).is_letter) {
408
400
  pos++;
409
401
  }
410
402
  _add_token(pos);
@@ -413,9 +405,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
413
405
  }
414
406
 
415
407
  // regex: \p{N}{1,3}
416
- if (cpt_type == CODEPOINT_TYPE_NUMBER) {
408
+ if (flags.is_number) {
417
409
  size_t ini = pos;
418
- while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
410
+ while (_get_flags(pos).is_number) {
419
411
  if (++pos - ini >= 3 ) {
420
412
  _add_token(pos);
421
413
  ini = pos;
@@ -426,14 +418,13 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
426
418
  }
427
419
 
428
420
  // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
429
- char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
430
- int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
431
- if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
421
+ auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
422
+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
432
423
  pos += (cpt == ' ');
433
- while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
434
- cpt2_type = _get_cpt_type(++pos);
435
- cpt2 = _get_cpt(pos);
424
+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
425
+ flags2 = _get_flags(++pos);
436
426
  }
427
+ char32_t cpt2 = _get_cpt(pos);
437
428
  while (cpt2 == '\r' || cpt2 == '\n') {
438
429
  cpt2 = _get_cpt(++pos);
439
430
  }
@@ -443,7 +434,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
443
434
 
444
435
  size_t num_whitespaces = 0;
445
436
  size_t last_end_r_or_n = 0;
446
- while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
437
+ while (_get_flags(pos+num_whitespaces).is_whitespace) {
447
438
  char32_t cpt2 = _get_cpt(pos+num_whitespaces);
448
439
  if (cpt2 == '\r' || cpt2 == '\n') {
449
440
  last_end_r_or_n = pos + num_whitespaces + 1;
@@ -589,15 +580,14 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
589
580
  }
590
581
 
591
582
  std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
592
- std::vector<uint32_t> result;
593
- result.reserve(cpts.size());
583
+ auto comp = [] (const uint32_t cpt, const range_nfd & range) {
584
+ return cpt < range.first;
585
+ };
586
+ std::vector<uint32_t> result(cpts.size());
594
587
  for (size_t i = 0; i < cpts.size(); ++i) {
595
- auto it = unicode_map_nfd.find(cpts[i]);
596
- if (it == unicode_map_nfd.end()) {
597
- result.push_back(cpts[i]);
598
- } else {
599
- result.push_back(it->second);
600
- }
588
+ const uint32_t cpt = cpts[i];
589
+ auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
590
+ result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
601
591
  }
602
592
  return result;
603
593
  }
@@ -611,31 +601,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
611
601
  return result;
612
602
  }
613
603
 
614
- int unicode_cpt_type(uint32_t cp) {
615
- static std::unordered_map<uint32_t, int> cpt_types = unicode_cpt_type_map();
616
- const auto it = cpt_types.find(cp);
617
- return it == cpt_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
604
+ codepoint_flags unicode_cpt_flags(const uint32_t cp) {
605
+ static const codepoint_flags undef(codepoint_flags::UNDEFINED);
606
+ static const auto cpt_flags = unicode_cpt_flags_array();
607
+ return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
618
608
  }
619
609
 
620
- int unicode_cpt_type(const std::string & utf8) {
621
- if (utf8.length() == 0) {
622
- return CODEPOINT_TYPE_UNIDENTIFIED;
610
+ codepoint_flags unicode_cpt_flags(const std::string & utf8) {
611
+ static const codepoint_flags undef(codepoint_flags::UNDEFINED);
612
+ if (utf8.empty()) {
613
+ return undef; // undefined
623
614
  }
624
615
  size_t offset = 0;
625
- return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
626
- }
627
-
628
- bool unicode_cpt_is_whitespace(uint32_t cp) {
629
- static const std::unordered_set<uint32_t> is_whitespace = [] {
630
- std::unordered_set<uint32_t> is_whitespace;
631
- for (auto p : unicode_ranges_whitespace) {
632
- for (auto i = p.first; i <= p.second; ++i) {
633
- is_whitespace.insert(i);
634
- }
635
- }
636
- return is_whitespace;
637
- }();
638
- return (bool)is_whitespace.count(cp);
616
+ return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
639
617
  }
640
618
 
641
619
  std::string unicode_byte_to_utf8(uint8_t byte) {
@@ -656,21 +634,21 @@ char32_t unicode_tolower(char32_t cp) {
656
634
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
657
635
  // unicode categories
658
636
  static const std::map<std::string, int> k_ucat_enum = {
659
- { "\\p{N}", CODEPOINT_TYPE_NUMBER },
660
- { "\\p{L}", CODEPOINT_TYPE_LETTER },
661
- { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
637
+ { "\\p{N}", codepoint_flags::NUMBER },
638
+ { "\\p{L}", codepoint_flags::LETTER },
639
+ { "\\p{P}", codepoint_flags::PUNCTUATION },
662
640
  };
663
641
 
664
642
  static const std::map<int, int> k_ucat_cpt = {
665
- { CODEPOINT_TYPE_NUMBER, 0xD1 },
666
- { CODEPOINT_TYPE_LETTER, 0xD2 },
667
- { CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
643
+ { codepoint_flags::NUMBER, 0xD1 },
644
+ { codepoint_flags::LETTER, 0xD2 },
645
+ { codepoint_flags::PUNCTUATION, 0xD3 },
668
646
  };
669
647
 
670
648
  static const std::map<int, std::string> k_ucat_map = {
671
- { CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9
672
- { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
673
- { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
649
+ { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
650
+ { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
651
+ { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
674
652
  };
675
653
 
676
654
  // compute collapsed codepoints only if needed by at least one regex
@@ -701,10 +679,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
701
679
  continue;
702
680
  }
703
681
 
704
- const int cpt_type = unicode_cpt_type(cpts[i]);
682
+ const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
705
683
 
706
- if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) {
707
- text_collapsed[i] = k_ucat_cpt.at(cpt_type);
684
+ if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
685
+ text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
708
686
  } else {
709
687
  text_collapsed[i] = (char) 0xD0; // fallback
710
688
  }
@@ -4,24 +4,56 @@
4
4
  #include <string>
5
5
  #include <vector>
6
6
 
7
- #define CODEPOINT_TYPE_UNIDENTIFIED 0
8
- #define CODEPOINT_TYPE_NUMBER 1
9
- #define CODEPOINT_TYPE_LETTER 2
10
- #define CODEPOINT_TYPE_SEPARATOR 3
11
- #define CODEPOINT_TYPE_ACCENT_MARK 4
12
- #define CODEPOINT_TYPE_PUNCTUATION 5
13
- #define CODEPOINT_TYPE_SYMBOL 6
14
- #define CODEPOINT_TYPE_CONTROL 7
7
+ struct codepoint_flags {
8
+ enum {
9
+ UNDEFINED = 0x0001,
10
+ NUMBER = 0x0002, // regex: \p{N}
11
+ LETTER = 0x0004, // regex: \p{L}
12
+ SEPARATOR = 0x0008, // regex: \p{Z}
13
+ ACCENT_MARK = 0x0010, // regex: \p{M}
14
+ PUNCTUATION = 0x0020, // regex: \p{P}
15
+ SYMBOL = 0x0040, // regex: \p{S}
16
+ CONTROL = 0x0080, // regex: \p{C}
17
+ MASK_CATEGORIES = 0x00FF,
18
+ };
19
+
20
+ // codepoint type
21
+ uint16_t is_undefined : 1;
22
+ uint16_t is_number : 1; // regex: \p{N}
23
+ uint16_t is_letter : 1; // regex: \p{L}
24
+ uint16_t is_separator : 1; // regex: \p{Z}
25
+ uint16_t is_accent_mark : 1; // regex: \p{M}
26
+ uint16_t is_punctuation : 1; // regex: \p{P}
27
+ uint16_t is_symbol : 1; // regex: \p{S}
28
+ uint16_t is_control : 1; // regex: \p{C}
29
+ // helper flags
30
+ uint16_t is_whitespace : 1; // regex: \s
31
+ uint16_t is_lowercase : 1;
32
+ uint16_t is_uppercase : 1;
33
+ uint16_t is_nfd : 1;
34
+
35
+ // decode from uint16
36
+ inline codepoint_flags(const uint16_t flags=0) {
37
+ *reinterpret_cast<uint16_t*>(this) = flags;
38
+ }
39
+
40
+ inline uint16_t as_uint() const {
41
+ return *reinterpret_cast<const uint16_t*>(this);
42
+ }
43
+
44
+ inline uint16_t category_flag() const {
45
+ return this->as_uint() & MASK_CATEGORIES;
46
+ }
47
+ };
48
+
15
49
 
16
50
  std::string unicode_cpt_to_utf8(uint32_t cp);
17
51
  std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
18
52
 
19
53
  std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
20
54
 
21
- int unicode_cpt_type(uint32_t cp);
22
- int unicode_cpt_type(const std::string & utf8);
23
-
24
- bool unicode_cpt_is_whitespace(uint32_t cp);
55
+ codepoint_flags unicode_cpt_flags(const uint32_t cp);
56
+ codepoint_flags unicode_cpt_flags(const std::string & utf8);
25
57
 
26
58
  std::string unicode_byte_to_utf8(uint8_t byte);
27
59
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.1
4
+ version: 0.15.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-11 00:00:00.000000000 Z
11
+ date: 2024-05-18 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email:
@@ -57,6 +57,8 @@ files:
57
57
  - vendor/tmp/llama.cpp/ggml-opencl.h
58
58
  - vendor/tmp/llama.cpp/ggml-quants.c
59
59
  - vendor/tmp/llama.cpp/ggml-quants.h
60
+ - vendor/tmp/llama.cpp/ggml-rpc.cpp
61
+ - vendor/tmp/llama.cpp/ggml-rpc.h
60
62
  - vendor/tmp/llama.cpp/ggml-sycl.cpp
61
63
  - vendor/tmp/llama.cpp/ggml-sycl.h
62
64
  - vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp