llama_cpp 0.15.1 → 0.15.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,17 +1,20 @@
1
1
  #pragma once
2
2
 
3
3
  #include <cstdint>
4
- #include <map>
5
- #include <utility>
6
4
  #include <vector>
5
+ #include <unordered_map>
6
+ #include <unordered_set>
7
7
 
8
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
9
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
10
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
11
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
12
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
13
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
14
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol;
15
- extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control;
16
- extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd;
17
- extern const std::map<char32_t, char32_t> unicode_map_lowercase;
8
+ struct range_nfd {
9
+ uint32_t first;
10
+ uint32_t last;
11
+ uint32_t nfd;
12
+ };
13
+
14
+ static const uint32_t MAX_CODEPOINTS = 0x110000;
15
+
16
+ extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
17
+ extern const std::unordered_set<uint32_t> unicode_set_whitespace;
18
+ extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
19
+ extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
20
+ extern const std::vector<range_nfd> unicode_ranges_nfd;
@@ -1,4 +1,4 @@
1
- #include "unicode.h"
1
+ #include "unicode.h"
2
2
  #include "unicode-data.h"
3
3
 
4
4
  #include <cassert>
@@ -109,57 +109,49 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
109
109
  // return result;
110
110
  //}
111
111
 
112
- static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
113
- std::unordered_map<uint32_t, int> cpt_types;
114
- for (auto p : unicode_ranges_number) {
115
- for (auto i = p.first; i <= p.second; ++i) {
116
- cpt_types[i] = CODEPOINT_TYPE_NUMBER;
117
- }
118
- }
119
- for (auto p : unicode_ranges_letter) {
120
- for (auto i = p.first; i <= p.second; ++i) {
121
- cpt_types[i] = CODEPOINT_TYPE_LETTER;
122
- }
123
- }
124
- for (auto p : unicode_ranges_separator) {
125
- for (auto i = p.first; i <= p.second; ++i) {
126
- cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
112
+ static std::vector<codepoint_flags> unicode_cpt_flags_array() {
113
+ std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
114
+
115
+ assert (unicode_ranges_flags.front().first == 0);
116
+ assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
117
+ for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
118
+ const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
119
+ const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
120
+ for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
121
+ cpt_flags[cpt] = range_ini.second;
127
122
  }
128
123
  }
129
- for (auto p : unicode_ranges_accent_mark) {
130
- for (auto i = p.first; i <= p.second; ++i) {
131
- cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
132
- }
124
+
125
+ for (auto cpt : unicode_set_whitespace) {
126
+ cpt_flags[cpt].is_whitespace = true;
133
127
  }
134
- for (auto p : unicode_ranges_punctuation) {
135
- for (auto i = p.first; i <= p.second; ++i) {
136
- cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
137
- }
128
+
129
+ for (auto p : unicode_map_lowercase) {
130
+ cpt_flags[p.second].is_lowercase = true;
138
131
  }
139
- for (auto p : unicode_ranges_symbol) {
140
- for (auto i = p.first; i <= p.second; ++i) {
141
- cpt_types[i] = CODEPOINT_TYPE_SYMBOL;
142
- }
132
+
133
+ for (auto p : unicode_map_uppercase) {
134
+ cpt_flags[p.second].is_uppercase = true;
143
135
  }
144
- for (auto p : unicode_ranges_control) {
145
- for (auto i = p.first; i <= p.second; ++i) {
146
- cpt_types[i] = CODEPOINT_TYPE_CONTROL;
147
- }
136
+
137
+ for (auto &range : unicode_ranges_nfd) { // start, last, nfd
138
+ cpt_flags[range.nfd].is_nfd = true;
148
139
  }
149
- return cpt_types;
140
+
141
+ return cpt_flags;
150
142
  }
151
143
 
152
144
  static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
153
145
  std::unordered_map<uint8_t, std::string> map;
154
- for (int ch = u'!'; ch <= u'~'; ++ch) {
146
+ for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
155
147
  assert(0 <= ch && ch < 256);
156
148
  map[ch] = unicode_cpt_to_utf8(ch);
157
149
  }
158
- for (int ch = u'¡'; ch <= u'¬'; ++ch) {
150
+ for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
159
151
  assert(0 <= ch && ch < 256);
160
152
  map[ch] = unicode_cpt_to_utf8(ch);
161
153
  }
162
- for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
154
+ for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
163
155
  assert(0 <= ch && ch < 256);
164
156
  map[ch] = unicode_cpt_to_utf8(ch);
165
157
  }
@@ -175,15 +167,15 @@ static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
175
167
 
176
168
  static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
177
169
  std::unordered_map<std::string, uint8_t> map;
178
- for (int ch = u'!'; ch <= u'~'; ++ch) {
170
+ for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
179
171
  assert(0 <= ch && ch < 256);
180
172
  map[unicode_cpt_to_utf8(ch)] = ch;
181
173
  }
182
- for (int ch = u'¡'; ch <= u'¬'; ++ch) {
174
+ for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
183
175
  assert(0 <= ch && ch < 256);
184
176
  map[unicode_cpt_to_utf8(ch)] = ch;
185
177
  }
186
- for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
178
+ for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
187
179
  assert(0 <= ch && ch < 256);
188
180
  map[unicode_cpt_to_utf8(ch)] = ch;
189
181
  }
@@ -238,8 +230,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
238
230
  return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
239
231
  };
240
232
 
241
- auto _get_cpt_type = [&] (const size_t pos) -> int {
242
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
233
+ auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
234
+ static const codepoint_flags undef(codepoint_flags::UNDEFINED);
235
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
243
236
  };
244
237
 
245
238
  size_t _prev_end = offset_ini;
@@ -261,7 +254,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
261
254
 
262
255
  for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
263
256
  const char32_t cpt = _get_cpt(pos);
264
- const int cpt_type = _get_cpt_type(pos);
257
+ const auto flags = _get_flags(pos);
265
258
 
266
259
  // regex: 's|'t|'re|'ve|'m|'ll|'d
267
260
  if (cpt == '\'' && pos+1 < offset_end) {
@@ -281,39 +274,37 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
281
274
  }
282
275
  }
283
276
 
284
- char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
285
- int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
277
+ auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
286
278
  // regex: <space>?\p{L}+
287
- if (cpt2_type == CODEPOINT_TYPE_LETTER) {
279
+ if (flags2.is_letter) {
288
280
  pos += (cpt == ' ');
289
- while (cpt2_type == CODEPOINT_TYPE_LETTER) {
290
- cpt2_type = _get_cpt_type(++pos);
281
+ while (flags2.is_letter) {
282
+ flags2 = _get_flags(++pos);
291
283
  }
292
284
  _add_token(pos);
293
285
  continue;
294
286
  }
295
287
  // regex: <space>?\p{N}+
296
- if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
288
+ if (flags2.is_number) {
297
289
  pos += (cpt == ' ');
298
- while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
299
- cpt2_type = _get_cpt_type(++pos);
290
+ while (flags2.is_number) {
291
+ flags2 = _get_flags(++pos);
300
292
  }
301
293
  _add_token(pos);
302
294
  continue;
303
295
  }
304
296
  // regex: <space>?[^\s\p{L}\p{N}]+
305
- if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
297
+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
306
298
  pos += (cpt == ' ');
307
- while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
308
- cpt2_type = _get_cpt_type(++pos);
309
- cpt2 = _get_cpt(pos);
299
+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
300
+ flags2 = _get_flags(++pos);
310
301
  }
311
302
  _add_token(pos);
312
303
  continue;
313
304
  }
314
305
 
315
306
  size_t num_whitespaces = 0;
316
- while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
307
+ while (_get_flags(pos+num_whitespaces).is_whitespace) {
317
308
  num_whitespaces++;
318
309
  }
319
310
 
@@ -357,8 +348,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
357
348
  return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
358
349
  };
359
350
 
360
- auto _get_cpt_type = [&] (const size_t pos) -> int {
361
- return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
351
+ auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
352
+ static const codepoint_flags undef(codepoint_flags::UNDEFINED);
353
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
362
354
  };
363
355
 
364
356
  size_t _prev_end = offset_ini;
@@ -380,7 +372,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
380
372
 
381
373
  for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
382
374
  const char32_t cpt = _get_cpt(pos);
383
- const int cpt_type = _get_cpt_type(pos);
375
+ const auto flags = _get_flags(pos);
384
376
 
385
377
  // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
386
378
  if (cpt == '\'' && pos+1 < offset_end) {
@@ -401,10 +393,10 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
401
393
  }
402
394
 
403
395
  // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
404
- if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
405
- if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
396
+ if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
397
+ if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
406
398
  pos++;
407
- while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
399
+ while (_get_flags(pos).is_letter) {
408
400
  pos++;
409
401
  }
410
402
  _add_token(pos);
@@ -413,9 +405,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
413
405
  }
414
406
 
415
407
  // regex: \p{N}{1,3}
416
- if (cpt_type == CODEPOINT_TYPE_NUMBER) {
408
+ if (flags.is_number) {
417
409
  size_t ini = pos;
418
- while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
410
+ while (_get_flags(pos).is_number) {
419
411
  if (++pos - ini >= 3 ) {
420
412
  _add_token(pos);
421
413
  ini = pos;
@@ -426,14 +418,13 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
426
418
  }
427
419
 
428
420
  // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
429
- char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
430
- int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
431
- if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
421
+ auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
422
+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
432
423
  pos += (cpt == ' ');
433
- while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
434
- cpt2_type = _get_cpt_type(++pos);
435
- cpt2 = _get_cpt(pos);
424
+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
425
+ flags2 = _get_flags(++pos);
436
426
  }
427
+ char32_t cpt2 = _get_cpt(pos);
437
428
  while (cpt2 == '\r' || cpt2 == '\n') {
438
429
  cpt2 = _get_cpt(++pos);
439
430
  }
@@ -443,7 +434,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
443
434
 
444
435
  size_t num_whitespaces = 0;
445
436
  size_t last_end_r_or_n = 0;
446
- while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
437
+ while (_get_flags(pos+num_whitespaces).is_whitespace) {
447
438
  char32_t cpt2 = _get_cpt(pos+num_whitespaces);
448
439
  if (cpt2 == '\r' || cpt2 == '\n') {
449
440
  last_end_r_or_n = pos + num_whitespaces + 1;
@@ -589,15 +580,14 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
589
580
  }
590
581
 
591
582
  std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
592
- std::vector<uint32_t> result;
593
- result.reserve(cpts.size());
583
+ auto comp = [] (const uint32_t cpt, const range_nfd & range) {
584
+ return cpt < range.first;
585
+ };
586
+ std::vector<uint32_t> result(cpts.size());
594
587
  for (size_t i = 0; i < cpts.size(); ++i) {
595
- auto it = unicode_map_nfd.find(cpts[i]);
596
- if (it == unicode_map_nfd.end()) {
597
- result.push_back(cpts[i]);
598
- } else {
599
- result.push_back(it->second);
600
- }
588
+ const uint32_t cpt = cpts[i];
589
+ auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
590
+ result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
601
591
  }
602
592
  return result;
603
593
  }
@@ -611,31 +601,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
611
601
  return result;
612
602
  }
613
603
 
614
- int unicode_cpt_type(uint32_t cp) {
615
- static std::unordered_map<uint32_t, int> cpt_types = unicode_cpt_type_map();
616
- const auto it = cpt_types.find(cp);
617
- return it == cpt_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
604
+ codepoint_flags unicode_cpt_flags(const uint32_t cp) {
605
+ static const codepoint_flags undef(codepoint_flags::UNDEFINED);
606
+ static const auto cpt_flags = unicode_cpt_flags_array();
607
+ return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
618
608
  }
619
609
 
620
- int unicode_cpt_type(const std::string & utf8) {
621
- if (utf8.length() == 0) {
622
- return CODEPOINT_TYPE_UNIDENTIFIED;
610
+ codepoint_flags unicode_cpt_flags(const std::string & utf8) {
611
+ static const codepoint_flags undef(codepoint_flags::UNDEFINED);
612
+ if (utf8.empty()) {
613
+ return undef; // undefined
623
614
  }
624
615
  size_t offset = 0;
625
- return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
626
- }
627
-
628
- bool unicode_cpt_is_whitespace(uint32_t cp) {
629
- static const std::unordered_set<uint32_t> is_whitespace = [] {
630
- std::unordered_set<uint32_t> is_whitespace;
631
- for (auto p : unicode_ranges_whitespace) {
632
- for (auto i = p.first; i <= p.second; ++i) {
633
- is_whitespace.insert(i);
634
- }
635
- }
636
- return is_whitespace;
637
- }();
638
- return (bool)is_whitespace.count(cp);
616
+ return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
639
617
  }
640
618
 
641
619
  std::string unicode_byte_to_utf8(uint8_t byte) {
@@ -656,21 +634,21 @@ char32_t unicode_tolower(char32_t cp) {
656
634
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
657
635
  // unicode categories
658
636
  static const std::map<std::string, int> k_ucat_enum = {
659
- { "\\p{N}", CODEPOINT_TYPE_NUMBER },
660
- { "\\p{L}", CODEPOINT_TYPE_LETTER },
661
- { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
637
+ { "\\p{N}", codepoint_flags::NUMBER },
638
+ { "\\p{L}", codepoint_flags::LETTER },
639
+ { "\\p{P}", codepoint_flags::PUNCTUATION },
662
640
  };
663
641
 
664
642
  static const std::map<int, int> k_ucat_cpt = {
665
- { CODEPOINT_TYPE_NUMBER, 0xD1 },
666
- { CODEPOINT_TYPE_LETTER, 0xD2 },
667
- { CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
643
+ { codepoint_flags::NUMBER, 0xD1 },
644
+ { codepoint_flags::LETTER, 0xD2 },
645
+ { codepoint_flags::PUNCTUATION, 0xD3 },
668
646
  };
669
647
 
670
648
  static const std::map<int, std::string> k_ucat_map = {
671
- { CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9
672
- { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
673
- { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
649
+ { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
650
+ { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
651
+ { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
674
652
  };
675
653
 
676
654
  // compute collapsed codepoints only if needed by at least one regex
@@ -701,10 +679,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
701
679
  continue;
702
680
  }
703
681
 
704
- const int cpt_type = unicode_cpt_type(cpts[i]);
682
+ const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
705
683
 
706
- if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) {
707
- text_collapsed[i] = k_ucat_cpt.at(cpt_type);
684
+ if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
685
+ text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
708
686
  } else {
709
687
  text_collapsed[i] = (char) 0xD0; // fallback
710
688
  }
@@ -4,24 +4,56 @@
4
4
  #include <string>
5
5
  #include <vector>
6
6
 
7
- #define CODEPOINT_TYPE_UNIDENTIFIED 0
8
- #define CODEPOINT_TYPE_NUMBER 1
9
- #define CODEPOINT_TYPE_LETTER 2
10
- #define CODEPOINT_TYPE_SEPARATOR 3
11
- #define CODEPOINT_TYPE_ACCENT_MARK 4
12
- #define CODEPOINT_TYPE_PUNCTUATION 5
13
- #define CODEPOINT_TYPE_SYMBOL 6
14
- #define CODEPOINT_TYPE_CONTROL 7
7
+ struct codepoint_flags {
8
+ enum {
9
+ UNDEFINED = 0x0001,
10
+ NUMBER = 0x0002, // regex: \p{N}
11
+ LETTER = 0x0004, // regex: \p{L}
12
+ SEPARATOR = 0x0008, // regex: \p{Z}
13
+ ACCENT_MARK = 0x0010, // regex: \p{M}
14
+ PUNCTUATION = 0x0020, // regex: \p{P}
15
+ SYMBOL = 0x0040, // regex: \p{S}
16
+ CONTROL = 0x0080, // regex: \p{C}
17
+ MASK_CATEGORIES = 0x00FF,
18
+ };
19
+
20
+ // codepoint type
21
+ uint16_t is_undefined : 1;
22
+ uint16_t is_number : 1; // regex: \p{N}
23
+ uint16_t is_letter : 1; // regex: \p{L}
24
+ uint16_t is_separator : 1; // regex: \p{Z}
25
+ uint16_t is_accent_mark : 1; // regex: \p{M}
26
+ uint16_t is_punctuation : 1; // regex: \p{P}
27
+ uint16_t is_symbol : 1; // regex: \p{S}
28
+ uint16_t is_control : 1; // regex: \p{C}
29
+ // helper flags
30
+ uint16_t is_whitespace : 1; // regex: \s
31
+ uint16_t is_lowercase : 1;
32
+ uint16_t is_uppercase : 1;
33
+ uint16_t is_nfd : 1;
34
+
35
+ // decode from uint16
36
+ inline codepoint_flags(const uint16_t flags=0) {
37
+ *reinterpret_cast<uint16_t*>(this) = flags;
38
+ }
39
+
40
+ inline uint16_t as_uint() const {
41
+ return *reinterpret_cast<const uint16_t*>(this);
42
+ }
43
+
44
+ inline uint16_t category_flag() const {
45
+ return this->as_uint() & MASK_CATEGORIES;
46
+ }
47
+ };
48
+
15
49
 
16
50
  std::string unicode_cpt_to_utf8(uint32_t cp);
17
51
  std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
18
52
 
19
53
  std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
20
54
 
21
- int unicode_cpt_type(uint32_t cp);
22
- int unicode_cpt_type(const std::string & utf8);
23
-
24
- bool unicode_cpt_is_whitespace(uint32_t cp);
55
+ codepoint_flags unicode_cpt_flags(const uint32_t cp);
56
+ codepoint_flags unicode_cpt_flags(const std::string & utf8);
25
57
 
26
58
  std::string unicode_byte_to_utf8(uint8_t byte);
27
59
  uint8_t unicode_utf8_to_byte(const std::string & utf8);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.1
4
+ version: 0.15.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-11 00:00:00.000000000 Z
11
+ date: 2024-05-18 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email:
@@ -57,6 +57,8 @@ files:
57
57
  - vendor/tmp/llama.cpp/ggml-opencl.h
58
58
  - vendor/tmp/llama.cpp/ggml-quants.c
59
59
  - vendor/tmp/llama.cpp/ggml-quants.h
60
+ - vendor/tmp/llama.cpp/ggml-rpc.cpp
61
+ - vendor/tmp/llama.cpp/ggml-rpc.h
60
62
  - vendor/tmp/llama.cpp/ggml-sycl.cpp
61
63
  - vendor/tmp/llama.cpp/ggml-sycl.h
62
64
  - vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp