llama_cpp 0.15.1 → 0.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +3 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +15 -7
- data/vendor/tmp/llama.cpp/ggml-impl.h +7 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +114 -125
- data/vendor/tmp/llama.cpp/ggml-metal.metal +86 -109
- data/vendor/tmp/llama.cpp/ggml-quants.c +2202 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +24 -143
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +4 -2
- data/vendor/tmp/llama.cpp/ggml.c +726 -646
- data/vendor/tmp/llama.cpp/ggml.h +28 -17
- data/vendor/tmp/llama.cpp/llama.cpp +478 -281
- data/vendor/tmp/llama.cpp/llama.h +3 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +4 -2
@@ -1,17 +1,20 @@
|
|
1
1
|
#pragma once
|
2
2
|
|
3
3
|
#include <cstdint>
|
4
|
-
#include <map>
|
5
|
-
#include <utility>
|
6
4
|
#include <vector>
|
5
|
+
#include <unordered_map>
|
6
|
+
#include <unordered_set>
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
extern const std::
|
17
|
-
extern const std::
|
8
|
+
struct range_nfd {
|
9
|
+
uint32_t first;
|
10
|
+
uint32_t last;
|
11
|
+
uint32_t nfd;
|
12
|
+
};
|
13
|
+
|
14
|
+
static const uint32_t MAX_CODEPOINTS = 0x110000;
|
15
|
+
|
16
|
+
extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
|
17
|
+
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
|
18
|
+
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
|
19
|
+
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
|
20
|
+
extern const std::vector<range_nfd> unicode_ranges_nfd;
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
#include "unicode.h"
|
2
2
|
#include "unicode-data.h"
|
3
3
|
|
4
4
|
#include <cassert>
|
@@ -109,57 +109,49 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
|
|
109
109
|
// return result;
|
110
110
|
//}
|
111
111
|
|
112
|
-
static std::
|
113
|
-
std::
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
for (
|
121
|
-
|
122
|
-
}
|
123
|
-
}
|
124
|
-
for (auto p : unicode_ranges_separator) {
|
125
|
-
for (auto i = p.first; i <= p.second; ++i) {
|
126
|
-
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
|
112
|
+
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
|
113
|
+
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
|
114
|
+
|
115
|
+
assert (unicode_ranges_flags.front().first == 0);
|
116
|
+
assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
|
117
|
+
for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
|
118
|
+
const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
|
119
|
+
const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
|
120
|
+
for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
|
121
|
+
cpt_flags[cpt] = range_ini.second;
|
127
122
|
}
|
128
123
|
}
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
}
|
124
|
+
|
125
|
+
for (auto cpt : unicode_set_whitespace) {
|
126
|
+
cpt_flags[cpt].is_whitespace = true;
|
133
127
|
}
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
}
|
128
|
+
|
129
|
+
for (auto p : unicode_map_lowercase) {
|
130
|
+
cpt_flags[p.second].is_lowercase = true;
|
138
131
|
}
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
}
|
132
|
+
|
133
|
+
for (auto p : unicode_map_uppercase) {
|
134
|
+
cpt_flags[p.second].is_uppercase = true;
|
143
135
|
}
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
}
|
136
|
+
|
137
|
+
for (auto &range : unicode_ranges_nfd) { // start, last, nfd
|
138
|
+
cpt_flags[range.nfd].is_nfd = true;
|
148
139
|
}
|
149
|
-
|
140
|
+
|
141
|
+
return cpt_flags;
|
150
142
|
}
|
151
143
|
|
152
144
|
static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
|
153
145
|
std::unordered_map<uint8_t, std::string> map;
|
154
|
-
for (int ch =
|
146
|
+
for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
|
155
147
|
assert(0 <= ch && ch < 256);
|
156
148
|
map[ch] = unicode_cpt_to_utf8(ch);
|
157
149
|
}
|
158
|
-
for (int ch =
|
150
|
+
for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
|
159
151
|
assert(0 <= ch && ch < 256);
|
160
152
|
map[ch] = unicode_cpt_to_utf8(ch);
|
161
153
|
}
|
162
|
-
for (int ch =
|
154
|
+
for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
|
163
155
|
assert(0 <= ch && ch < 256);
|
164
156
|
map[ch] = unicode_cpt_to_utf8(ch);
|
165
157
|
}
|
@@ -175,15 +167,15 @@ static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
|
|
175
167
|
|
176
168
|
static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
|
177
169
|
std::unordered_map<std::string, uint8_t> map;
|
178
|
-
for (int ch =
|
170
|
+
for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
|
179
171
|
assert(0 <= ch && ch < 256);
|
180
172
|
map[unicode_cpt_to_utf8(ch)] = ch;
|
181
173
|
}
|
182
|
-
for (int ch =
|
174
|
+
for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
|
183
175
|
assert(0 <= ch && ch < 256);
|
184
176
|
map[unicode_cpt_to_utf8(ch)] = ch;
|
185
177
|
}
|
186
|
-
for (int ch =
|
178
|
+
for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
|
187
179
|
assert(0 <= ch && ch < 256);
|
188
180
|
map[unicode_cpt_to_utf8(ch)] = ch;
|
189
181
|
}
|
@@ -238,8 +230,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|
238
230
|
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
239
231
|
};
|
240
232
|
|
241
|
-
auto
|
242
|
-
|
233
|
+
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
234
|
+
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
235
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
243
236
|
};
|
244
237
|
|
245
238
|
size_t _prev_end = offset_ini;
|
@@ -261,7 +254,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|
261
254
|
|
262
255
|
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
263
256
|
const char32_t cpt = _get_cpt(pos);
|
264
|
-
const
|
257
|
+
const auto flags = _get_flags(pos);
|
265
258
|
|
266
259
|
// regex: 's|'t|'re|'ve|'m|'ll|'d
|
267
260
|
if (cpt == '\'' && pos+1 < offset_end) {
|
@@ -281,39 +274,37 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|
281
274
|
}
|
282
275
|
}
|
283
276
|
|
284
|
-
|
285
|
-
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
277
|
+
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
286
278
|
// regex: <space>?\p{L}+
|
287
|
-
if (
|
279
|
+
if (flags2.is_letter) {
|
288
280
|
pos += (cpt == ' ');
|
289
|
-
while (
|
290
|
-
|
281
|
+
while (flags2.is_letter) {
|
282
|
+
flags2 = _get_flags(++pos);
|
291
283
|
}
|
292
284
|
_add_token(pos);
|
293
285
|
continue;
|
294
286
|
}
|
295
287
|
// regex: <space>?\p{N}+
|
296
|
-
if (
|
288
|
+
if (flags2.is_number) {
|
297
289
|
pos += (cpt == ' ');
|
298
|
-
while (
|
299
|
-
|
290
|
+
while (flags2.is_number) {
|
291
|
+
flags2 = _get_flags(++pos);
|
300
292
|
}
|
301
293
|
_add_token(pos);
|
302
294
|
continue;
|
303
295
|
}
|
304
296
|
// regex: <space>?[^\s\p{L}\p{N}]+
|
305
|
-
if (!
|
297
|
+
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
306
298
|
pos += (cpt == ' ');
|
307
|
-
while (!
|
308
|
-
|
309
|
-
cpt2 = _get_cpt(pos);
|
299
|
+
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
300
|
+
flags2 = _get_flags(++pos);
|
310
301
|
}
|
311
302
|
_add_token(pos);
|
312
303
|
continue;
|
313
304
|
}
|
314
305
|
|
315
306
|
size_t num_whitespaces = 0;
|
316
|
-
while (
|
307
|
+
while (_get_flags(pos+num_whitespaces).is_whitespace) {
|
317
308
|
num_whitespaces++;
|
318
309
|
}
|
319
310
|
|
@@ -357,8 +348,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|
357
348
|
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
|
358
349
|
};
|
359
350
|
|
360
|
-
auto
|
361
|
-
|
351
|
+
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
|
352
|
+
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
353
|
+
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
|
362
354
|
};
|
363
355
|
|
364
356
|
size_t _prev_end = offset_ini;
|
@@ -380,7 +372,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|
380
372
|
|
381
373
|
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
382
374
|
const char32_t cpt = _get_cpt(pos);
|
383
|
-
const
|
375
|
+
const auto flags = _get_flags(pos);
|
384
376
|
|
385
377
|
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
386
378
|
if (cpt == '\'' && pos+1 < offset_end) {
|
@@ -401,10 +393,10 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|
401
393
|
}
|
402
394
|
|
403
395
|
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
|
404
|
-
if (cpt
|
405
|
-
if (
|
396
|
+
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
|
397
|
+
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
|
406
398
|
pos++;
|
407
|
-
while (
|
399
|
+
while (_get_flags(pos).is_letter) {
|
408
400
|
pos++;
|
409
401
|
}
|
410
402
|
_add_token(pos);
|
@@ -413,9 +405,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|
413
405
|
}
|
414
406
|
|
415
407
|
// regex: \p{N}{1,3}
|
416
|
-
if (
|
408
|
+
if (flags.is_number) {
|
417
409
|
size_t ini = pos;
|
418
|
-
while (
|
410
|
+
while (_get_flags(pos).is_number) {
|
419
411
|
if (++pos - ini >= 3 ) {
|
420
412
|
_add_token(pos);
|
421
413
|
ini = pos;
|
@@ -426,14 +418,13 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|
426
418
|
}
|
427
419
|
|
428
420
|
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
429
|
-
|
430
|
-
|
431
|
-
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
421
|
+
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
422
|
+
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
432
423
|
pos += (cpt == ' ');
|
433
|
-
while (!
|
434
|
-
|
435
|
-
cpt2 = _get_cpt(pos);
|
424
|
+
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
425
|
+
flags2 = _get_flags(++pos);
|
436
426
|
}
|
427
|
+
char32_t cpt2 = _get_cpt(pos);
|
437
428
|
while (cpt2 == '\r' || cpt2 == '\n') {
|
438
429
|
cpt2 = _get_cpt(++pos);
|
439
430
|
}
|
@@ -443,7 +434,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|
443
434
|
|
444
435
|
size_t num_whitespaces = 0;
|
445
436
|
size_t last_end_r_or_n = 0;
|
446
|
-
while (
|
437
|
+
while (_get_flags(pos+num_whitespaces).is_whitespace) {
|
447
438
|
char32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
448
439
|
if (cpt2 == '\r' || cpt2 == '\n') {
|
449
440
|
last_end_r_or_n = pos + num_whitespaces + 1;
|
@@ -589,15 +580,14 @@ std::string unicode_cpt_to_utf8(uint32_t cp) {
|
|
589
580
|
}
|
590
581
|
|
591
582
|
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
|
592
|
-
|
593
|
-
|
583
|
+
auto comp = [] (const uint32_t cpt, const range_nfd & range) {
|
584
|
+
return cpt < range.first;
|
585
|
+
};
|
586
|
+
std::vector<uint32_t> result(cpts.size());
|
594
587
|
for (size_t i = 0; i < cpts.size(); ++i) {
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
} else {
|
599
|
-
result.push_back(it->second);
|
600
|
-
}
|
588
|
+
const uint32_t cpt = cpts[i];
|
589
|
+
auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
|
590
|
+
result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
|
601
591
|
}
|
602
592
|
return result;
|
603
593
|
}
|
@@ -611,31 +601,19 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
|
611
601
|
return result;
|
612
602
|
}
|
613
603
|
|
614
|
-
|
615
|
-
static
|
616
|
-
const auto
|
617
|
-
return
|
604
|
+
codepoint_flags unicode_cpt_flags(const uint32_t cp) {
|
605
|
+
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
606
|
+
static const auto cpt_flags = unicode_cpt_flags_array();
|
607
|
+
return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
|
618
608
|
}
|
619
609
|
|
620
|
-
|
621
|
-
|
622
|
-
|
610
|
+
codepoint_flags unicode_cpt_flags(const std::string & utf8) {
|
611
|
+
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
|
612
|
+
if (utf8.empty()) {
|
613
|
+
return undef; // undefined
|
623
614
|
}
|
624
615
|
size_t offset = 0;
|
625
|
-
return
|
626
|
-
}
|
627
|
-
|
628
|
-
bool unicode_cpt_is_whitespace(uint32_t cp) {
|
629
|
-
static const std::unordered_set<uint32_t> is_whitespace = [] {
|
630
|
-
std::unordered_set<uint32_t> is_whitespace;
|
631
|
-
for (auto p : unicode_ranges_whitespace) {
|
632
|
-
for (auto i = p.first; i <= p.second; ++i) {
|
633
|
-
is_whitespace.insert(i);
|
634
|
-
}
|
635
|
-
}
|
636
|
-
return is_whitespace;
|
637
|
-
}();
|
638
|
-
return (bool)is_whitespace.count(cp);
|
616
|
+
return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
|
639
617
|
}
|
640
618
|
|
641
619
|
std::string unicode_byte_to_utf8(uint8_t byte) {
|
@@ -656,21 +634,21 @@ char32_t unicode_tolower(char32_t cp) {
|
|
656
634
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
657
635
|
// unicode categories
|
658
636
|
static const std::map<std::string, int> k_ucat_enum = {
|
659
|
-
{ "\\p{N}",
|
660
|
-
{ "\\p{L}",
|
661
|
-
{ "\\p{P}",
|
637
|
+
{ "\\p{N}", codepoint_flags::NUMBER },
|
638
|
+
{ "\\p{L}", codepoint_flags::LETTER },
|
639
|
+
{ "\\p{P}", codepoint_flags::PUNCTUATION },
|
662
640
|
};
|
663
641
|
|
664
642
|
static const std::map<int, int> k_ucat_cpt = {
|
665
|
-
{
|
666
|
-
{
|
667
|
-
{
|
643
|
+
{ codepoint_flags::NUMBER, 0xD1 },
|
644
|
+
{ codepoint_flags::LETTER, 0xD2 },
|
645
|
+
{ codepoint_flags::PUNCTUATION, 0xD3 },
|
668
646
|
};
|
669
647
|
|
670
648
|
static const std::map<int, std::string> k_ucat_map = {
|
671
|
-
{
|
672
|
-
{
|
673
|
-
{
|
649
|
+
{ codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
|
650
|
+
{ codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
651
|
+
{ codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
674
652
|
};
|
675
653
|
|
676
654
|
// compute collapsed codepoints only if needed by at least one regex
|
@@ -701,10 +679,10 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|
701
679
|
continue;
|
702
680
|
}
|
703
681
|
|
704
|
-
const int
|
682
|
+
const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
|
705
683
|
|
706
|
-
if (k_ucat_cpt.find(
|
707
|
-
text_collapsed[i] = k_ucat_cpt.at(
|
684
|
+
if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
|
685
|
+
text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
|
708
686
|
} else {
|
709
687
|
text_collapsed[i] = (char) 0xD0; // fallback
|
710
688
|
}
|
@@ -4,24 +4,56 @@
|
|
4
4
|
#include <string>
|
5
5
|
#include <vector>
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
7
|
+
struct codepoint_flags {
|
8
|
+
enum {
|
9
|
+
UNDEFINED = 0x0001,
|
10
|
+
NUMBER = 0x0002, // regex: \p{N}
|
11
|
+
LETTER = 0x0004, // regex: \p{L}
|
12
|
+
SEPARATOR = 0x0008, // regex: \p{Z}
|
13
|
+
ACCENT_MARK = 0x0010, // regex: \p{M}
|
14
|
+
PUNCTUATION = 0x0020, // regex: \p{P}
|
15
|
+
SYMBOL = 0x0040, // regex: \p{S}
|
16
|
+
CONTROL = 0x0080, // regex: \p{C}
|
17
|
+
MASK_CATEGORIES = 0x00FF,
|
18
|
+
};
|
19
|
+
|
20
|
+
// codepoint type
|
21
|
+
uint16_t is_undefined : 1;
|
22
|
+
uint16_t is_number : 1; // regex: \p{N}
|
23
|
+
uint16_t is_letter : 1; // regex: \p{L}
|
24
|
+
uint16_t is_separator : 1; // regex: \p{Z}
|
25
|
+
uint16_t is_accent_mark : 1; // regex: \p{M}
|
26
|
+
uint16_t is_punctuation : 1; // regex: \p{P}
|
27
|
+
uint16_t is_symbol : 1; // regex: \p{S}
|
28
|
+
uint16_t is_control : 1; // regex: \p{C}
|
29
|
+
// helper flags
|
30
|
+
uint16_t is_whitespace : 1; // regex: \s
|
31
|
+
uint16_t is_lowercase : 1;
|
32
|
+
uint16_t is_uppercase : 1;
|
33
|
+
uint16_t is_nfd : 1;
|
34
|
+
|
35
|
+
// decode from uint16
|
36
|
+
inline codepoint_flags(const uint16_t flags=0) {
|
37
|
+
*reinterpret_cast<uint16_t*>(this) = flags;
|
38
|
+
}
|
39
|
+
|
40
|
+
inline uint16_t as_uint() const {
|
41
|
+
return *reinterpret_cast<const uint16_t*>(this);
|
42
|
+
}
|
43
|
+
|
44
|
+
inline uint16_t category_flag() const {
|
45
|
+
return this->as_uint() & MASK_CATEGORIES;
|
46
|
+
}
|
47
|
+
};
|
48
|
+
|
15
49
|
|
16
50
|
std::string unicode_cpt_to_utf8(uint32_t cp);
|
17
51
|
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
18
52
|
|
19
53
|
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
20
54
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
bool unicode_cpt_is_whitespace(uint32_t cp);
|
55
|
+
codepoint_flags unicode_cpt_flags(const uint32_t cp);
|
56
|
+
codepoint_flags unicode_cpt_flags(const std::string & utf8);
|
25
57
|
|
26
58
|
std::string unicode_byte_to_utf8(uint8_t byte);
|
27
59
|
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.1
|
4
|
+
version: 0.15.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-05-
|
11
|
+
date: 2024-05-18 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|
@@ -57,6 +57,8 @@ files:
|
|
57
57
|
- vendor/tmp/llama.cpp/ggml-opencl.h
|
58
58
|
- vendor/tmp/llama.cpp/ggml-quants.c
|
59
59
|
- vendor/tmp/llama.cpp/ggml-quants.h
|
60
|
+
- vendor/tmp/llama.cpp/ggml-rpc.cpp
|
61
|
+
- vendor/tmp/llama.cpp/ggml-rpc.h
|
60
62
|
- vendor/tmp/llama.cpp/ggml-sycl.cpp
|
61
63
|
- vendor/tmp/llama.cpp/ggml-sycl.h
|
62
64
|
- vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp
|