llama_cpp 0.12.5 → 0.12.7
This diff represents the content of publicly available package versions as released to their public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.h
CHANGED
@@ -61,6 +61,7 @@ extern "C" {
     enum llama_vocab_type {
         LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
         LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
     };
 
     enum llama_token_type {
@@ -99,6 +100,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_S   = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q3_K_XS  = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS  = 23, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_NL   = 25, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -111,6 +114,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_pooling_type {
+        LLAMA_POOLING_NONE = 0,
+        LLAMA_POOLING_MEAN = 1,
+        LLAMA_POOLING_CLS  = 2,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_NONE  = 0, // single GPU
         LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
@@ -235,6 +244,7 @@ extern "C" {
         bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
     };
 
     // model quantization parameters
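Taken together with the llama_pooling_type enum above, the new do_pooling context flag opts a caller into pooled per-sequence embeddings. A minimal sketch (not part of the diff; the helper name is illustrative) of setting it from the C API:

#include "llama.h"

// Sketch: request pooled per-sequence embeddings from an embedding model.
struct llama_context_params make_embedding_params(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.embedding  = true;  // embedding mode only (existing flag)
    cparams.do_pooling = true;  // new: pool embedding results by sequence id
    return cparams;             // pass to llama_new_context_with_model(...) as usual
}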
@@ -296,6 +306,12 @@ extern "C" {
         int32_t n_eval;
     };
 
+    // used in chat template
+    typedef struct llama_chat_message {
+        const char * role;
+        const char * content;
+    } llama_chat_message;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -304,7 +320,10 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_backend_init(bool numa);
+    LLAMA_API void llama_backend_init(void);
+
+    //optional:
+    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
@@ -627,6 +646,10 @@ extern "C" {
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
+    // Get the embeddings for the ith sequence
+    // llama_get_embeddings(ctx) + i*n_embd
+    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
+
     //
     // Vocab
     //
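The comment above pins down the layout: the ith pointer is an offset into the flat buffer returned by llama_get_embeddings. A short sketch, assuming an embedding-mode context with n_seq decoded sequences (the function name here is illustrative):

#include "llama.h"
#include <cstdio>

// Print the first embedding dimension of each decoded sequence.
void dump_embeddings(struct llama_context * ctx, const struct llama_model * model, int32_t n_seq) {
    const int32_t n_embd = llama_n_embd(model);
    for (int32_t i = 0; i < n_seq; ++i) {
        const float * emb = llama_get_embeddings_ith(ctx, i); // == llama_get_embeddings(ctx) + i*n_embd
        printf("seq %d: dim0 = %f (n_embd = %d)\n", i, emb[0], n_embd);
    }
}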
@@ -683,6 +706,25 @@ extern "C" {
                        char * buf,
                      int32_t length);
 
+    /// Apply chat template. Inspired by hf apply_chat_template() on python.
+    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
+    /// @param chat Pointer to a list of multiple llama_chat_message
+    /// @param n_msg Number of llama_chat_message in this chat
+    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
+    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
+    /// @param length The size of the allocated buffer
+    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
+    LLAMA_API int32_t llama_chat_apply_template(
+              const struct llama_model * model,
+                            const char * tmpl,
+       const struct llama_chat_message * chat,
+                                  size_t n_msg,
+                                    bool add_ass,
+                                  char * buf,
+                                 int32_t length);
+
     //
     // Grammar
     //
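The @return contract above implies a grow-and-retry loop when the buffer turns out to be too small. A hedged sketch of that call pattern (the roles, buffer size, and format_chat name are illustrative, not part of the library):

#include "llama.h"
#include <string>
#include <vector>

std::string format_chat(const struct llama_model * model) {
    std::vector<llama_chat_message> chat = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };
    std::vector<char> buf(1024); // recommendation above: 2 * total characters of all messages
    // tmpl == nullptr selects the model's default chat template
    int32_t n = llama_chat_apply_template(model, nullptr, chat.data(), chat.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) { // formatted prompt is longer than the buffer: grow and re-apply
        buf.resize(n);
        n = llama_chat_apply_template(model, nullptr, chat.data(), chat.size(),
                                      true, buf.data(), (int32_t) buf.size());
    }
    return n < 0 ? std::string() : std::string(buf.data(), (size_t) n); // n < 0: defensive error check
}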
data/vendor/tmp/llama.cpp/scripts/get-flags.mk
CHANGED
@@ -1,6 +1,6 @@
 ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
     GF_CC_IS_GCC = 1
-    GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null
+    GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }')
 else
     GF_CC_IS_CLANG = 1
     ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
data/vendor/tmp/llama.cpp/unicode.h
CHANGED
@@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
         offset += 1;
         return result;
     }
-
+    if (!(utf8[offset + 0] & 0x40)) {
         throw std::invalid_argument("invalid character");
     }
-
-    if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x20)) {
+        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
             throw std::invalid_argument("invalid character");
+        }
         auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
         offset += 2;
         return result;
     }
-
-    if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x10)) {
+        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
             throw std::invalid_argument("invalid character");
+        }
         auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
         offset += 3;
         return result;
     }
-
-    if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x08)) {
+        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
            throw std::invalid_argument("invalid character");
+        }
         auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
         offset += 4;
         return result;
@@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
         offset += 1;
         return result;
     }
-
-
-
-    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-    offset += 2;
-    return result;
+
+    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+        throw std::invalid_argument("invalid character");
     }
-
+
+    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+    offset += 2;
+    return result;
 }
 
 static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
     std::vector<uint32_t> result;
     size_t offset = 0;
-    while (offset < utf16.size())
+    while (offset < utf16.size()) {
         result.push_back(codepoint_from_utf16(utf16, offset));
+    }
     return result;
 }
 
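The added guard verifies that the second code unit is a low surrogate before combining the pair. The arithmetic, spelled out as a self-contained check (the example values are illustrative, not from the diff):

#include <cassert>
#include <cstdint>

int main() {
    // U+1F600 encodes in UTF-16 as the surrogate pair 0xD83D 0xDE00.
    const uint16_t hi = 0xD83D, lo = 0xDE00;
    assert((lo & 0xdc00) == 0xdc00); // low-surrogate test added by the hunk above
    const uint32_t cp = 0x10000 + (((hi & 0x03ff) << 10) | (lo & 0x03ff));
    assert(cp == 0x1F600);           // 10 payload bits from each unit, plus the 0x10000 offset
    return 0;
}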
@@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
 static std::unordered_map<uint32_t, int> codepoint_type_map() {
     std::unordered_map<uint32_t, int> codepoint_types;
     for (auto p : digit_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
+        }
     }
-    for(auto p : letter_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : letter_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_LETTER;
+        }
     }
-    for(auto p : whitespace_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : whitespace_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
+        }
     }
-    for(auto p : accent_mark_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : accent_mark_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
+        }
     }
-    for(auto p : punctuation_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : punctuation_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
+        }
     }
-    for(auto p : symbol_ranges) {
-        for (auto i = p.first; i <= p.second; ++i)
+    for (auto p : symbol_ranges) {
+        for (auto i = p.first; i <= p.second; ++i) {
             codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
+        }
     }
-    for(auto p : control_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : control_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
             codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
+        }
     }
     return codepoint_types;
 }
 
 static int codepoint_type(uint32_t cp) {
     static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
-    return codepoint_types
+    return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
 }
 
 static int codepoint_type(const std::string & utf8) {
-    if (utf8.length() == 0)
+    if (utf8.length() == 0) {
         return CODEPOINT_TYPE_UNIDENTIFIED;
+    }
     size_t offset = 0;
     return codepoint_type(codepoint_from_utf8(utf8, offset));
 }
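The rewritten lookup maps unknown codepoints to CODEPOINT_TYPE_UNIDENTIFIED rather than indexing the map directly. A usage sketch against the string overload shown above (again assuming the vendored unicode.h is includable):

#include <cstdio>
#include "unicode.h"

int main() {
    printf("%d\n", codepoint_type("7")); // CODEPOINT_TYPE_DIGIT
    printf("%d\n", codepoint_type("a")); // CODEPOINT_TYPE_LETTER
    printf("%d\n", codepoint_type(""));  // CODEPOINT_TYPE_UNIDENTIFIED (new empty-string early return)
    return 0;
}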
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.5
+  version: 0.12.7
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-02-
+date: 2024-02-24 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: