llama_cpp 0.12.6 → 0.12.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.h CHANGED

@@ -100,6 +100,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_S  = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S   = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_NL  = 25, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
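The two new file-type IDs correspond to the IQ1_S and IQ4_NL quantization formats picked up from the vendored llama.cpp. A minimal sketch of how such an ftype is typically used from the C API, assuming `llama_model_quantize_default_params()` and `llama_model_quantize()` from the same header (they are not part of this diff):

```c
#include "llama.h"

// Quantize a GGUF model file to the IQ4_NL format introduced by this update.
// Returns 0 on success (llama_model_quantize's convention).
int quantize_to_iq4_nl(const char * fname_inp, const char * fname_out) {
    // Assumed helper from llama.h: fills in sensible defaults (thread count, etc.).
    llama_model_quantize_params params = llama_model_quantize_default_params();

    // Pick one of the file types added in this diff.
    params.ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; // or LLAMA_FTYPE_MOSTLY_IQ1_S

    return (int) llama_model_quantize(fname_inp, fname_out, &params);
}
```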
@@ -112,6 +114,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_pooling_type {
+        LLAMA_POOLING_NONE = 0,
+        LLAMA_POOLING_MEAN = 1,
+        LLAMA_POOLING_CLS  = 2,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_NONE  = 0, // single GPU
         LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
@@ -298,6 +306,12 @@ extern "C" {
         int32_t n_eval;
     };
 
+    // used in chat template
+    typedef struct llama_chat_message {
+        const char * role;
+        const char * content;
+    } llama_chat_message;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -306,7 +320,10 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_backend_init(bool numa);
+    LLAMA_API void llama_backend_init(void);
+
+    //optional:
+    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
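This is a breaking change for code that calls the C API directly: the NUMA flag moves out of `llama_backend_init` into a separate, optional `llama_numa_init` call. A minimal sketch of the new startup sequence; `GGML_NUMA_STRATEGY_DISTRIBUTE` is assumed to be one of the `ggml_numa_strategy` values defined in the vendored ggml.h, which is not shown in this diff:

```c
#include "llama.h"

int main(void) {
    // 0.12.7 and later: backend init no longer takes a NUMA flag.
    llama_backend_init();

    // NUMA tuning is now a separate, optional call.
    // GGML_NUMA_STRATEGY_DISTRIBUTE is an assumed value from ggml.h's
    // enum ggml_numa_strategy; pass whatever strategy fits the host.
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

    // ... load a model, create a context, run inference ...

    llama_backend_free();
    return 0;
}
```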
@@ -689,6 +706,25 @@ extern "C" {
                           char * buf,
                          int32_t length);
 
+    /// Apply chat template. Inspired by hf apply_chat_template() on python.
+    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
+    /// @param chat Pointer to a list of multiple llama_chat_message
+    /// @param n_msg Number of llama_chat_message in this chat
+    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
+    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
+    /// @param length The size of the allocated buffer
+    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
+    LLAMA_API int32_t llama_chat_apply_template(
+        const struct llama_model * model,
+        const char * tmpl,
+        const struct llama_chat_message * chat,
+        size_t n_msg,
+        bool add_ass,
+        char * buf,
+        int32_t length);
+
     //
     // Grammar
     //
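Together with the `llama_chat_message` struct added above, this gives a "fill a caller-provided buffer, grow on overflow" API. A minimal sketch of that call pattern under my reading of the doc comments; model loading is omitted, `tmpl` is left NULL so the model's built-in template is used, and the manual NUL-termination is an assumption (the header does not say whether the output is terminated):

```c
#include <stdbool.h>
#include <stdlib.h>
#include "llama.h"

// Formats a two-message chat with the model's own template (tmpl == NULL).
// Returns a malloc'd, NUL-terminated prompt the caller must free, or NULL on error.
static char * format_chat(const struct llama_model * model) {
    const llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Summarize this diff in one sentence." },
    };
    const size_t n_msg = sizeof(chat) / sizeof(chat[0]);

    int32_t cap = 2 * 1024;   // generous first guess
    char *  buf = malloc(cap);
    if (buf == NULL) return NULL;

    int32_t n = llama_chat_apply_template(model, NULL, chat, n_msg,
                                          /*add_ass=*/true, buf, cap);
    if (n >= cap) {
        // Output did not fit: re-alloc and re-apply, as the @return note advises.
        char * grown = realloc(buf, (size_t) n + 1);
        if (grown == NULL) { free(buf); return NULL; }
        buf = grown;
        cap = n + 1;
        n   = llama_chat_apply_template(model, NULL, chat, n_msg, true, buf, cap);
    }
    if (n < 0 || n >= cap) { free(buf); return NULL; }  // unsupported template or still too small

    buf[n] = '\0';  // assumption: terminate the formatted prompt ourselves
    return buf;
}
```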
data/vendor/tmp/llama.cpp/scripts/get-flags.mk CHANGED

@@ -1,6 +1,6 @@
 ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
     GF_CC_IS_GCC = 1
-    GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null
+    GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }')
 else
     GF_CC_IS_CLANG = 1
     ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.6
+  version: 0.12.7
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-02-
+date: 2024-02-24 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: