llama_cpp 0.12.6 → 0.12.7
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +21 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +8 -1
- data/vendor/tmp/llama.cpp/Makefile +43 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +73 -43
- data/vendor/tmp/llama.cpp/ggml-backend.c +18 -9
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +908 -54
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +81 -203
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +124 -52
- data/vendor/tmp/llama.cpp/ggml.c +948 -504
- data/vendor/tmp/llama.cpp/ggml.h +24 -11
- data/vendor/tmp/llama.cpp/llama.cpp +688 -163
- data/vendor/tmp/llama.cpp/llama.h +37 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.h CHANGED

@@ -100,6 +100,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
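Two new quantization file types, LLAMA_FTYPE_MOSTLY_IQ1_S and LLAMA_FTYPE_MOSTLY_IQ4_NL, are added to enum llama_ftype. A minimal, hedged sketch of targeting the new IQ4_NL type through the C quantization API follows; it assumes the long-standing llama_model_quantize() / llama_model_quantize_default_params() declarations elsewhere in llama.h, and the file paths are placeholders.

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init();  // zero-argument form introduced in this release

    // Start from the library defaults, then select the new IQ4_NL file type.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ4_NL; // added in 0.12.7's bundled llama.cpp
    params.nthread = 4;

    // Placeholder paths; llama_model_quantize() returns 0 on success.
    if (llama_model_quantize("model-f16.gguf", "model-iq4_nl.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        llama_backend_free();
        return 1;
    }

    llama_backend_free();
    return 0;
}
```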
@@ -112,6 +114,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };

+    enum llama_pooling_type {
+        LLAMA_POOLING_NONE = 0,
+        LLAMA_POOLING_MEAN = 1,
+        LLAMA_POOLING_CLS = 2,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_NONE = 0, // single GPU
         LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
@@ -298,6 +306,12 @@ extern "C" {
         int32_t n_eval;
     };

+    // used in chat template
+    typedef struct llama_chat_message {
+        const char * role;
+        const char * content;
+    } llama_chat_message;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -306,7 +320,10 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_backend_init(bool numa);
+    LLAMA_API void llama_backend_init(void);
+
+    //optional:
+    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);

     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
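NUMA configuration is split out of backend initialization: llama_backend_init() no longer takes a flag, and llama_numa_init() becomes an optional follow-up call. A minimal sketch of the updated startup sequence, assuming the ggml_numa_strategy values (e.g. GGML_NUMA_STRATEGY_DISTRIBUTE) declared in the bundled ggml.h:

```c
#include <stdbool.h>
#include "llama.h"

void init_backend(bool use_numa) {
    // 0.12.7: llama_backend_init() is now zero-argument.
    llama_backend_init();

    // NUMA optimizations are opted into separately and are optional.
    // GGML_NUMA_STRATEGY_DISTRIBUTE is assumed to be one of the strategies
    // defined in the bundled ggml.h; pick the one that fits your setup.
    if (use_numa) {
        llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);
    }
}
```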
@@ -689,6 +706,25 @@ extern "C" {
         char * buf,
         int32_t length);

+    /// Apply chat template. Inspired by hf apply_chat_template() on python.
+    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
+    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
+    /// @param chat Pointer to a list of multiple llama_chat_message
+    /// @param n_msg Number of llama_chat_message in this chat
+    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
+    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
+    /// @param length The size of the allocated buffer
+    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
+    LLAMA_API int32_t llama_chat_apply_template(
+        const struct llama_model * model,
+        const char * tmpl,
+        const struct llama_chat_message * chat,
+        size_t n_msg,
+        bool add_ass,
+        char * buf,
+        int32_t length);
+
     //
     // Grammar
     //
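A hedged sketch of driving the new llama_chat_apply_template() entry point, following the buffer-sizing and re-apply advice in the header comment above. The model handle is assumed to come from llama_load_model_from_file(), passing NULL as tmpl selects the model's built-in template, and the messages are placeholders.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "llama.h"

// Format a short chat with the model's default template and return a
// heap-allocated, NUL-terminated prompt (caller frees), or NULL on failure.
char * format_chat(const struct llama_model * model) {
    llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Write a haiku about GGUF."    },
    };
    const size_t n_msg = sizeof(chat) / sizeof(chat[0]);

    // Recommended starting allocation: 2 * total characters of all messages.
    size_t alloc = 0;
    for (size_t i = 0; i < n_msg; i++) {
        alloc += strlen(chat[i].role) + strlen(chat[i].content);
    }
    alloc *= 2;

    char * buf = malloc(alloc + 1);
    if (buf == NULL) {
        return NULL;
    }
    int32_t res = llama_chat_apply_template(model, NULL, chat, n_msg, true, buf, (int32_t) alloc);
    if (res > (int32_t) alloc) {
        // Output did not fit: grow the buffer and re-apply the template.
        char * grown = realloc(buf, (size_t) res + 1);
        if (grown == NULL) {
            free(buf);
            return NULL;
        }
        buf = grown;
        res = llama_chat_apply_template(model, NULL, chat, n_msg, true, buf, res);
    }
    if (res < 0) {
        // Defensive: treat a negative result as an unsupported template.
        free(buf);
        return NULL;
    }
    buf[res] = '\0'; // the API reports a byte count; terminate for convenience
    return buf;
}
```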
data/vendor/tmp/llama.cpp/scripts/get-flags.mk CHANGED

@@ -1,6 +1,6 @@
 ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
     GF_CC_IS_GCC = 1
-    GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null
+    GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }')
 else
     GF_CC_IS_CLANG = 1
     ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.6
+  version: 0.12.7
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-02-
+date: 2024-02-24 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: