llama_cpp 0.12.1 → 0.12.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.h CHANGED
@@ -43,7 +43,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION
+#define LLAMA_SESSION_VERSION 4
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -118,6 +118,12 @@ extern "C" {
     LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
 };
 
+enum llama_split_mode {
+    LLAMA_SPLIT_NONE  = 0, // single GPU
+    LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+    LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+};
+
 typedef struct llama_token_data {
     llama_token id; // token id
     float logit;    // log-odds of the token
@@ -180,8 +186,16 @@
 
 struct llama_model_params {
     int32_t n_gpu_layers; // number of layers to store in VRAM
-
-
+    enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+    // main_gpu interpretation depends on split_mode:
+    // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+    // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+    // LLAMA_SPLIT_LAYER: ignored
+    int32_t main_gpu;
+
+    // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+    const float * tensor_split;
 
     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
     // If the provided progress_callback returns true, model loading continues.
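
The split-mode additions above change how multi-GPU placement is configured. As a rough illustration, the sketch below fills in the new fields when loading a model; it assumes the existing llama.h helpers llama_model_default_params() and llama_load_model_from_file(), and the layer count and device index are placeholders, not recommendations.

#include "llama.h"

// Sketch: load a model with layers (and KV cache) split across the available GPUs.
struct llama_model * load_with_split(const char * path) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                // placeholder: offload as many layers as possible
    mparams.split_mode   = LLAMA_SPLIT_LAYER; // new field: how to split the model across GPUs
    mparams.main_gpu     = 0;                 // ignored for LLAMA_SPLIT_LAYER (see comments above)
    mparams.tensor_split = NULL;              // or an array of LLAMA_MAX_DEVICES per-GPU proportions
    return llama_load_model_from_file(path, mparams);
}
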
@@ -235,6 +249,7 @@ extern "C" {
     bool quantize_output_tensor; // quantize output.weight
     bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+    void * imatrix;              // pointer to importance matrix data
 } llama_model_quantize_params;
 
 // grammar types
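
The new imatrix field threads importance-matrix data into quantization. A hedged sketch of its use follows; llama_model_quantize_default_params() and llama_model_quantize() are existing llama.h calls, the target ftype is a placeholder, and the in-memory layout of the importance-matrix data is owned by the quantization tooling, so it is passed through as an opaque pointer here.

#include "llama.h"

// Sketch: quantize a GGUF model, optionally guided by importance-matrix data.
static uint32_t quantize_with_imatrix(const char * fname_in,
                                      const char * fname_out,
                                      void * imatrix_data /* may be NULL */) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // placeholder target quantization type
    qparams.imatrix = imatrix_data;              // new in 0.12.2: opaque importance-matrix pointer
    return llama_model_quantize(fname_in, fname_out, &qparams); // returns 0 on success
}
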
@@ -699,14 +714,21 @@
     float penalty_present);
 
 /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-/// @param
-/// @
-/// @
-LLAMA_API void
+/// @param logits Logits extracted from the original generation context.
+/// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+/// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+LLAMA_API void llama_sample_apply_guidance(
+    struct llama_context * ctx,
+    float * logits,
+    float * logits_guidance,
+    float scale);
+
+LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
     struct llama_context * ctx,
     llama_token_data_array * candidates,
     struct llama_context * guidance_ctx,
-    float scale)
+    float scale),
+    "use llama_sample_apply_guidance() instead");
 
 /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 LLAMA_API void llama_sample_softmax(
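
llama_sample_apply_guidance() replaces the deprecated candidates-based call by operating on raw logit arrays. A minimal sketch, assuming two contexts built from the same model (the second holding the negative prompt) and a single-token decode in each so that llama_get_logits() points at the logits of the last token; the scale value is illustrative.

#include "llama.h"

// Sketch: mix guidance logits into the main logits in place, then sample as usual.
static void apply_cfg(struct llama_context * ctx_main,
                      struct llama_context * ctx_guidance) {
    float * logits          = llama_get_logits(ctx_main);     // logits of the last decoded token
    float * logits_guidance = llama_get_logits(ctx_guidance); // same position in the guidance context

    // scale = 1.0f means no guidance; higher values mean stronger guidance.
    llama_sample_apply_guidance(ctx_main, logits, logits_guidance, 1.5f);
}
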
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.1
+  version: 0.12.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-
+date: 2024-01-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: