llama_cpp 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +78 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +11 -0
- data/vendor/tmp/llama.cpp/Makefile +7 -10
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +512 -261
- data/vendor/tmp/llama.cpp/ggml-backend.h +43 -33
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1494 -559
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1868 -2002
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +2182 -44
- data/vendor/tmp/llama.cpp/ggml-quants.h +36 -1
- data/vendor/tmp/llama.cpp/ggml.c +222 -105
- data/vendor/tmp/llama.cpp/ggml.h +56 -35
- data/vendor/tmp/llama.cpp/llama.cpp +1271 -1618
- data/vendor/tmp/llama.cpp/llama.h +44 -8
- metadata +2 -2
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
|
44
44
|
|
|
45
45
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
|
46
|
-
#define LLAMA_SESSION_VERSION
|
|
46
|
+
#define LLAMA_SESSION_VERSION 4
|
|
47
47
|
|
|
48
48
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
|
|
49
49
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
|
@@ -103,6 +103,9 @@ extern "C" {
|
|
|
103
103
|
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
|
|
104
104
|
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
|
|
105
105
|
LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
|
|
106
|
+
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
|
107
|
+
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
|
108
|
+
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
|
106
109
|
|
|
107
110
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
|
108
111
|
};
|
|
@@ -115,6 +118,12 @@ extern "C" {
|
|
|
115
118
|
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
|
116
119
|
};
|
|
117
120
|
|
|
121
|
+
enum llama_split_mode {
|
|
122
|
+
LLAMA_SPLIT_NONE = 0, // single GPU
|
|
123
|
+
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
|
124
|
+
LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
|
125
|
+
};
|
|
126
|
+
|
|
118
127
|
typedef struct llama_token_data {
|
|
119
128
|
llama_token id; // token id
|
|
120
129
|
float logit; // log-odds of the token
|
|
@@ -177,8 +186,16 @@ extern "C" {
|
|
|
177
186
|
|
|
178
187
|
struct llama_model_params {
|
|
179
188
|
int32_t n_gpu_layers; // number of layers to store in VRAM
|
|
180
|
-
|
|
181
|
-
|
|
189
|
+
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
|
190
|
+
|
|
191
|
+
// main_gpu interpretation depends on split_mode:
|
|
192
|
+
// LLAMA_SPLIT_NONE: the GPU that is used for the entire model
|
|
193
|
+
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
|
|
194
|
+
// LLAMA_SPLIT_LAYER: ignored
|
|
195
|
+
int32_t main_gpu;
|
|
196
|
+
|
|
197
|
+
// proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
|
198
|
+
const float * tensor_split;
|
|
182
199
|
|
|
183
200
|
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
|
184
201
|
// If the provided progress_callback returns true, model loading continues.
|
|
@@ -232,6 +249,7 @@ extern "C" {
|
|
|
232
249
|
bool quantize_output_tensor; // quantize output.weight
|
|
233
250
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
|
234
251
|
bool pure; // disable k-quant mixtures and quantize all tensors to the same type
|
|
252
|
+
void * imatrix; // pointer to importance matrix data
|
|
235
253
|
} llama_model_quantize_params;
|
|
236
254
|
|
|
237
255
|
// grammar types
|
|
@@ -484,6 +502,17 @@ extern "C" {
|
|
|
484
502
|
llama_pos p1,
|
|
485
503
|
llama_pos delta);
|
|
486
504
|
|
|
505
|
+
// Integer division of the positions by factor of `d > 1`
|
|
506
|
+
// If the KV cache is RoPEd, the KV data is updated accordingly
|
|
507
|
+
// p0 < 0 : [0, p1]
|
|
508
|
+
// p1 < 0 : [p0, inf)
|
|
509
|
+
LLAMA_API void llama_kv_cache_seq_div(
|
|
510
|
+
struct llama_context * ctx,
|
|
511
|
+
llama_seq_id seq_id,
|
|
512
|
+
llama_pos p0,
|
|
513
|
+
llama_pos p1,
|
|
514
|
+
int d);
|
|
515
|
+
|
|
487
516
|
//
|
|
488
517
|
// State / sessions
|
|
489
518
|
//
|
|
@@ -685,14 +714,21 @@ extern "C" {
|
|
|
685
714
|
float penalty_present);
|
|
686
715
|
|
|
687
716
|
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
|
688
|
-
/// @param
|
|
689
|
-
/// @
|
|
690
|
-
/// @
|
|
691
|
-
LLAMA_API void llama_sample_classifier_free_guidance(
|
|
717
|
+
/// @param logits Logits extracted from the original generation context.
|
|
718
|
+
/// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
|
719
|
+
/// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
|
720
|
+
LLAMA_API void llama_sample_apply_guidance(
|
|
721
|
+
struct llama_context * ctx,
|
|
722
|
+
float * logits,
|
|
723
|
+
float * logits_guidance,
|
|
724
|
+
float scale);
|
|
725
|
+
|
|
726
|
+
LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
|
|
692
727
|
struct llama_context * ctx,
|
|
693
728
|
llama_token_data_array * candidates,
|
|
694
729
|
struct llama_context * guidance_ctx,
|
|
695
|
-
float scale)
|
|
730
|
+
float scale),
|
|
731
|
+
"use llama_sample_apply_guidance() instead");
|
|
696
732
|
|
|
697
733
|
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
|
698
734
|
LLAMA_API void llama_sample_softmax(
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: llama_cpp
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.12.0
|
|
4
|
+
version: 0.12.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- yoshoku
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-01-
|
|
11
|
+
date: 2024-01-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
|
14
14
|
email:
|