llama_cpp 0.3.2 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -88,7 +88,13 @@ extern "C" {
         int32_t n_batch;      // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
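On the Ruby side the new RoPE fields surface as `ContextParams` accessors (see the `llama_cpp.rbs` changes further down). A minimal sketch of stretching the context window via RoPE frequency scaling; the values are illustrative, not recommendations:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ctx           = 4096     # ask for a longer context...
params.rope_freq_base  = 10000.0  # keep the default RoPE base frequency
params.rope_freq_scale = 0.5      # ...and halve the RoPE frequency to cover it

# tensor_split is exposed read-only from Ruby in this release (rbs below):
p params.tensor_split             # => per-GPU split ratios (size: LLAMA_MAX_DEVICES)
```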
@@ -148,6 +154,8 @@ extern "C" {
         int32_t n_eval;
     };

+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -158,7 +166,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();

     LLAMA_API int64_t llama_time_us();

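`llama_backend_init`/`llama_backend_free` replace the previous one-shot initializer and are mirrored by module-level methods in the Ruby bindings (rbs below), alongside the new `llama_max_devices`. A rough lifecycle sketch:

```ruby
require 'llama_cpp'

LLaMACpp.backend_init(numa: false)          # call once, at program start
puts "max devices: #{LLaMACpp.max_devices}" # new llama_max_devices binding

begin
  # ... load a model, evaluate, sample ...
ensure
  LLaMACpp.backend_free                     # call once, at program end (MPI teardown)
end
```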
@@ -268,10 +278,21 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);

+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -280,6 +301,12 @@ extern "C" {
                           float * scores,
                             int   capacity);

+    LLAMA_API int llama_get_vocab_from_model(
+        const struct llama_model * model,
+                    const char * * strings,
+                           float * scores,
+                             int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -292,7 +319,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API const char * llama_token_to_str(
+        const struct llama_context * ctx,
+                         llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+          const struct llama_model * model,
+                         llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
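The `_with_model`/`_from_model` variants let callers query a model directly, without spinning up a `llama_context`; the Ruby `Model` class gains matching instance methods (rbs additions below). A sketch, assuming `Model.new` takes the same keywords as `Model#load` shown in the rbs, with a placeholder model path:

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: 'path/to/model.bin', # placeholder path
                            params: LLaMACpp::ContextParams.new)

puts model.n_vocab                                   # vocabulary size, no context needed
tokens = model.tokenize(text: 'Hello world', add_bos: true)
puts tokens.map { |id| model.token_to_str(id) }.join
```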
@@ -307,6 +340,16 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+              struct llama_context * ctx,
+            llama_token_data_array * candidates,
+              struct llama_context * guidance_ctx,
+                              float   scale);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

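Classifier-free guidance samples against two contexts over the same model: the main one and a guidance context primed with a negative prompt. A hedged sketch of the new Ruby binding (signature in the rbs below); it assumes `context` and `guidance_ctx` are `LLaMACpp::Context` instances over the same model that have both been eval'ed, and that `candidates` is built from the raw logits in the usual `TokenDataArray` fashion:

```ruby
# guidance_ctx was fed the negative prompt plus the main context's tokens.
candidates = LLaMACpp::TokenDataArray.new(
  context.logits.each_with_index.map do |logit, id|
    LLaMACpp::TokenData.new(id: id, logit: logit, p: 0.0)  # unsorted, as the header requires
  end
)
context.sample_classifier_free_guidance(candidates, guidance: guidance_ctx, scale: 1.5)
# scale == 1.0 disables guidance; larger values steer harder away from the negative prompt.
```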
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.2'
+  VERSION = '0.3.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-d924522'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer

-  def self?.
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -38,6 +39,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer

   class TokenData
     public
@@ -68,6 +70,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end

   class Timings
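`Model#vocab` returns the token strings and their scores as two parallel arrays, which is handy for inspecting a model's tokenizer; a small sketch assuming `model` is a loaded `LLaMACpp::Model`:

```ruby
strings, scores = model.vocab(capacity: model.n_vocab)
strings.first(5).zip(scores.first(5)).each do |str, score|
  puts format('%-12s %.3f', str, score)  # token text and its score
end
```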
@@ -108,6 +116,7 @@ module LLaMACpp
   def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
   def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
   def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+  def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
   def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
   def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
   def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -138,6 +147,10 @@ module LLaMACpp
   def main_gpu: () -> Integer
   def main_gpu=: (Integer) -> Integer
   def tensor_split: () -> Array[Float]
+  def rope_freq_base=: (Float) -> Float
+  def rope_freq_base: () -> Float
+  def rope_freq_scale=: (Float) -> Float
+  def rope_freq_scale: () -> Float
   def low_vram: () -> bool
   def low_vram=: (bool) -> bool
   def seed: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 0.3.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-
+date: 2023-07-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -35,6 +35,8 @@ files:
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
+- ext/llama_cpp/src/ggml-mpi.c
+- ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c