llama_cpp 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +37 -0
- data/ext/llama_cpp/extconf.rb +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +302 -112
- data/ext/llama_cpp/src/ggml-cuda.cu +677 -118
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +65 -45
- data/ext/llama_cpp/src/ggml-metal.metal +610 -484
- data/ext/llama_cpp/src/ggml-mpi.c +216 -0
- data/ext/llama_cpp/src/ggml-mpi.h +39 -0
- data/ext/llama_cpp/src/ggml.c +1146 -812
- data/ext/llama_cpp/src/ggml.h +77 -19
- data/ext/llama_cpp/src/k_quants.h +8 -0
- data/ext/llama_cpp/src/llama.cpp +289 -104
- data/ext/llama_cpp/src/llama.h +46 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -1
- data/sig/llama_cpp.rbs +14 -1
- metadata +4 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -88,7 +88,13 @@ extern "C" {
         int32_t n_batch; // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu; // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base; // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
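The new rope_freq_base / rope_freq_scale fields are also exposed through the gem's ContextParams accessors (see the data/sig/llama_cpp.rbs changes below). A minimal sketch of setting them from Ruby, assuming the usual no-argument ContextParams construction in this gem; the concrete values are illustrative only:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.rope_freq_base  = 10000.0 # RoPE base frequency (the conventional LLaMA default)
    params.rope_freq_scale = 0.5     # scale positions by 0.5, e.g. to stretch the usable context window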
@@ -148,6 +154,8 @@ extern "C" {
         int32_t n_eval;
     };

+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -158,7 +166,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();

     LLAMA_API int64_t llama_time_us();

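In the Ruby bindings these become the module-level LLaMACpp.backend_init / LLaMACpp.backend_free methods, and llama_max_devices from the previous hunk becomes LLaMACpp.max_devices (see the data/sig/llama_cpp.rbs changes below). A minimal usage sketch:

    require 'llama_cpp'

    LLaMACpp.backend_init(numa: false) # call once at program start
    puts LLaMACpp.max_devices          # how many devices tensor_split can address

    # ... load a model, build contexts, run inference ...

    LLaMACpp.backend_free              # call once at program end (currently only needed for MPI)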
@@ -268,10 +278,21 @@ extern "C" {
         int n_max_tokens,
         bool add_bos);

+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -280,6 +301,12 @@ extern "C" {
         float * scores,
         int capacity);

+    LLAMA_API int llama_get_vocab_from_model(
+        const struct llama_model * model,
+        const char * * strings,
+        float * scores,
+        int capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -292,7 +319,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+        const struct llama_context * ctx,
+        llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+        const struct llama_model * model,
+        llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
@@ -307,6 +340,16 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

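The Ruby layer exposes this sampler as Context#sample_classifier_free_guidance (see the data/sig/llama_cpp.rbs change further down). A hypothetical sketch of the call shape: ctx and guidance_ctx stand for two LLaMACpp::Context objects built from the same model, guidance_ctx seeded with the negative prompt, and candidates for a LLaMACpp::TokenDataArray built from ctx's unsorted logits:

    # scale == 1.0 means no guidance; larger values push sampling toward the main
    # prompt and away from whatever the negative prompt in guidance_ctx suggests.
    ctx.sample_classifier_free_guidance(candidates, guidance: guidance_ctx, scale: 1.5)

    # afterwards apply the usual samplers to pick the next token
    ctx.sample_top_k(candidates, k: 40)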
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.2'
+  VERSION = '0.3.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-d924522'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer

-  def self?.
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -38,6 +39,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer

   class TokenData
     public
@@ -68,6 +70,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end

   class Timings
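These additions expose model-level introspection and tokenization directly on the loaded model object. A hypothetical sketch using only the signatures above (the class name is outside this hunk's context and is assumed to be the gem's Model class; the prompt text is illustrative):

    # `model` is assumed to be an already-loaded LLaMACpp::Model
    puts model.n_vocab # vocabulary size
    puts model.n_ctx   # context length
    puts model.n_embd  # embedding dimension

    tokens = model.tokenize(text: 'Hello, world!', n_max_tokens: 32, add_bos: true)
    puts tokens.map { |t| model.token_to_str(t) }.join

    strings, scores = model.vocab(capacity: model.n_vocab) # vocabulary strings and their scores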
@@ -108,6 +116,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -138,6 +147,10 @@ module LLaMACpp
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
+    def rope_freq_base=: (Float) -> Float
+    def rope_freq_base: () -> Float
+    def rope_freq_scale=: (Float) -> Float
+    def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
     def seed: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 0.3.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-
+date: 2023-07-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -35,6 +35,8 @@ files:
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
+- ext/llama_cpp/src/ggml-mpi.c
+- ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c