llama_cpp 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +195 -2
- data/ext/llama_cpp/src/ggml-cuda.cu +499 -118
- data/ext/llama_cpp/src/ggml-cuda.h +1 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +357 -176
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +230 -261
- data/ext/llama_cpp/src/llama.h +31 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +21 -1
- metadata +3 -2
data/ext/llama_cpp/src/llama.h
CHANGED

@@ -46,6 +46,8 @@
 #define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        … (old field declarations, lines 84-88, not captured in this view)
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
@@ -132,6 +134,20 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -196,7 +212,7 @@ extern "C" {
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, …
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
@@ -226,6 +242,14 @@ extern "C" {
             int   n_past,
             int   n_threads);
 
+    // Same as llama_eval, but use float matrix input directly.
+    LLAMA_API int llama_eval_embd(
+            struct llama_context * ctx,
+            const float * embd,
+            int   n_tokens,
+            int   n_past,
+            int   n_threads);
+
     // Export a static computation graph for context of 511 and batch size of 1
     // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
     // parameters here to keep things simple
@@ -321,6 +345,7 @@ extern "C" {
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
     // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
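The two new entry points above, llama_eval_embd and llama_get_timings, are also exposed through the Ruby bindings (see the sig/llama_cpp.rbs changes below). A minimal sketch of the embedding-input path, assuming an already constructed LLaMACpp::Context; the zero vector is only a placeholder for a real embedding:

```ruby
require 'llama_cpp'

# `context` is assumed to be an existing LLaMACpp::Context (construction is
# unchanged by this release and is not shown in this diff).
def eval_raw_embedding(context)
  # eval_embd mirrors llama_eval_embd: it takes a flat Array[Float] of
  # n_tokens * n_embd values instead of token ids.
  embd = Array.new(context.n_embd, 0.0) # dummy single-token embedding
  context.eval_embd(tokens: embd, n_past: 0, n_tokens: 1)
end
```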
data/lib/llama_cpp/version.rb
CHANGED

@@ -3,8 +3,8 @@
   # llama_cpp.rb provides Ruby bindings for the llama.cpp.
   module LLaMACpp
     # The version of llama_cpp.rb you install.
-    VERSION = '0.3.0'
+    VERSION = '0.3.2'
 
     # The version of llama.cpp bundled with llama_cpp.rb.
-    LLAMA_CPP_VERSION = 'master-…
+    LLAMA_CPP_VERSION = 'master-481f793'
   end
data/lib/llama_cpp.rb
CHANGED

@@ -16,8 +16,22 @@ module LLaMACpp
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
   # @param n_threads [Integer] The number of threads.
+  # @param n_keep [Integer] The number of tokens to keep in the context.
+  # @param n_batch [Integer] The number of tokens to process in a batch.
+  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+  # @param repeat_penalty [Float] The repetition penalty.
+  # @param frequency [Float] The frequency penalty.
+  # @param presence [Float] The presence penalty.
+  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+  # @param top_p [Float] The probability threshold for nucleus sampling.
+  # @param tfs_z [Float] The z parameter for tail-free sampling.
+  # @param typical_p [Float] The probability for typical sampling.
+  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt, …
+  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+               n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
@@ -31,19 +45,8 @@ module LLaMACpp
 
     embd = []
     n_consumed = 0
-    n_keep = 10
     n_past = 0
     n_remain = n_predict
-    repeat_last_n = 64
-    repeat_penalty = 1.1
-    frequency = 0.0
-    presence = 0.0
-    top_k = 40
-    top_p = 0.95
-    tfs_z = 1.0
-    typical_p = 1.0
-    temperature = 0.8
-    n_batch = 512
     n_vocab = context.n_vocab
     output = []
 
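The sampling settings that generate previously hard-coded are now overridable keyword arguments with the same defaults. A hedged usage sketch (the prompt and the already constructed context are placeholders; only parameters documented in the diff above are used):

```ruby
require 'llama_cpp'

# `context` is assumed to be a LLaMACpp::Context built as in the gem's
# examples; every keyword below has the default shown in the diff above,
# so any subset can be omitted.
def greet(context)
  LLaMACpp.generate(
    context, 'Hello, my name is',
    n_predict: 64,          # shorter than the default 128
    n_threads: 4,
    top_k: 40, top_p: 0.95, temperature: 0.8,
    repeat_last_n: 64, repeat_penalty: 1.1
  )
end
```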
data/sig/llama_cpp.rbs
CHANGED

@@ -4,6 +4,7 @@ module LLaMACpp
   LLAMA_FILE_VERSION: String
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
+  LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer
 
@@ -27,7 +28,10 @@ module LLaMACpp
 
   def self?.init_backend: (?numa: bool) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
-  def self?.generate: (::LLaMACpp::Context, String, …
+  def self?.generate: (::LLaMACpp::Context, String,
+                       ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+                       ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+                       ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -66,18 +70,34 @@ module LLaMACpp
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
+  class Timings
+    public
+
+    def t_start_ms: () -> Float
+    def t_end_ms: () -> Float
+    def t_load_ms: () -> Float
+    def t_sample_ms: () -> Float
+    def t_p_eval_ms: () -> Float
+    def t_eval_ms: () -> Float
+    def n_sample: () -> Integer
+    def n_p_eval: () -> Integer
+    def n_eval: () -> Integer
+  end
+
   class Context
     public
 
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
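The new Context#timings accessor wraps the llama_timings struct added to llama.h, so the counters that print_timings only prints can now be read programmatically. A small sketch, assuming a context that has already evaluated a prompt:

```ruby
# `context` is assumed to be a LLaMACpp::Context that has already run an
# eval or generate call, so its counters are populated.
def report_speed(context)
  t = context.timings
  # Field names mirror struct llama_timings: times in milliseconds, n_* counts.
  ms_per_token = t.n_eval.positive? ? t.t_eval_ms / t.n_eval : 0.0
  puts format('load: %.2f ms, eval: %d tokens (%.2f ms/token), samples: %d',
              t.t_load_ms, t.n_eval, ms_per_token, t.n_sample)
end
```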
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-…
+date: 2023-07-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -25,6 +25,7 @@ files:
 - examples/README.md
 - examples/chat.rb
 - examples/embedding.rb
+- examples/prompt_jp.txt
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h