llama_cpp 0.1.0 → 0.1.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +153 -21
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-opencl.c +291 -215
- data/ext/llama_cpp/src/ggml.c +4428 -2143
- data/ext/llama_cpp/src/ggml.h +216 -13
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +300 -149
- data/ext/llama_cpp/src/llama.h +38 -25
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +4 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
```diff
@@ -19,10 +19,16 @@
 # define LLAMA_API
 #endif
 
-#define …
-#define …
-#define …
-#define …
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION           3
+#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
 #ifdef __cplusplus
```
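The file magics are now named per container format, with `LLAMA_FILE_MAGIC` aliased to the ggjt variant. As an illustration only (not part of the gem), a Ruby sketch that checks a file's leading magic against the GGJT value above; llama.cpp reads the magic as a native-endian (in practice little-endian) 32-bit integer:

```ruby
# Illustration only: detect a ggjt-format model file by its leading magic.
# The constant value comes from the header above; the helper is hypothetical.
GGJT_MAGIC = 0x67676a74 # 'ggjt'

def ggjt_file?(path)
  bytes = File.binread(path, 4)            # first four bytes of the file
  return false if bytes.nil? || bytes.bytesize < 4
  bytes.unpack1('V') == GGJT_MAGIC         # 'V' = 32-bit unsigned little-endian
end

puts ggjt_file?('models/7B/ggml-model-q4_0.bin') # path is a placeholder
```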
```diff
@@ -40,9 +46,9 @@ extern "C" {
     typedef int llama_token;
 
     typedef struct llama_token_data {
-        llama_token id;
-        float logit;
-        float p;
+        llama_token id;  // token id
+        float logit;     // log-odds of the token
+        float p;         // probability of the token
     } llama_token_data;
 
     typedef struct llama_token_data_array {
```
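The struct's fields are now documented. The gem mirrors this struct as `LLaMACpp::TokenData`; a minimal sketch, assuming the wrapper exposes a keyword constructor and plain accessors (neither is shown in this diff):

```ruby
# Hedged sketch: LLaMACpp::TokenData mirrors the C struct above.
# The keyword constructor and accessors are assumptions, not shown in this diff.
token = LLaMACpp::TokenData.new(id: 42, logit: -1.5, p: 0.0)
puts token.id    # token id
puts token.logit # log-odds of the token
puts token.p     # probability, filled in by e.g. sample_softmax
```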
```diff
@@ -54,9 +60,9 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx;
-        int n_parts;
-        int seed;
+        int n_ctx;        // text context
+        int n_gpu_layers; // number of layers to store in VRAM
+        int seed;         // RNG seed, -1 for random
 
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
```
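On the C side, `n_parts` gives way to `n_gpu_layers`. A minimal Ruby sketch of configuring a context with the accessors that do appear in the RBS later in this diff (`n_ctx=`, `seed=`); the `ContextParams`/`Context` constructor calls are assumptions, and whether 0.1.2 exposes an `n_gpu_layers` accessor is not shown here:

```ruby
require 'llama_cpp'

# Hedged sketch: n_ctx= and seed= are in the RBS below; the constructors
# and the model path are assumptions for illustration.
params = LLaMACpp::ContextParams.new
params.n_ctx = 512 # text context length
params.seed  = 42  # RNG seed (-1 for random)

context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin',
                                params: params)
```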
```diff
@@ -73,16 +79,16 @@ extern "C" {
 
     // model file types
     enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32 …
-        LLAMA_FTYPE_MOSTLY_F16 …
-        LLAMA_FTYPE_MOSTLY_Q4_0 …
-        LLAMA_FTYPE_MOSTLY_Q4_1 …
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2 …
-        // LLAMA_FTYPE_MOSTLY_Q4_3 …
-        LLAMA_FTYPE_MOSTLY_Q8_0 …
-        LLAMA_FTYPE_MOSTLY_Q5_0 …
-        LLAMA_FTYPE_MOSTLY_Q5_1 …
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
```
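The file-type values are now explicit, with Q4_2/Q4_3 retired. A sketch of quantizing a model via `LLaMACpp.model_quantize`, whose signature (including the optional `n_threads:`) appears in the RBS section of this diff; the paths are placeholders:

```ruby
require 'llama_cpp'

# Quantize an f16 model to Q5_1 (= 9 in the C enum above).
# model_quantize and its keywords are taken from the gem's RBS in this diff.
LLaMACpp.model_quantize(
  input_path:  'models/7B/ggml-model-f16.bin',
  output_path: 'models/7B/ggml-model-q5_1.bin',
  ftype:       LLaMACpp::LLAMA_FTYPE_MOSTLY_Q5_1,
  n_threads:   4
)
```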
```diff
@@ -90,6 +96,13 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
 
+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
```
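`llama_init_backend` is new and should run once before any model is loaded; the gem exposes it as `LLaMACpp.init_backend` per the RBS addition below:

```ruby
require 'llama_cpp'

# Initialize the llama + ggml backend once at program start,
# before creating any LLaMACpp::Context.
LLaMACpp.init_backend
```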
```diff
@@ -134,11 +147,11 @@
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * …
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, …
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
 
     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
```
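Session files persist the evaluated prompt/KV state so it can be restored instead of recomputed. The gem surfaces this pair as `load_session_file`/`save_session_file` (see the RBS additions below); in this sketch, `context` and `prompt_tokens` are assumed to already exist:

```ruby
# Hedged sketch: persist and restore a context's session state.
# Method names and keywords come from the RBS additions in this diff;
# the surrounding setup (context, prompt_tokens) is assumed.
session_path = 'prompt.session'

if File.exist?(session_path)
  context.load_session_file(session_path: session_path)    # restore KV cache
else
  # ... evaluate the prompt tokens here ...
  context.save_session_file(session_path: session_path,
                            session_tokens: prompt_tokens) # tokens evaluated so far
end
```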
```diff
@@ -202,16 +215,16 @@
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep …
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep …
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep …
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep …
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
```
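The samplers now take an explicit `min_keep` with no default argument. Roughly the same chain through the Ruby binding might look as follows; `TokenData`, `TokenDataArray`, and `sample_softmax` appear in the RBS in this diff, while `context.logits`, `sample_token`, and the `k:`/`prob:`/`temperature:` keywords are assumptions about the 0.1.2 binding surface:

```ruby
# Hedged sketch of a top-k -> top-p -> temperature sampling chain.
# Build candidates (id, logit, p) from the context's logits.
candidates = LLaMACpp::TokenDataArray.new(
  context.logits.map.with_index { |logit, id| LLaMACpp::TokenData.new(id: id, logit: logit, p: 0.0) }
)
context.sample_top_k(candidates, k: 40)                    # keep the 40 most likely tokens
context.sample_top_p(candidates, prob: 0.95)               # nucleus sampling
context.sample_temperature(candidates, temperature: 0.8)   # flatten/sharpen the distribution
token_id = context.sample_token(candidates)                # draw the next token
```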
data/lib/llama_cpp/client.rb
CHANGED
```diff
@@ -9,7 +9,6 @@ module LLaMACpp
     # @param lora_adapter_path [String] The path to the LoRA adapter file.
     # @param lora_base_path [String] The path to the LoRA base model file.
     # @param n_ctx [Integer] The context size.
-    # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
     # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
     # @param use_mmap [Boolean] The flag whether to use mmap.
     # @param use_mlock [Boolean] The flag hether to use mlock.
```
```diff
@@ -19,7 +18,7 @@ module LLaMACpp
     # @return [Client]
     # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
     def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
-                   n_ctx: 512, …
+                   n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
                    embedding: false,
                    n_threads: 1, seed: 0)
       @params = {
```
```diff
@@ -27,7 +26,6 @@ module LLaMACpp
         lora_adapter_path: lora_adapter_path,
         lora_base_path: lora_base_path,
         n_ctx: n_ctx,
-        n_parts: n_parts,
         memory_f16: memory_f16,
         use_mmap: use_mmap,
         use_mlock: use_mlock,
```
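With `n_parts` gone, `Client.new` accepts the keyword set below; the constructor and `completions` signatures come from the RBS `Client` section in this diff, and the model path is a placeholder:

```ruby
require 'llama_cpp'

# Hedged sketch: construct a Client with the 0.1.2 keyword set (no n_parts).
client = LLaMACpp::Client.new(
  model_path: 'models/7B/ggml-model-q4_0.bin',
  n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
  n_threads: 4, seed: 42
)

puts client.completions('Hello, my name is', max_tokens: 64)
```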
data/lib/llama_cpp/version.rb
CHANGED
```diff
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.0'
+  VERSION = '0.1.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-…
+  LLAMA_CPP_VERSION = 'master-265db98'
 end
```
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
```diff
@@ -10,11 +10,11 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
   LLAMA_FTYPE_MOSTLY_Q4_1: Integer
   LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
-  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
 
+  def self?.init_backend: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
```
```diff
@@ -65,6 +65,8 @@ module LLaMACpp
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
     def kv_cache_token_count: () -> Integer
     def set_rng_seed: (Integer) -> void
+    def load_session_file: (session_path: String) -> void
+    def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
```
```diff
@@ -90,8 +92,6 @@ module LLaMACpp
     def logits_all=: (bool) -> bool
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
-    def n_parts: () -> Integer
-    def n_parts=: (Integer) -> Integer
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
```
```diff
@@ -106,7 +106,7 @@ module LLaMACpp
 
   class Client
     def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
-                   ?n_ctx: Integer, ?…
+                   ?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
                    ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
     def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
                     ?frequency: Float, ?presence: Float,
```
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-…
+date: 2023-05-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
```