llama_cpp 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/extconf.rb +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +153 -21
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-opencl.c +291 -215
- data/ext/llama_cpp/src/ggml.c +4428 -2143
- data/ext/llama_cpp/src/ggml.h +216 -13
- data/ext/llama_cpp/src/llama-util.h +23 -23
- data/ext/llama_cpp/src/llama.cpp +300 -149
- data/ext/llama_cpp/src/llama.h +38 -25
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -0
- data/sig/llama_cpp.rbs +4 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,10 +19,16 @@
 # define LLAMA_API
 #endif
 
-#define
-#define
-#define
-#define
+#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION 3
+#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
 #ifdef __cplusplus
@@ -40,9 +46,9 @@ extern "C" {
 typedef int llama_token;
 
 typedef struct llama_token_data {
-llama_token id;
-float logit;
-float p;
+llama_token id; // token id
+float logit; // log-odds of the token
+float p; // probability of the token
 } llama_token_data;
 
 typedef struct llama_token_data_array {
@@ -54,9 +60,9 @@ extern "C" {
 typedef void (*llama_progress_callback)(float progress, void *ctx);
 
 struct llama_context_params {
-int n_ctx;
-int
-int seed;
+int n_ctx; // text context
+int n_gpu_layers; // number of layers to store in VRAM
+int seed; // RNG seed, -1 for random
 
 bool f16_kv; // use fp16 for KV cache
 bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -73,16 +79,16 @@ extern "C" {
 
 // model file types
 enum llama_ftype {
-LLAMA_FTYPE_ALL_F32
-LLAMA_FTYPE_MOSTLY_F16
-LLAMA_FTYPE_MOSTLY_Q4_0
-LLAMA_FTYPE_MOSTLY_Q4_1
+LLAMA_FTYPE_ALL_F32 = 0,
+LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2
-// LLAMA_FTYPE_MOSTLY_Q4_3
-LLAMA_FTYPE_MOSTLY_Q8_0
-LLAMA_FTYPE_MOSTLY_Q5_0
-LLAMA_FTYPE_MOSTLY_Q5_1
+// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
 };
 
 LLAMA_API struct llama_context_params llama_context_default_params();
@@ -90,6 +96,13 @@ extern "C" {
 LLAMA_API bool llama_mmap_supported();
 LLAMA_API bool llama_mlock_supported();
 
+// TODO: not great API - very likely to change
+// Initialize the llama + ggml backend
+// Call once at the start of the program
+LLAMA_API void llama_init_backend();
+
+LLAMA_API int64_t llama_time_us();
+
 // Various functions for loading a ggml llama model.
 // Allocate (almost) all memory needed for the model.
 // Return NULL on failure
@@ -134,11 +147,11 @@ extern "C" {
 // Copies the state to the specified destination address.
 // Destination needs to have allocated enough memory.
 // Returns the number of bytes copied
-LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
+LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
 
 // Set the state reading from the specified address
 // Returns the number of bytes read
-LLAMA_API size_t llama_set_state_data(struct llama_context * ctx,
+LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
 
 // Save/load session file
 LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
@@ -202,16 +215,16 @@ extern "C" {
 LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep
+LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 
 /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
+LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 
 /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep
+LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 
 /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
+LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
 /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
data/lib/llama_cpp/client.rb
CHANGED
@@ -9,7 +9,6 @@ module LLaMACpp
 # @param lora_adapter_path [String] The path to the LoRA adapter file.
 # @param lora_base_path [String] The path to the LoRA base model file.
 # @param n_ctx [Integer] The context size.
-# @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
 # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
 # @param use_mmap [Boolean] The flag whether to use mmap.
 # @param use_mlock [Boolean] The flag hether to use mlock.
@@ -19,7 +18,7 @@ module LLaMACpp
 # @return [Client]
 # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
 def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
-n_ctx: 512,
+n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
 embedding: false,
 n_threads: 1, seed: 0)
 @params = {
@@ -27,7 +26,6 @@ module LLaMACpp
 lora_adapter_path: lora_adapter_path,
 lora_base_path: lora_base_path,
 n_ctx: n_ctx,
-n_parts: n_parts,
 memory_f16: memory_f16,
 use_mmap: use_mmap,
 use_mlock: use_mlock,
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.1.0'
+VERSION = '0.1.2'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = 'master-
+LLAMA_CPP_VERSION = 'master-265db98'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -10,11 +10,11 @@ module LLaMACpp
 LLAMA_FTYPE_MOSTLY_Q4_0: Integer
 LLAMA_FTYPE_MOSTLY_Q4_1: Integer
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
-LLAMA_FTYPE_MOSTLY_Q4_2: Integer
 LLAMA_FTYPE_MOSTLY_Q8_0: Integer
 LLAMA_FTYPE_MOSTLY_Q5_0: Integer
 LLAMA_FTYPE_MOSTLY_Q5_1: Integer
 
+def self?.init_backend: () -> void
 def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
 def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
 def self?.print_system_info: () -> void
@@ -65,6 +65,8 @@ module LLaMACpp
 def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
 def kv_cache_token_count: () -> Integer
 def set_rng_seed: (Integer) -> void
+def load_session_file: (session_path: String) -> void
+def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
 def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
 def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
 def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
@@ -90,8 +92,6 @@ module LLaMACpp
 def logits_all=: (bool) -> bool
 def n_ctx: () -> Integer
 def n_ctx=: (Integer) -> Integer
-def n_parts: () -> Integer
-def n_parts=: (Integer) -> Integer
 def seed: () -> Integer
 def seed=: (Integer) -> Integer
 def use_mlock: () -> bool
@@ -106,7 +106,7 @@ module LLaMACpp
 
 class Client
 def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
-?n_ctx: Integer, ?
+?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
 ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
 def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
 ?frequency: Float, ?presence: Float,
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-version: 0.1.0
+version: 0.1.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-
+date: 2023-05-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: