llama_cpp 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +93 -15
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +85 -122
- data/ext/llama_cpp/src/ggml.c +6268 -4208
- data/ext/llama_cpp/src/ggml.h +205 -12
- data/ext/llama_cpp/src/llama.cpp +159 -79
- data/ext/llama_cpp/src/llama.h +10 -10
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,7 +19,7 @@
 # define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION
+#define LLAMA_FILE_VERSION 2
 #define LLAMA_FILE_MAGIC 'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC 'ggsn'
@@ -54,9 +54,9 @@ extern "C" {
 typedef void (*llama_progress_callback)(float progress, void *ctx);
 
 struct llama_context_params {
-int n_ctx;
-int
-int seed;
+int n_ctx;        // text context
+int n_gpu_layers; // number of layers to store in VRAM
+int seed;         // RNG seed, -1 for random
 
 bool f16_kv;     // use fp16 for KV cache
 bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -78,7 +78,7 @@
 LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = 5, //
+// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
 // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
 LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
@@ -134,7 +134,7 @@
 // Copies the state to the specified destination address.
 // Destination needs to have allocated enough memory.
 // Returns the number of bytes copied
-LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
+LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
 
 // Set the state reading from the specified address
 // Returns the number of bytes read
@@ -202,16 +202,16 @@
 LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep
+LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 
 /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
+LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 
 /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep
+LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 
 /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
+LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
 /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
data/lib/llama_cpp/client.rb
CHANGED
@@ -9,7 +9,6 @@ module LLaMACpp
 # @param lora_adapter_path [String] The path to the LoRA adapter file.
 # @param lora_base_path [String] The path to the LoRA base model file.
 # @param n_ctx [Integer] The context size.
-# @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
 # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
 # @param use_mmap [Boolean] The flag whether to use mmap.
 # @param use_mlock [Boolean] The flag hether to use mlock.
@@ -19,7 +18,7 @@ module LLaMACpp
 # @return [Client]
 # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
 def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
-n_ctx: 512,
+n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
 embedding: false,
 n_threads: 1, seed: 0)
 @params = {
@@ -27,7 +26,6 @@ module LLaMACpp
 lora_adapter_path: lora_adapter_path,
 lora_base_path: lora_base_path,
 n_ctx: n_ctx,
-n_parts: n_parts,
 memory_f16: memory_f16,
 use_mmap: use_mmap,
 use_mlock: use_mlock,
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.1.
+VERSION = '0.1.1'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = 'master-
+LLAMA_CPP_VERSION = 'master-6986c78'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -10,7 +10,6 @@ module LLaMACpp
 LLAMA_FTYPE_MOSTLY_Q4_0: Integer
 LLAMA_FTYPE_MOSTLY_Q4_1: Integer
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
-LLAMA_FTYPE_MOSTLY_Q4_2: Integer
 LLAMA_FTYPE_MOSTLY_Q8_0: Integer
 LLAMA_FTYPE_MOSTLY_Q5_0: Integer
 LLAMA_FTYPE_MOSTLY_Q5_1: Integer
@@ -65,6 +64,8 @@ module LLaMACpp
 def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
 def kv_cache_token_count: () -> Integer
 def set_rng_seed: (Integer) -> void
+def load_session_file: (session_path: String) -> void
+def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
 def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
 def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
 def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
@@ -90,8 +91,6 @@ module LLaMACpp
 def logits_all=: (bool) -> bool
 def n_ctx: () -> Integer
 def n_ctx=: (Integer) -> Integer
-def n_parts: () -> Integer
-def n_parts=: (Integer) -> Integer
 def seed: () -> Integer
 def seed=: (Integer) -> Integer
 def use_mlock: () -> bool
@@ -106,7 +105,7 @@ module LLaMACpp
 
 class Client
 def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
-?n_ctx: Integer, ?
+?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
 ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
 def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
 ?frequency: Float, ?presence: Float,
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-version: 0.1.
+version: 0.1.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-
+date: 2023-05-21 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: