llama_cpp 0.1.0 → 0.1.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +93 -15
- data/ext/llama_cpp/src/ggml-cuda.h +2 -0
- data/ext/llama_cpp/src/ggml-opencl.c +85 -122
- data/ext/llama_cpp/src/ggml.c +6268 -4208
- data/ext/llama_cpp/src/ggml.h +205 -12
- data/ext/llama_cpp/src/llama.cpp +159 -79
- data/ext/llama_cpp/src/llama.h +10 -10
- data/lib/llama_cpp/client.rb +1 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -4
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -19,7 +19,7 @@
 # define LLAMA_API
 #endif

-#define LLAMA_FILE_VERSION
+#define LLAMA_FILE_VERSION 2
 #define LLAMA_FILE_MAGIC 'ggjt'
 #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
 #define LLAMA_SESSION_MAGIC 'ggsn'
@@ -54,9 +54,9 @@ extern "C" {
    typedef void (*llama_progress_callback)(float progress, void *ctx);

    struct llama_context_params {
-        int n_ctx;
-        int
-        int seed;
+        int n_ctx;        // text context
+        int n_gpu_layers; // number of layers to store in VRAM
+        int seed;         // RNG seed, -1 for random

        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -78,7 +78,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, //
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
@@ -134,7 +134,7 @@ extern "C" {
    // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t *
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);

    // Set the state reading from the specified address
    // Returns the number of bytes read
@@ -202,16 +202,16 @@ extern "C" {
    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);

    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);

    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
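The `llama_context_params` fields annotated above are what the gem's `ContextParams` object wraps on the Ruby side. As a minimal sketch of setting the two fields that the RBS signatures further down expose as accessors (`n_ctx=` and `seed=`); the `Context.new(model_path:, params:)` keywords and the model path are assumptions for illustration, not part of this diff:

require 'llama_cpp'

# Configure options that mirror the llama_context_params struct shown above.
params = LLaMACpp::ContextParams.new
params.n_ctx = 512  # text context size
params.seed  = 42   # RNG seed (-1 for random, per the header comment)

# Hypothetical model path; the constructor keywords are assumed here.
context = LLaMACpp::Context.new(model_path: '/path/to/ggml-model-q4_0.bin', params: params)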
data/lib/llama_cpp/client.rb
CHANGED
@@ -9,7 +9,6 @@ module LLaMACpp
    # @param lora_adapter_path [String] The path to the LoRA adapter file.
    # @param lora_base_path [String] The path to the LoRA base model file.
    # @param n_ctx [Integer] The context size.
-    # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
    # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
    # @param use_mmap [Boolean] The flag whether to use mmap.
    # @param use_mlock [Boolean] The flag hether to use mlock.
@@ -19,7 +18,7 @@ module LLaMACpp
    # @return [Client]
    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
-                   n_ctx: 512,
+                   n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
                   embedding: false,
                   n_threads: 1, seed: 0)
      @params = {
@@ -27,7 +26,6 @@ module LLaMACpp
        lora_adapter_path: lora_adapter_path,
        lora_base_path: lora_base_path,
        n_ctx: n_ctx,
-        n_parts: n_parts,
        memory_f16: memory_f16,
        use_mmap: use_mmap,
        use_mlock: use_mlock,
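With `n_parts` dropped from `Client#initialize`, the constructor now takes only the keywords listed above. A small usage sketch, assuming the defaults shown in this hunk and the `completions` signature declared in `data/sig/llama_cpp.rbs` below (the model path is a placeholder):

require 'llama_cpp'

client = LLaMACpp::Client.new(
  model_path: '/path/to/ggml-model-q4_0.bin',
  n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
  embedding: false, n_threads: 4, seed: 0
)
# Passing n_parts: here now raises ArgumentError (unknown keyword), since 0.1.1 removed it.
puts client.completions('Hello there!', max_tokens: 64)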
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.0'
+  VERSION = '0.1.1'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-6986c78'
 end
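Both constants are plain module constants, so an application can check at runtime which gem release and which bundled llama.cpp revision it is running against:

require 'llama_cpp'

puts LLaMACpp::VERSION            # => "0.1.1"
puts LLaMACpp::LLAMA_CPP_VERSION  # => "master-6986c78"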
data/sig/llama_cpp.rbs
CHANGED
@@ -10,7 +10,6 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
-  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer
@@ -65,6 +64,8 @@ module LLaMACpp
    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
    def kv_cache_token_count: () -> Integer
    def set_rng_seed: (Integer) -> void
+    def load_session_file: (session_path: String) -> void
+    def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
    def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
@@ -90,8 +91,6 @@ module LLaMACpp
    def logits_all=: (bool) -> bool
    def n_ctx: () -> Integer
    def n_ctx=: (Integer) -> Integer
-    def n_parts: () -> Integer
-    def n_parts=: (Integer) -> Integer
    def seed: () -> Integer
    def seed=: (Integer) -> Integer
    def use_mlock: () -> bool
@@ -106,7 +105,7 @@ module LLaMACpp

  class Client
    def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
-                   ?n_ctx: Integer, ?
+                   ?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
                   ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
    def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
                    ?frequency: Float, ?presence: Float,
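The two `Context` methods added to the signatures above, `load_session_file` and `save_session_file`, are the session-persistence additions in 0.1.1. A hedged sketch of the intended round trip, assuming `context` is a `LLaMACpp::Context` and `session_tokens` is the `Array[Integer]` of tokens already evaluated in it (how those tokens were produced is omitted here; only the two new calls are shown):

# Persist the evaluated prompt state together with the tokens that produced it.
context.save_session_file(session_path: '/tmp/prompt.session', session_tokens: session_tokens)

# Later, or in another process, restore the saved state instead of re-evaluating the prompt.
context.load_session_file(session_path: '/tmp/prompt.session')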
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-
+date: 2023-05-21 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: