llama_cpp 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +122 -183
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +57 -8
- data/ext/llama_cpp/src/ggml-metal.metal +171 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
- data/ext/llama_cpp/src/ggml.c +375 -93
- data/ext/llama_cpp/src/ggml.h +11 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +459 -153
- data/ext/llama_cpp/src/llama.h +34 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +15 -16
- metadata +3 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -133,11 +133,12 @@ extern "C" {
     typedef struct llama_batch {
         int32_t n_tokens;

-        llama_token  * token;
-        float        * embd;
-        llama_pos    * pos;
-        llama_seq_id * seq_id;
-        int8_t       * logits;
+        llama_token  *  token;
+        float        *  embd;
+        llama_pos    *  pos;
+        int32_t      *  n_seq_id;
+        llama_seq_id ** seq_id;
+        int8_t       *  logits;

         // NOTE: helpers for smooth API transition - can be deprecated in the future
         // for future-proof code, use the above fields instead and ignore everything below
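The batch now tracks sequence membership per token: seq_id[i] points at a list of n_seq_id[i] sequence ids, where 0.7.x stored a single id per token. A rough sketch of how this surfaces through the gem's Batch class, using the accessor signatures from data/sig/llama_cpp.rbs further down; the token id is a placeholder and the set_token writer is assumed to exist alongside the get_token reader shown there, so treat this as illustration rather than the gem's documented usage:

    # Sketch: one batch token shared by two sequences (n_seq_max: 2).
    batch = LLaMACpp::Batch.new(n_tokens: 1, embd: 0, n_seq_max: 2)
    batch.n_tokens = 1
    batch.set_token(0, 1)      # assumed writer; token id is a placeholder
    batch.set_pos(0, 0)        # position of token 0 within its sequences
    batch.set_seq_id(0, 0, 0)  # seq-id slot 0 of token 0 -> sequence 0
    batch.set_seq_id(0, 1, 1)  # seq-id slot 1 of token 0 -> sequence 1
    batch.set_logit(0, true)   # request logits for this token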
@@ -446,7 +447,8 @@ extern "C" {
                            llama_pos   pos_0,
                         llama_seq_id   seq_id);

-    // Allocates a batch of tokens on the heap
+    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
+    // Each token can be assigned up to n_seq_max sequence ids
     // The batch has to be freed with llama_batch_free()
     // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
     // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
@@ -454,7 +456,8 @@ extern "C" {
     // All members are left uninitialized
     LLAMA_API struct llama_batch llama_batch_init(
             int32_t n_tokens,
-            int32_t embd);
+            int32_t embd,
+            int32_t n_seq_max);

     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
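The Ruby wrapper exposes the new argument through Batch#initialize (see the RBS changes below). A minimal sketch, assuming embd: 0 so the token buffer rather than the embedding buffer is allocated:

    # Room for up to 512 tokens, each carrying at most 8 sequence ids.
    batch = LLaMACpp::Batch.new(n_tokens: 512, embd: 0, n_seq_max: 8)
    # With a non-zero embd, an n_tokens * embd float buffer would be
    # allocated instead of the token buffer, per the header comment above.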
@@ -491,37 +494,41 @@ extern "C" {
     // Vocab
     //

-    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);

-    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);

-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);

     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);
-    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);
-    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx);
-    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx);
-    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx);
-    LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx);
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle

     //
     // Tokenization
     //

-    /// @details Convert the provided text into tokens.
-    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
-    /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @details Convert the provided text into tokens.
+    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+    ///                Does not insert a leading space.
     LLAMA_API int llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                              int   text_len,
                      llama_token * tokens,
                              int   n_max_tokens,
-                            bool   add_bos);
+                            bool   add_bos,
+                            bool   special);

     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
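The vocab helpers now hang off the model instead of the context, and tokenization gains a special flag; the gem mirrors both on LLaMACpp::Model, as the RBS diff below shows. A hedged sketch; the model path is a placeholder and the Model.new keywords are assumed from the gem's README, not from this diff:

    require 'llama_cpp'

    model = LLaMACpp::Model.new(model_path: 'model.gguf',
                                params: LLaMACpp::ModelParams.new)
    model.token_bos                                  # 0.7.x: context.token_bos
    model.tokenize(text: '<s>Hello', special: true)  # '<s>' stays one token if
                                                     # the vocab marks it special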
@@ -554,21 +561,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-              struct llama_context * ctx,
-            llama_token_data_array * candidates,
-                 const llama_token * last_tokens,
-                            size_t   last_tokens_size,
-                             float   penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+    LLAMA_API void llama_sample_repetition_penalties(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
                  const llama_token * last_tokens,
-                            size_t   last_tokens_size,
-                             float   alpha_frequency,
-                             float   alpha_presence);
+                            size_t   penalty_last_n,
+                             float   penalty_repeat,
+                             float   penalty_freq,
+                             float   penalty_present);

     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
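The two penalty samplers collapse into one call; data/lib/llama_cpp.rb and the RBS below carry the same rename on Context. A before/after sketch where candidates, recent, and the penalty values are placeholders:

    # 0.7.x, two calls (names per the old RBS signatures):
    #   context.sample_repetition_penalty(candidates, recent, penalty: 1.1)
    #   context.sample_frequency_and_presence_penalties(
    #     candidates, recent, frequency: 0.0, presence: 0.0)
    # 0.9.0, one call:
    context.sample_repetition_penalties(
      candidates, recent,
      penalty_repeat: 1.1, penalty_freq: 0.0, penalty_present: 0.0
    )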
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.7.1'
+  VERSION = '0.9.0'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1429'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp

       # apply penalties
       last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-      context.sample_frequency_and_presence_penalties(
-        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+      context.sample_repetition_penalties(
+        candidates, last_n_tokens[-last_n_repeat..],
+        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
       )

       # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp

       embd.each { |token| output << context.model.token_to_piece(token) }

-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
     end

     output.join.scrub('?').strip.delete_prefix(prompt).strip
data/sig/llama_cpp.rbs
CHANGED
@@ -78,10 +78,20 @@ module LLaMACpp
     def n_embd: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
-    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end

   class Timings
|
|
117
127
|
class Batch
|
118
128
|
public
|
119
129
|
|
120
|
-
def initialize: (n_tokens: Integer, embd: Integer) -> void
|
130
|
+
def initialize: (n_tokens: Integer, embd: Integer, n_seq_max: Integer) -> void
|
121
131
|
def n_tokens=: (Integer) -> Integer
|
122
132
|
def n_tokens: () -> Integer
|
123
133
|
def all_pos_zero=: (Integer) -> Integer
|
@@ -130,8 +140,8 @@ module LLaMACpp
|
|
130
140
|
def get_token: (Integer) -> Integer
|
131
141
|
def set_pos: (Integer, Integer) -> Integer
|
132
142
|
def get_pos: (Integer) -> Integer
|
133
|
-
def set_seq_id: (Integer, Integer) -> Integer
|
134
|
-
def get_seq_id: (Integer) -> Integer
|
143
|
+
def set_seq_id: (Integer, Integer, Integer) -> Integer
|
144
|
+
def get_seq_id: (Integer, Integer) -> Integer
|
135
145
|
def set_logit: (Integer, bool) -> bool
|
136
146
|
def get_logit: (Integer) -> bool
|
137
147
|
end
|
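Matching the C struct change, set_seq_id now takes a slot index so a token can hold several sequence ids, and get_seq_id reads a slot back. A tiny sketch with placeholder values:

    batch.set_seq_id(0, 1, 7)  # token 0, seq-id slot 1 -> sequence 7
    batch.get_seq_id(0, 1)     # => 7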
@@ -143,16 +153,6 @@ module LLaMACpp

     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -170,8 +170,7 @@ module LLaMACpp
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.7.1
+  version: 0.9.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-
+date: 2023-10-28 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -78,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.4.
+rubygems_version: 3.4.20
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.