llama_cpp 0.7.1 → 0.9.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +122 -183
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +57 -8
- data/ext/llama_cpp/src/ggml-metal.metal +171 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
- data/ext/llama_cpp/src/ggml.c +375 -93
- data/ext/llama_cpp/src/ggml.h +11 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +459 -153
- data/ext/llama_cpp/src/llama.h +34 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +15 -16
- metadata +3 -3
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -133,11 +133,12 @@ extern "C" {
     typedef struct llama_batch {
         int32_t n_tokens;

-        llama_token  * token;
-        float        * embd;
-        llama_pos    * pos;
-        llama_seq_id * seq_id;
-        int8_t       * logits;
+        llama_token  *  token;
+        float        *  embd;
+        llama_pos    *  pos;
+        int32_t      *  n_seq_id;
+        llama_seq_id ** seq_id;
+        int8_t       *  logits;

         // NOTE: helpers for smooth API transition - can be deprecated in the future
         // for future-proof code, use the above fields instead and ignore everything below
@@ -446,7 +447,8 @@ extern "C" {
                    llama_pos   pos_0,
                 llama_seq_id   seq_id);

-    // Allocates a batch of tokens on the heap
+    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
+    // Each token can be assigned up to n_seq_max sequence ids
     // The batch has to be freed with llama_batch_free()
     // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
     // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
@@ -454,7 +456,8 @@ extern "C" {
     // All members are left uninitialized
     LLAMA_API struct llama_batch llama_batch_init(
             int32_t n_tokens,
-            int32_t embd);
+            int32_t embd,
+            int32_t n_seq_max);

     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
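Taken together, these hunks change how batches are sized: a batch now records, for every token, how many sequence ids it carries, so `llama_batch_init` needs the per-token maximum (`n_seq_max`) up front. The Ruby bindings expose this through `LLaMACpp::Batch` (see the `sig/llama_cpp.rbs` changes below). A minimal sketch of the new allocation and the three-argument `set_seq_id`; `set_token` is assumed to mirror the `get_token` accessor in the signatures, and all ids are arbitrary:

```ruby
require 'llama_cpp'

# Room for up to 32 tokens, stored as token ids (embd: 0);
# each token may be assigned up to 2 sequence ids.
batch = LLaMACpp::Batch.new(n_tokens: 32, embd: 0, n_seq_max: 2)

batch.n_tokens = 1
batch.set_token(0, 1)     # slot 0 holds token id 1 (assumed setter, cf. get_token)
batch.set_pos(0, 0)       # at position 0 of its sequence
batch.set_seq_id(0, 0, 0) # slot 0, seq-id index 0 -> sequence 0 (new 3-arg form)
batch.set_logit(0, true)  # request logits for this token
```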
@@ -491,37 +494,41 @@ extern "C" {
     // Vocab
     //

-    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);

-    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+    LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);

-    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);

     // Special tokens
-    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
+    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+
     // codellama infill tokens
-    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx);  // Beginning of infill prefix
-    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx);  // Beginning of infill middle
-    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx);  // Beginning of infill suffix
-    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx);  // End of infill middle
+    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
+    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
+    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
+    LLAMA_API llama_token llama_token_eot   (const struct llama_model * model); // End of infill middle

     //
     // Tokenization
     //

-    /// @details Convert the provided text into tokens.
-    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
-    /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @details Convert the provided text into tokens.
+    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
+    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
+    ///                Does not insert a leading space.
     LLAMA_API int llama_tokenize(
         const struct llama_model * model,
                       const char * text,
                              int   text_len,
                      llama_token * tokens,
                              int   n_max_tokens,
-                            bool   add_bos);
+                            bool   add_bos,
+                            bool   special);

     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
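In the Ruby bindings the vocab helpers move accordingly from `Context` to `Model`, and `tokenize` gains a `special:` keyword (see the `sig/llama_cpp.rbs` diff below). A hedged sketch; the model path is hypothetical and any GGUF model would do:

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: 'llama-2-7b.Q4_K_M.gguf', # hypothetical path
                            params: LLaMACpp::ModelParams.new)

bos = model.token_bos # previously context.token_bos
eos = model.token_eos # previously context.token_eos

# special: true tokenizes special/control tokens such as <s> as themselves
# instead of splitting them up as plain text.
tokens = model.tokenize(text: '<s>Hello', add_bos: false, special: true)
```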
@@ -554,21 +561,15 @@ extern "C" {
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                           float   penalty);
-
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(
+    LLAMA_API void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
                const llama_token * last_tokens,
-                          size_t   last_tokens_size,
-                           float   alpha_frequency,
-                           float   alpha_presence);
+                          size_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present);

     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
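For callers the two old sampling calls collapse into one: `penalty_last_n` bounds how far back the penalty window reaches, `penalty_repeat` is the CTRL-style multiplicative penalty, and `penalty_freq`/`penalty_present` are the OpenAI-style additive penalties. A hedged migration sketch against the Ruby bindings (the `lib/llama_cpp.rb` hunk below makes the same change inside `generate`); `candidates` is a prepared `TokenDataArray` and `recent` an array of recent token ids:

```ruby
# before (0.7.1): two separate calls
# context.sample_repetition_penalty(candidates, recent, penalty: 1.1)
# context.sample_frequency_and_presence_penalties(candidates, recent,
#                                                 frequency: 0.0, presence: 0.0)

# after (0.9.0): one call applies all three penalties in a single pass
context.sample_repetition_penalties(candidates, recent,
                                    penalty_repeat: 1.1,
                                    penalty_freq: 0.0,
                                    penalty_present: 0.0)
```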
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.7.1'
+  VERSION = '0.9.0'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1380'
+  LLAMA_CPP_VERSION = 'b1429'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -67,9 +67,9 @@ module LLaMACpp

       # apply penalties
       last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-      context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-      context.sample_frequency_and_presence_penalties(
-        candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+      context.sample_repetition_penalties(
+        candidates, last_n_tokens[-last_n_repeat..],
+        penalty_repeat: repeat_penalty, penalty_freq: frequency, penalty_present: presence
       )

       # temperature sampling
@@ -97,7 +97,7 @@ module LLaMACpp

       embd.each { |token| output << context.model.token_to_piece(token) }

-      break if !embd.empty? && embd[-1] == context.token_eos
+      break if !embd.empty? && embd[-1] == context.model.token_eos
     end

     output.join.scrub('?').strip.delete_prefix(prompt).strip
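The module-level `generate` helper keeps its external shape; only its internals move to the new sampling and model-level token APIs. Typical use is unchanged (model path hypothetical):

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: 'llama-2-7b.Q4_K_M.gguf',
                            params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

puts LLaMACpp.generate(context, 'Hello, World.')
```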
data/sig/llama_cpp.rbs
CHANGED
@@ -78,10 +78,20 @@ module LLaMACpp
     def n_embd: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
-    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
     def desc: () -> String
     def size: () -> Integer
     def n_params: () -> Integer
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
+    def token_prefix: () -> Integer
+    def token_middle: () -> Integer
+    def token_suffix: () -> Integer
+    def token_eot: () -> Integer
   end

   class Timings
@@ -117,7 +127,7 @@ module LLaMACpp
   class Batch
     public

-    def initialize: (n_tokens: Integer, embd: Integer) -> void
+    def initialize: (n_tokens: Integer, embd: Integer, n_seq_max: Integer) -> void
     def n_tokens=: (Integer) -> Integer
     def n_tokens: () -> Integer
     def all_pos_zero=: (Integer) -> Integer
@@ -130,8 +140,8 @@ module LLaMACpp
     def get_token: (Integer) -> Integer
     def set_pos: (Integer, Integer) -> Integer
     def get_pos: (Integer) -> Integer
-    def set_seq_id: (Integer, Integer) -> Integer
-    def get_seq_id: (Integer) -> Integer
+    def set_seq_id: (Integer, Integer, Integer) -> Integer
+    def get_seq_id: (Integer, Integer) -> Integer
     def set_logit: (Integer, bool) -> bool
     def get_logit: (Integer) -> bool
   end
@@ -143,16 +153,6 @@ module LLaMACpp

     def initialize: (model: ::LLaMACpp::Model, params: ::LLaMACpp::ContextParams) -> void
     def embeddings: () -> Array[Float]
-    def text: (Integer) -> String
-    def score: (Integer) -> Float
-    def type: (Integer) -> Integer
-    def token_bos: () -> Integer
-    def token_eos: () -> Integer
-    def token_nl: () -> Integer
-    def token_prefix: () -> Integer
-    def token_middle: () -> Integer
-    def token_suffix: () -> Integer
-    def token_eot: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer) -> void
     def decode: (::LLaMACpp::Batch) -> void
@@ -170,8 +170,7 @@ module LLaMACpp
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
-    def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
-    def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
     def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.7.1
+  version: 0.9.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-10-
+date: 2023-10-28 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -78,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.4.
+rubygems_version: 3.4.20
 signing_key:
 specification_version: 4
 summary: Ruby bindings for the llama.cpp.