llama_cpp 0.0.7 → 0.1.0

data/ext/llama_cpp/src/llama.h CHANGED
@@ -19,9 +19,11 @@
  # define LLAMA_API
  #endif

- #define LLAMA_FILE_VERSION 1
- #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
- #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+ #define LLAMA_FILE_VERSION 1
+ #define LLAMA_FILE_MAGIC 'ggjt'
+ #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+ #define LLAMA_SESSION_MAGIC 'ggsn'
+ #define LLAMA_SESSION_VERSION 1

  #ifdef __cplusplus
  extern "C" {
@@ -39,18 +41,22 @@ extern "C" {

  typedef struct llama_token_data {
      llama_token id; // token id
-
+     float logit; // log-odds of the token
      float p; // probability of the token
-     float plog; // log probability of the token
-
  } llama_token_data;

+ typedef struct llama_token_data_array {
+     llama_token_data * data;
+     size_t size;
+     bool sorted;
+ } llama_token_data_array;
+
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
      int n_ctx; // text context
      int n_parts; // -1 for default
-     int seed; // RNG seed, 0 for random
+     int seed; // RNG seed, -1 for random

      bool f16_kv; // use fp16 for KV cache
      bool logits_all; // the llama_eval() call computes all logits, not just the last one
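
The `logit` field replaces `plog`, and candidates are now handed to the samplers as a `llama_token_data_array`. In the Ruby bindings these map to the new `LLaMACpp::TokenData` and `LLaMACpp::TokenDataArray` classes (see the llama_cpp.rbs changes below). A minimal sketch, not part of the gem, of building a candidate array from the current logits; it assumes `context` is a `LLaMACpp::Context` that has already evaluated a prompt:

    logits = context.logits
    base_candidates = Array.new(context.n_vocab) do |i|
      LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) # p is filled in by the samplers
    end
    candidates = LLaMACpp::TokenDataArray.new(base_candidates)  # the sorted: flag is optional here
    context.sample_softmax(candidates) # per llama.h: sorts by logit and turns logits into probabilities
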
@@ -73,7 +79,7 @@ extern "C" {
      LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
      LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
      LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
-     LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+     // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
      LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
      LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
      LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@@ -116,13 +122,14 @@ extern "C" {
      int n_threads);

  // Returns the number of tokens in the KV cache
- LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

  // Sets the current rng seed.
  LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);

- // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
- LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+ // Returns the maximum size in bytes of the state (rng, logits, embedding
+ // and kv_cache) - will often be smaller after compacting tokens
+ LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);

  // Copies the state to the specified destination address.
  // Destination needs to have allocated enough memory.
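
Both getters now take a `const` context. The Ruby bindings expose the KV-cache count and RNG seeding as `Context#kv_cache_token_count` and `Context#set_rng_seed` (added to llama_cpp.rbs below); a short sketch, assuming an existing `context`:

    context.set_rng_seed(42) # a fixed seed makes the sampling functions reproducible
    puts "tokens in KV cache: #{context.kv_cache_token_count}"
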
@@ -133,6 +140,10 @@ extern "C" {
  // Returns the number of bytes read
  LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);

+ // Save/load session file
+ LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+ LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+
  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
  // n_past is the number of tokens to use from previous eval calls
@@ -156,9 +167,9 @@ extern "C" {
      int n_max_tokens,
      bool add_bos);

- LLAMA_API int llama_n_vocab(struct llama_context * ctx);
- LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
- LLAMA_API int llama_n_embd (struct llama_context * ctx);
+ LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+ LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+ LLAMA_API int llama_n_embd (const struct llama_context * ctx);

  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
@@ -172,21 +183,57 @@ extern "C" {
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

  // Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+ LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

  // Special tokens
  LLAMA_API llama_token llama_token_bos();
  LLAMA_API llama_token llama_token_eos();
+ LLAMA_API llama_token llama_token_nl();
+
+ // Sampling functions
+
+ /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+ LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+ /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+ /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+ LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+ /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+ /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+ /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+ LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+ /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+ LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+ LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+ /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+ /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+ LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+ /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+ LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+ /// @details Selects the token with the highest probability.
+ LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);

- // TODO: improve the last_n_tokens interface ?
- LLAMA_API llama_token llama_sample_top_p_top_k(
-     struct llama_context * ctx,
-     const llama_token * last_n_tokens_data,
-     int last_n_tokens_size,
-     int top_k,
-     float top_p,
-     float temp,
-     float repeat_penalty);
+ /// @details Randomly selects a token from the candidates based on their probabilities.
+ LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

  // Performance information
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
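
The one-shot `llama_sample_top_p_top_k` is gone; callers now build a `llama_token_data_array` and chain whichever samplers they want before drawing a token. The top-k/top-p/temperature chain used by the bundled client appears in the client.rb diff below; the same candidate array also feeds the greedy and Mirostat samplers. A hedged Ruby sketch using the binding signatures from the llama_cpp.rbs diff below, with illustrative `tau`/`eta` values; in a real loop the candidates would be rebuilt after every eval:

    logits = context.logits
    candidates = LLaMACpp::TokenDataArray.new(
      Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
    )

    # Alternative 1 - greedy decoding: always pick the highest-logit token.
    greedy_id = context.sample_token_greedy(candidates)

    # Alternative 2 - Mirostat 2.0: mu starts at 2 * tau and is updated by the
    # sampler on each call, so it must be carried across iterations.
    tau = 5.0
    eta = 0.1
    mu  = 2.0 * tau
    context.sample_temperature(candidates, temperature: 0.8)
    id, mu = context.sample_token_mirostat_v2(candidates, tau: tau, eta: eta, mu: mu)
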
data/lib/llama_cpp/client.rb CHANGED
@@ -2,7 +2,7 @@

  module LLaMACpp
    # Client provides a high-level interface to the LLM model.
-   class Client
+   class Client # rubocop:disable Metrics/ClassLength
      # Creates a new client.
      #
      # @param model_path [String] The path to the model file.
@@ -61,14 +61,19 @@ module LLaMACpp
      # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
      # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
      # @param n_batch [Integer] The batch size.
+     # @param frequency [Float] The frequency penalty value.
+     # @param presence [Float] The presence penalty value.
      # @param top_k [Integer] The top-k value.
      # @param top_p [Float] The top-p value.
+     # @param tfs_z [Float] The tail free sampling parameter.
+     # @param typical_p [Float] The typical probability value.
      # @param temperature [Float] The temperature value.
      # @param repeat_penalty [Float] The repeat penalty value.
      # @return [String]
      # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
      def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
-                     top_k: 40, top_p: 0.95, temperature: 0.80, repeat_penalty: 1.1)
+                     frequency: 0.0, presence: 0.0,
+                     top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
        embd_input = tokenize_prompt(prompt)

        n_ctx = @context.n_ctx
@@ -80,6 +85,7 @@ module LLaMACpp
        n_consumed = 0
        n_past = 0
        n_remain = max_tokens
+       n_vocab = @context.n_vocab
        output = []

        while n_remain != 0
@@ -97,11 +103,28 @@ module LLaMACpp
          embd.clear

          if embd_input.size <= n_consumed
-           start = n_ctx - repeat_last_n
-           id = @context.sample_top_p_top_k(
-             last_n_tokens[start...(start + repeat_last_n)],
-             top_k: top_k, top_p: top_p, temp: temperature, penalty: repeat_penalty
+           logits = @context.logits
+           base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+           candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+           # apply penalties
+           last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+           @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+           @context.sample_frequency_and_presence_penalties(
+             candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
            )
+
+           # temperature sampling
+           @context.sample_top_k(candidates, k: top_k)
+           @context.sample_tail_free(candidates, z: tfs_z)
+           @context.sample_typical(candidates, prob: typical_p)
+           @context.sample_top_p(candidates, prob: top_p)
+           @context.sample_temperature(candidates, temperature: temperature)
+           id = @context.sample_token(candidates)
+
+           last_n_tokens.shift
+           last_n_tokens.push(id)
+
            last_n_tokens.shift
            last_n_tokens.push(id)

data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.0.7'
+   VERSION = '0.1.0'

    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'master-11d9023'
+   LLAMA_CPP_VERSION = 'master-173d0e6'
  end
data/lib/llama_cpp.rb CHANGED
@@ -37,7 +37,16 @@ module LLaMACpp
    n_past = 0
    n_remain = n_predict
    repeat_last_n = 64
+   repeat_penalty = 1.1
+   frequency = 0.0
+   presence = 0.0
+   top_k = 40
+   top_p = 0.95
+   tfs_z = 1.0
+   typical_p = 1.0
+   temperature = 0.8
    n_batch = 512
+   n_vocab = context.n_vocab
    output = []

    while n_remain != 0
@@ -55,10 +64,25 @@ module LLaMACpp
      embd.clear

      if embd_input.size <= n_consumed
-       start = n_ctx - repeat_last_n
-       id = context.sample_top_p_top_k(
-         last_n_tokens[start...(start + repeat_last_n)], top_k: 40, top_p: 0.95, temp: 0.80, penalty: 1.1
+       logits = context.logits
+       base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+       candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+       # apply penalties
+       last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+       context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+       context.sample_frequency_and_presence_penalties(
+         candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
        )
+
+       # temperature sampling
+       context.sample_top_k(candidates, k: top_k)
+       context.sample_tail_free(candidates, z: tfs_z)
+       context.sample_typical(candidates, prob: typical_p)
+       context.sample_top_p(candidates, prob: top_p)
+       context.sample_temperature(candidates, temperature: temperature)
+       id = context.sample_token(candidates)
+
        last_n_tokens.shift
        last_n_tokens.push(id)

data/sig/llama_cpp.rbs CHANGED
@@ -11,7 +11,6 @@ module LLaMACpp
    LLAMA_FTYPE_MOSTLY_Q4_1: Integer
    LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
    LLAMA_FTYPE_MOSTLY_Q4_2: Integer
-   LLAMA_FTYPE_MOSTLY_Q4_3: Integer
    LLAMA_FTYPE_MOSTLY_Q8_0: Integer
    LLAMA_FTYPE_MOSTLY_Q5_0: Integer
    LLAMA_FTYPE_MOSTLY_Q5_1: Integer
@@ -21,9 +20,30 @@ module LLaMACpp
    def self?.print_system_info: () -> void
    def self?.token_bos: () -> Integer
    def self?.token_eos: () -> Integer
+   def self?.token_nl: () -> Integer
    def self?.mmap_supported?: () -> bool
    def self?.mlock_supported?: () -> bool

+   class TokenData
+     public
+
+     def initialize: (id: Integer, logit: Float, p: Float) -> void
+     def id: () -> Integer
+     def id=: (Integer) -> Integer
+     def logit: () -> Float
+     def logit=: (Float) -> Float
+     def p: () -> Float
+     def p=: (Float) -> Float
+   end
+
+   class TokenDataArray
+     public
+
+     def initialize: (Array[::LLaMACpp::TokenData], ?sorted: bool) -> void
+     def size: () -> Integer
+     def sorted: () -> bool
+   end
+
    class Context
      public

@@ -40,10 +60,23 @@ module LLaMACpp
      def n_vocab: () -> Integer
      def print_timings: () -> void
      def reset_timings: () -> void
-     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
      def token_to_str: (Integer) -> String
      def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
      def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+     def kv_cache_token_count: () -> Integer
+     def set_rng_seed: (Integer) -> void
+     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
+     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
+     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
+     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
+     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+     def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
+     def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
+     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
+     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
+     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
    end

    class ContextParams
@@ -76,7 +109,9 @@ module LLaMACpp
                     ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
                     ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
    def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
-                   ?top_k: Integer, ?top_p: Float, ?temperature: Float, ?repeat_penalty: Float) -> String
+                   ?frequency: Float, ?presence: Float,
+                   ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
+                   ?repeat_penalty: Float) -> String
    def embeddings(String) -> Array[Float]
  end
  end
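
With the expanded signature above, a `Client#completions` call can now set the penalty and extra sampling knobs directly. A usage sketch with a placeholder model path and illustrative values; keyword construction follows the Client docs in this gem:

    require 'llama_cpp'

    client = LLaMACpp::Client.new(model_path: '/path/to/model.bin', n_ctx: 512, seed: 42)
    puts client.completions(
      'Hello, my name is',
      max_tokens: 64,
      frequency: 0.1, presence: 0.1,   # OpenAI-style frequency/presence penalties
      top_k: 40, top_p: 0.95,
      tfs_z: 1.0, typical_p: 1.0,      # 1.0 leaves tail-free / typical sampling effectively off
      temperature: 0.8, repeat_penalty: 1.1
    )
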
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
-   version: 0.0.7
+   version: 0.1.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-29 00:00:00.000000000 Z
+ date: 2023-05-20 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -31,9 +31,9 @@ files:
  - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h
- - ext/llama_cpp/src/llama_util.h
  - lib/llama_cpp.rb
  - lib/llama_cpp/client.rb
  - lib/llama_cpp/version.rb