llama_cpp 0.3.3 → 0.3.5

@@ -53,6 +53,10 @@
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
  #endif

+ #ifndef LLAMA_DEFAULT_RMS_EPS
+ #define LLAMA_DEFAULT_RMS_EPS 5e-6f
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -83,12 +87,20 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- uint32_t seed; // RNG seed, -1 for random
- int32_t n_ctx; // text context
- int32_t n_batch; // prompt processing batch size
- int32_t n_gpu_layers; // number of layers to store in VRAM
- int32_t main_gpu; // the GPU that is used for scratch and small tensors
- float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ uint32_t seed; // RNG seed, -1 for random
+ int32_t n_ctx; // text context
+ int32_t n_batch; // prompt processing batch size
+ int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+ float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
+
+ const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+ // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+ float rope_freq_base; // RoPE base frequency
+ float rope_freq_scale; // RoPE frequency scaling factor
+
  // called with a progress value between 0 and 1, pass NULL to disable
  llama_progress_callback progress_callback;
  // context pointer passed to the progress callback
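
A minimal sketch of how the reworked llama_context_params might be filled in; the values are illustrative only, and creating a context from the returned params (e.g. via llama_new_context_with_model) is assumed from the wider llama.cpp API rather than from this diff.

#include <stddef.h>
#include "llama.h"

struct llama_context_params make_params(void) {
    struct llama_context_params params = llama_context_default_params();
    params.n_ctx           = 4096;
    params.n_gqa           = 8;                     // grouped-query attention (e.g. 8 for LLaMA-2 70B)
    params.rms_norm_eps    = LLAMA_DEFAULT_RMS_EPS; // the new 5e-6f default from above
    params.rope_freq_base  = 10000.0f;              // RoPE base frequency
    params.rope_freq_scale = 0.5f;                  // < 1.0 stretches the usable context
    params.tensor_split    = NULL;                  // now a pointer; NULL keeps the default split
    return params;
}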
@@ -134,6 +146,40 @@ extern "C" {
  bool quantize_output_tensor; // quantize output.weight
  } llama_model_quantize_params;

+ // grammar types
+ struct llama_grammar;
+
+ // grammar element type
+ enum llama_gretype {
+ // end of rule definition
+ LLAMA_GRETYPE_END = 0,
+
+ // start of alternate definition for rule
+ LLAMA_GRETYPE_ALT = 1,
+
+ // non-terminal element: reference to rule
+ LLAMA_GRETYPE_RULE_REF = 2,
+
+ // terminal element: character (code point)
+ LLAMA_GRETYPE_CHAR = 3,
+
+ // inverse char(s) ([^a], [^a-b] [^abc])
+ LLAMA_GRETYPE_CHAR_NOT = 4,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+ // be an inclusive range ([a-z])
+ LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or
+ // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+ LLAMA_GRETYPE_CHAR_ALT = 6,
+ };
+
+ typedef struct llama_grammar_element {
+ enum llama_gretype type;
+ uint32_t value; // Unicode code point or rule ID
+ } llama_grammar_element;
+
  // performance timing information
  struct llama_timings {
  double t_start_ms;
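
As a hedged illustration of how these element types compose into a rule, the sketch below encodes the one-rule grammar root ::= "y" | "n". The layout (alternates separated by LLAMA_GRETYPE_ALT, a closing LLAMA_GRETYPE_END) follows the enum comments above, and llama_grammar_init is declared further down in this diff; treat the encoding as illustrative, not canonical.

#include "llama.h"

struct llama_grammar * make_yes_no_grammar(void) {
    static const llama_grammar_element root_rule[] = {
        { LLAMA_GRETYPE_CHAR, 'y' },  // terminal: the code point 'y'
        { LLAMA_GRETYPE_ALT,  0   },  // start of the alternate branch
        { LLAMA_GRETYPE_CHAR, 'n' },
        { LLAMA_GRETYPE_END,  0   },  // end of rule definition
    };
    const llama_grammar_element * rules[] = { root_rule };
    return llama_grammar_init(rules, 1, 0);  // rule 0 is the start rule
}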
@@ -148,6 +194,8 @@ extern "C" {
  int32_t n_eval;
  };

+ LLAMA_API int llama_max_devices();
+
  LLAMA_API struct llama_context_params llama_context_default_params();
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
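
A small sketch of how llama_max_devices() might pair with the pointer-based tensor_split field introduced earlier; the 60/40 split is purely illustrative.

#include <stdlib.h>
#include "llama.h"

float * make_tensor_split(void) {
    const int n = llama_max_devices();                 // length expected by tensor_split
    float * split = (float *) calloc((size_t) n, sizeof(float));
    if (split != NULL && n >= 2) {
        split[0] = 0.6f;                               // fraction of layers on device 0
        split[1] = 0.4f;                               // fraction of layers on device 1
    }
    return split;                                      // assign to params.tensor_split
}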
 
@@ -270,10 +318,21 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);

+ LLAMA_API int llama_tokenize_with_model(
+ const struct llama_model * model,
+ const char * text,
+ llama_token * tokens,
+ int n_max_tokens,
+ bool add_bos);
+
  LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
  LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
  LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+ LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+ LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+ LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
  // Get the vocabulary as output parameters.
  // Returns number of results.
  LLAMA_API int llama_get_vocab(
@@ -282,6 +341,12 @@ extern "C" {
  float * scores,
  int capacity);

+ LLAMA_API int llama_get_vocab_from_model(
+ const struct llama_model * model,
+ const char * * strings,
+ float * scores,
+ int capacity);
+
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
  // Can be mutated in order to change the probabilities of the next token
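
A hedged sketch of the new model-level queries, which work without a llama_context; obtaining the struct llama_model * (e.g. via llama_load_model_from_file) is assumed from the surrounding llama.cpp API of this period, and llama_token_to_str_with_model appears in the next hunk.

#include <stdbool.h>
#include <stdio.h>
#include "llama.h"

void inspect_model(struct llama_model * model) {
    printf("n_vocab = %d, n_ctx = %d, n_embd = %d\n",
           llama_n_vocab_from_model(model),
           llama_n_ctx_from_model(model),
           llama_n_embd_from_model(model));

    llama_token tokens[64];
    const int n = llama_tokenize_with_model(model, "Hello world", tokens, 64, /*add_bos=*/ true);
    for (int i = 0; i < n; i++) {
        // llama_token_to_str_with_model is declared in the following hunk
        printf("%d -> %s\n", tokens[i], llama_token_to_str_with_model(model, tokens[i]));
    }
}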
@@ -294,13 +359,28 @@ extern "C" {
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

  // Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+ LLAMA_API const char * llama_token_to_str(
+ const struct llama_context * ctx,
+ llama_token token);
+
+ LLAMA_API const char * llama_token_to_str_with_model(
+ const struct llama_model * model,
+ llama_token token);

  // Special tokens
  LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
  LLAMA_API llama_token llama_token_eos(); // end-of-sentence
  LLAMA_API llama_token llama_token_nl(); // next-line

+ // Grammar
+ //
+ LLAMA_API struct llama_grammar * llama_grammar_init(
+ const llama_grammar_element ** rules,
+ size_t n_rules,
+ size_t start_rule_index);
+
+ LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
  // Sampling functions

  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -313,13 +393,11 @@ extern "C" {
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
  /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
  /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
- /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
  LLAMA_API void llama_sample_classifier_free_guidance(
  struct llama_context * ctx,
  llama_token_data_array * candidates,
  struct llama_context * guidance_ctx,
- float scale,
- float smooth_factor);
+ float scale);

  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
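
For reference, a sketch of the simplified classifier-free guidance call (smooth_factor is gone, only scale remains); preparing the candidates array and the guidance context that holds the negative prompt is assumed to happen elsewhere.

#include "llama.h"

void apply_cfg(struct llama_context * ctx,
               struct llama_context * guidance_ctx,
               llama_token_data_array * candidates) {
    // scale == 1.0f disables guidance; larger values push the distribution
    // further away from the negative prompt held in guidance_ctx
    llama_sample_classifier_free_guidance(ctx, candidates, guidance_ctx, 1.5f);
}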
@@ -337,6 +415,9 @@ extern "C" {
  LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

+ /// @details Apply constraints from grammar
+ LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -358,6 +439,9 @@ extern "C" {
  /// @details Randomly selects a token from the candidates based on their probabilities.
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

+ /// @details Accepts the sampled token into the grammar
+ LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
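
Putting the grammar calls together, a sketch of one constrained sampling step: filter the candidates with llama_sample_grammar(), pick a token, then feed it back with llama_grammar_accept_token(). The llama_token_data / llama_token_data_array layout and llama_get_logits() come from the pre-existing API and are assumptions here, as is a grammar created earlier with llama_grammar_init().

#include <stdbool.h>
#include <stdlib.h>
#include "llama.h"

llama_token sample_with_grammar(struct llama_context * ctx, struct llama_grammar * grammar) {
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);

    // build a candidate list from the logits of the last evaluated token
    llama_token_data * candidates = (llama_token_data *) malloc(n_vocab * sizeof(llama_token_data));
    for (int i = 0; i < n_vocab; i++) {
        candidates[i] = (llama_token_data) { i, logits[i], 0.0f };
    }
    llama_token_data_array candidates_p = { candidates, (size_t) n_vocab, false };

    llama_sample_grammar(ctx, &candidates_p, grammar);              // drop tokens the grammar forbids
    const llama_token tok = llama_sample_token(ctx, &candidates_p); // sample from what remains
    llama_grammar_accept_token(ctx, grammar, tok);                  // advance the grammar state

    free(candidates);
    return tok;
}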
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.3'
+ VERSION = '0.3.5'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-32c5411'
+ LLAMA_CPP_VERSION = 'master-1a94186'
  end
data/lib/llama_cpp.rb CHANGED
@@ -109,3 +109,4 @@ module LLaMACpp
  end

  LLaMACpp.backend_init
+ at_exit { LLaMACpp.backend_free }
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer

+ LLAMA_GRETYPE_END: Integer
+ LLAMA_GRETYPE_ALT: Integer
+ LLAMA_GRETYPE_RULE_REF: Integer
+ LLAMA_GRETYPE_CHAR: Integer
+ LLAMA_GRETYPE_CHAR_NOT: Integer
+ LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+ LLAMA_GRETYPE_CHAR_ALT: Integer
+
  def self?.backend_init: (?numa: bool) -> void
  def self?.backend_free: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -39,6 +47,7 @@ module LLaMACpp
  def self?.token_nl: () -> Integer
  def self?.mmap_supported?: () -> bool
  def self?.mlock_supported?: () -> bool
+ def self?.max_devices: () -> Integer

  class TokenData
  public
@@ -69,6 +78,12 @@ module LLaMACpp
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+ def n_vocab: () -> Integer
+ def n_ctx: () -> Integer
+ def n_embd: () -> Integer
+ def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+ def token_to_str: (Integer) -> String
+ def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  end

  class Timings
@@ -109,7 +124,7 @@ module LLaMACpp
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
  def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
+ def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -120,6 +135,8 @@ module LLaMACpp
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
  def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+ def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+ def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  end

  class ContextParams
@@ -140,6 +157,10 @@ module LLaMACpp
  def main_gpu: () -> Integer
  def main_gpu=: (Integer) -> Integer
  def tensor_split: () -> Array[Float]
+ def rope_freq_base=: (Float) -> Float
+ def rope_freq_base: () -> Float
+ def rope_freq_scale=: (Float) -> Float
+ def rope_freq_scale: () -> Float
  def low_vram: () -> bool
  def low_vram=: (bool) -> bool
  def seed: () -> Integer
@@ -166,4 +187,18 @@ module LLaMACpp
  end

  class Params = ContextParams
+
+ class GrammarElement
+ public
+
+ def initialize: (?type: Integer, ?value: Integer) -> void
+ def type: () -> Integer
+ def type=: (Integer) -> Integer
+ def value: () -> Integer
+ def value=: (Integer) -> Integer
+ end
+
+ class Grammar
+ def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+ end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.3.3
+ version: 0.3.5
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-07-15 00:00:00.000000000 Z
+ date: 2023-07-29 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: