llama_cpp 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,6 +53,10 @@
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
  #endif

+ #ifndef LLAMA_DEFAULT_RMS_EPS
+ #define LLAMA_DEFAULT_RMS_EPS 5e-6f
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -83,12 +87,20 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- uint32_t seed; // RNG seed, -1 for random
- int32_t n_ctx; // text context
- int32_t n_batch; // prompt processing batch size
- int32_t n_gpu_layers; // number of layers to store in VRAM
- int32_t main_gpu; // the GPU that is used for scratch and small tensors
- float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ uint32_t seed; // RNG seed, -1 for random
+ int32_t n_ctx; // text context
+ int32_t n_batch; // prompt processing batch size
+ int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+ float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
+
+ const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+ // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+ float rope_freq_base; // RoPE base frequency
+ float rope_freq_scale; // RoPE frequency scaling factor
+
  // called with a progress value between 0 and 1, pass NULL to disable
  llama_progress_callback progress_callback;
  // context pointer passed to the progress callback
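
The new fields are exposed on the Ruby side through ContextParams accessors (see the data/sig/llama_cpp.rbs hunk further down). A minimal sketch of configuring the new RoPE options, shown as an illustration rather than a definitive usage pattern:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.seed = 42
    # New in this release: RoPE frequency controls (ref: llama.cpp PR #2054).
    # A rope_freq_scale below 1.0 is the usual way to stretch the context window.
    params.rope_freq_base  = 10000.0
    params.rope_freq_scale = 0.5
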
@@ -134,6 +146,40 @@ extern "C" {
  bool quantize_output_tensor; // quantize output.weight
  } llama_model_quantize_params;

+ // grammar types
+ struct llama_grammar;
+
+ // grammar element type
+ enum llama_gretype {
+ // end of rule definition
+ LLAMA_GRETYPE_END = 0,
+
+ // start of alternate definition for rule
+ LLAMA_GRETYPE_ALT = 1,
+
+ // non-terminal element: reference to rule
+ LLAMA_GRETYPE_RULE_REF = 2,
+
+ // terminal element: character (code point)
+ LLAMA_GRETYPE_CHAR = 3,
+
+ // inverse char(s) ([^a], [^a-b] [^abc])
+ LLAMA_GRETYPE_CHAR_NOT = 4,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+ // be an inclusive range ([a-z])
+ LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or
+ // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+ LLAMA_GRETYPE_CHAR_ALT = 6,
+ };
+
+ typedef struct llama_grammar_element {
+ enum llama_gretype type;
+ uint32_t value; // Unicode code point or rule ID
+ } llama_grammar_element;
+
  // performance timing information
  struct llama_timings {
  double t_start_ms;
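
A grammar rule is a flat sequence of llama_grammar_element values terminated by LLAMA_GRETYPE_END, with LLAMA_GRETYPE_ALT separating alternatives; CHAR elements carry Unicode code points and RULE_REF elements carry rule ids. As an illustrative sketch (not taken from the gem's documentation), the rule root ::= "yes" | "no" could be encoded with the Ruby classes and constants added in this release:

    require 'llama_cpp'
    include LLaMACpp

    # Illustrative encoding of: root ::= "yes" | "no"
    root_rule = [
      GrammarElement.new(type: LLAMA_GRETYPE_CHAR, value: 'y'.ord),
      GrammarElement.new(type: LLAMA_GRETYPE_CHAR, value: 'e'.ord),
      GrammarElement.new(type: LLAMA_GRETYPE_CHAR, value: 's'.ord),
      GrammarElement.new(type: LLAMA_GRETYPE_ALT,  value: 0),
      GrammarElement.new(type: LLAMA_GRETYPE_CHAR, value: 'n'.ord),
      GrammarElement.new(type: LLAMA_GRETYPE_CHAR, value: 'o'.ord),
      GrammarElement.new(type: LLAMA_GRETYPE_END,  value: 0)
    ]
    grammar = Grammar.new(rules: [root_rule], start_rule_index: 0)
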
@@ -148,6 +194,8 @@ extern "C" {
  int32_t n_eval;
  };

+ LLAMA_API int llama_max_devices();
+
  LLAMA_API struct llama_context_params llama_context_default_params();
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
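
llama_max_devices() reports LLAMA_MAX_DEVICES, i.e. how many entries a tensor_split is expected to provide. It is exposed to Ruby as LLaMACpp.max_devices (added to the rbs signatures below); a trivial sketch:

    require 'llama_cpp'

    # Number of devices a tensor_split can address (LLAMA_MAX_DEVICES).
    puts LLaMACpp.max_devices
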
@@ -270,10 +318,21 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);

+ LLAMA_API int llama_tokenize_with_model(
+ const struct llama_model * model,
+ const char * text,
+ llama_token * tokens,
+ int n_max_tokens,
+ bool add_bos);
+
  LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
  LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
  LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+ LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+ LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+ LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
  // Get the vocabulary as output parameters.
  // Returns number of results.
  LLAMA_API int llama_get_vocab(
@@ -282,6 +341,12 @@ extern "C" {
  float * scores,
  int capacity);

+ LLAMA_API int llama_get_vocab_from_model(
+ const struct llama_model * model,
+ const char * * strings,
+ float * scores,
+ int capacity);
+
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
  // Can be mutated in order to change the probabilities of the next token
@@ -294,13 +359,28 @@ extern "C" {
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

  // Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+ LLAMA_API const char * llama_token_to_str(
+ const struct llama_context * ctx,
+ llama_token token);
+
+ LLAMA_API const char * llama_token_to_str_with_model(
+ const struct llama_model * model,
+ llama_token token);

  // Special tokens
  LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
  LLAMA_API llama_token llama_token_eos(); // end-of-sentence
  LLAMA_API llama_token llama_token_nl(); // next-line

+ // Grammar
+ //
+ LLAMA_API struct llama_grammar * llama_grammar_init(
+ const llama_grammar_element ** rules,
+ size_t n_rules,
+ size_t start_rule_index);
+
+ LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
  // Sampling functions

  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -313,13 +393,11 @@ extern "C" {
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
  /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
  /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
- /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
  LLAMA_API void llama_sample_classifier_free_guidance(
  struct llama_context * ctx,
  llama_token_data_array * candidates,
  struct llama_context * guidance_ctx,
- float scale,
- float smooth_factor);
+ float scale);

  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
@@ -337,6 +415,9 @@ extern "C" {
  LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

+ /// @details Apply constraints from grammar
+ LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -358,6 +439,9 @@ extern "C" {
  /// @details Randomly selects a token from the candidates based on their probabilities.
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

+ /// @details Accepts the sampled token into the grammar
+ LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
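
llama_sample_grammar and llama_grammar_accept_token are meant to be used together inside the sampling loop: the first masks out candidates the grammar cannot accept, the second advances the grammar state once a token has been chosen. A rough Ruby sketch of one sampling step, using the Context#sample_grammar and #grammar_accept_token methods from the rbs changes below; it assumes `context` is a Context that has already evaluated a prompt, `candidates` is a TokenDataArray built from its logits, and `grammar` is a Grammar:

    # Hedged sketch of grammar-constrained sampling for one step.
    context.sample_grammar(candidates, grammar: grammar)          # drop tokens the grammar rejects
    token = context.sample_token(candidates)                      # pick from the remaining candidates
    context.grammar_accept_token(grammar: grammar, token: token)  # advance the grammar state
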
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.3'
+ VERSION = '0.3.5'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-32c5411'
+ LLAMA_CPP_VERSION = 'master-1a94186'
  end
data/lib/llama_cpp.rb CHANGED
@@ -109,3 +109,4 @@ module LLaMACpp
  end

  LLaMACpp.backend_init
+ at_exit { LLaMACpp.backend_free }
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer

+ LLAMA_GRETYPE_END: Integer
+ LLAMA_GRETYPE_ALT: Integer
+ LLAMA_GRETYPE_RULE_REF: Integer
+ LLAMA_GRETYPE_CHAR: Integer
+ LLAMA_GRETYPE_CHAR_NOT: Integer
+ LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+ LLAMA_GRETYPE_CHAR_ALT: Integer
+
  def self?.backend_init: (?numa: bool) -> void
  def self?.backend_free: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -39,6 +47,7 @@ module LLaMACpp
  def self?.token_nl: () -> Integer
  def self?.mmap_supported?: () -> bool
  def self?.mlock_supported?: () -> bool
+ def self?.max_devices: () -> Integer

  class TokenData
  public
@@ -69,6 +78,12 @@ module LLaMACpp
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+ def n_vocab: () -> Integer
+ def n_ctx: () -> Integer
+ def n_embd: () -> Integer
+ def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+ def token_to_str: (Integer) -> String
+ def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  end

  class Timings
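
These new Model methods mirror the *_with_model / *_from_model C functions above: they operate on a loaded model directly, without a Context. A rough sketch, assuming Model can be constructed empty and then loaded via the load signature shown in this hunk; the model path is a placeholder:

    require 'llama_cpp'

    model = LLaMACpp::Model.new
    model.load(model_path: '/path/to/model.bin', params: LLaMACpp::ContextParams.new)

    tokens = model.tokenize(text: 'Hello, world!', n_max_tokens: 32, add_bos: true)
    puts tokens.map { |t| model.token_to_str(t) }.join   # round-trip tokens back to text
    puts model.n_vocab                                    # vocabulary size straight from the model
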
@@ -109,7 +124,7 @@ module LLaMACpp
  def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
  def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
- def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
+ def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
  def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
  def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
  def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+ def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+ def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  end

  class ContextParams
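
Note the breaking change to sample_classifier_free_guidance: the smooth_factor keyword is gone in 0.3.5, matching the C API change above. A sketch of the updated call, assuming `context`, `guidance_context`, and `candidates` already exist:

    # 0.3.3 accepted an extra keyword:
    #   context.sample_classifier_free_guidance(candidates, guidance: guidance_context,
    #                                           scale: 2.0, smooth_factor: 0.5)
    # 0.3.5 drops smooth_factor:
    context.sample_classifier_free_guidance(candidates, guidance: guidance_context, scale: 2.0)
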
@@ -140,6 +157,10 @@ module LLaMACpp
  def main_gpu: () -> Integer
  def main_gpu=: (Integer) -> Integer
  def tensor_split: () -> Array[Float]
+ def rope_freq_base=: (Float) -> Float
+ def rope_freq_base: () -> Float
+ def rope_freq_scale=: (Float) -> Float
+ def rope_freq_scale: () -> Float
  def low_vram: () -> bool
  def low_vram=: (bool) -> bool
  def seed: () -> Integer
@@ -166,4 +187,18 @@ module LLaMACpp
  end

  class Params = ContextParams
+
+ class GrammarElement
+ public
+
+ def initialize: (?type: Integer, ?value: Integer) -> void
+ def type: () -> Integer
+ def type=: (Integer) -> Integer
+ def value: () -> Integer
+ def value=: (Integer) -> Integer
+ end
+
+ class Grammar
+ def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+ end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.3.3
+ version: 0.3.5
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-07-15 00:00:00.000000000 Z
+ date: 2023-07-29 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: