llama_cpp 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/llama_cpp/src/llama.h CHANGED
@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,11 +87,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;         // RNG seed, -1 for random
-        int32_t  n_ctx;        // text context
-        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gpu_layers; // number of layers to store in VRAM
-        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
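The two added fields exist mainly for LLaMA-2 70B, which uses grouped-query attention (8 key/value head groups) and a smaller RMS-norm epsilon than earlier models; both are marked TEMP because they will eventually live in the model hyperparameters. A minimal sketch of what this enables from Ruby, assuming the gem mirrors the new C fields as `ContextParams` accessors (those accessors are not shown in this diff) and using an illustrative model path:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.n_ctx        = 2048
params.n_gqa        = 8    # LLaMA-2 70B needs 8 KV groups; other models keep the default of 1
params.rms_norm_eps = 5e-6 # the same value as the LLAMA_DEFAULT_RMS_EPS macro above

# Illustrative path; Model/Context construction is assumed to keep the gem's keyword style.
model   = LLaMACpp::Model.new(model_path: 'llama-2-70b.ggmlv3.q4_K_M.bin', params: params)
context = LLaMACpp::Context.new(model: model)
```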
@@ -140,6 +146,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
+    };
+
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
     // performance timing information
     struct llama_timings {
         double t_start_ms;
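Each grammar rule is a flat array of these elements, read left to right: `LLAMA_GRETYPE_CHAR` elements carry terminal characters as Unicode code points, `LLAMA_GRETYPE_ALT` separates alternate definitions, and `LLAMA_GRETYPE_END` closes the rule. A minimal sketch encoding `root ::= "yes" | "no"` with the Ruby wrapper classes this release adds (their signatures appear in the `.rbs` hunks further down):

```ruby
require 'llama_cpp'

# One terminal element per character, stored as a Unicode code point.
def char_el(ch)
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR, value: ch.ord)
end

# root ::= "yes" | "no"
root_rule = [
  *'yes'.chars.map { |c| char_el(c) },
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_ALT, value: 0), # the "|"
  *'no'.chars.map { |c| char_el(c) },
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_END, value: 0)  # end of rule
]
```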
@@ -332,6 +372,15 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl();  // next-line
 
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+            size_t n_rules,
+            size_t start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
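`llama_grammar_init` takes an array of rules (each itself an array of elements) plus the index of the rule to start matching from; the Ruby binding folds both arguments into `Grammar.new` (see the `.rbs` hunk below), with no explicit counterpart to `llama_grammar_free` since the wrapper object is garbage-collected. Continuing the sketch above:

```ruby
# Rule 0 is the start rule; root_rule comes from the previous sketch.
grammar = LLaMACpp::Grammar.new(rules: [root_rule], start_rule_index: 0)
```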
@@ -366,6 +415,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -387,6 +439,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
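These two additions are the halves of a grammar-constrained sampling loop: `llama_sample_grammar` masks candidates the current grammar state cannot accept before a token is picked, and `llama_grammar_accept_token` then advances that state so the next step is filtered at the right position in the rule. A sketch of the loop through the Ruby bindings, assuming a `context` that has already evaluated `prompt_tokens`; `build_candidates` stands in for the gem's existing logits-to-`TokenDataArray` plumbing, and `LLaMACpp.token_eos` is assumed to expose `llama_token_eos` (neither is part of this diff):

```ruby
n_past = prompt_tokens.size # prompt already evaluated, as assumed above
loop do
  candidates = build_candidates(context)               # hypothetical helper over context.logits
  context.sample_grammar(candidates, grammar: grammar) # drop tokens the grammar cannot accept here
  token = context.sample_token(candidates)             # then sample from what remains
  break if token == LLaMACpp.token_eos                 # stop at end-of-sentence

  context.grammar_accept_token(grammar: grammar, token: token) # advance the grammar state
  context.eval(tokens: [token], n_past: n_past)
  n_past += 1
end
```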
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.4'
+  VERSION = '0.3.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-d924522'
+  LLAMA_CPP_VERSION = 'master-1a94186'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
+  LLAMA_GRETYPE_END: Integer
+  LLAMA_GRETYPE_ALT: Integer
+  LLAMA_GRETYPE_RULE_REF: Integer
+  LLAMA_GRETYPE_CHAR: Integer
+  LLAMA_GRETYPE_CHAR_NOT: Integer
+  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+  LLAMA_GRETYPE_CHAR_ALT: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -127,6 +135,8 @@ module LLaMACpp
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
   end
 
   class ContextParams
@@ -177,4 +187,18 @@ module LLaMACpp
   end
 
   class Params = ContextParams
+
+  class GrammarElement
+    public
+
+    def initialize: (?type: Integer, ?value: Integer) -> void
+    def type: () -> Integer
+    def type=: (Integer) -> Integer
+    def value: () -> Integer
+    def value=: (Integer) -> Integer
+  end
+
+  class Grammar
+    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+  end
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-22 00:00:00.000000000 Z
+date: 2023-07-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: