llama_cpp 0.3.4 → 0.3.5

@@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,11 +87,13 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        uint32_t seed;         // RNG seed, -1 for random
-        int32_t  n_ctx;        // text context
-        int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gpu_layers; // number of layers to store in VRAM
-        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
+        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
         const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
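The two new fields are inserted between n_batch and n_gpu_layers, so anything that fills llama_context_params by field order must be updated. As a hedged sketch of how this surfaces in the gem (the ContextParams accessors below are assumed to mirror the C struct fields; they are not part of this diff excerpt), a LLaMA-2 70B model at this llama.cpp revision needs n_gqa set to 8:

require 'llama_cpp'

# Hypothetical accessors mirroring the new C struct fields.
params = LLaMACpp::ContextParams.new
params.n_gqa        = 8     # grouped-query attention factor; LLaMA-2 70B uses 8
params.rms_norm_eps = 5e-6  # same value as LLAMA_DEFAULT_RMS_EPS (5e-6f)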
@@ -140,6 +146,40 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT      = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
+    };
+
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
+
     // performance timing information
     struct llama_timings {
         double t_start_ms;
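Each rule is serialized as one flat array of these elements: the elements of the first alternate, a LLAMA_GRETYPE_ALT separator before each further alternate, and a closing LLAMA_GRETYPE_END. A minimal sketch using the gem's GrammarElement wrapper (declared in the RBS changes further down), encoding the single rule root ::= "hi":

# root ::= "hi" as a flat element array; value carries the code point.
rule_root = [
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR, value: 'h'.ord),
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR, value: 'i'.ord),
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_END,  value: 0),
]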
@@ -332,6 +372,15 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(); // end-of-sentence
     LLAMA_API llama_token llama_token_nl();  // next-line
 
+    // Grammar
+    //
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
     // Sampling functions
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -366,6 +415,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -387,6 +439,9 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
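Together the two new calls form the usual constrained-decoding step: llama_sample_grammar removes candidates the grammar cannot accept, and llama_grammar_accept_token advances the grammar state once a token is chosen. A hedged Ruby sketch of one decoding step, using the binding methods from the RBS changes below; `context`, `grammar`, and `n_past` are assumed to be set up elsewhere, and the TokenData/TokenDataArray construction follows the gem's existing API:

# Build the candidate set from the logits of the last evaluated token.
candidates = LLaMACpp::TokenDataArray.new(
  context.logits.each_with_index.map do |logit, id|
    LLaMACpp::TokenData.new(id: id, logit: logit, p: 0.0)
  end
)

context.sample_grammar(candidates, grammar: grammar)         # mask tokens the grammar forbids
token = context.sample_token(candidates)                     # sample from what remains
context.grammar_accept_token(grammar: grammar, token: token) # advance the grammar state
context.eval(tokens: [token], n_past: n_past)                # feed the token back in (assumed API)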
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.4'
+  VERSION = '0.3.5'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-d924522'
+  LLAMA_CPP_VERSION = 'master-1a94186'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
+  LLAMA_GRETYPE_END: Integer
+  LLAMA_GRETYPE_ALT: Integer
+  LLAMA_GRETYPE_RULE_REF: Integer
+  LLAMA_GRETYPE_CHAR: Integer
+  LLAMA_GRETYPE_CHAR_NOT: Integer
+  LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+  LLAMA_GRETYPE_CHAR_ALT: Integer
+
   def self?.backend_init: (?numa: bool) -> void
   def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -127,6 +135,8 @@ module LLaMACpp
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
     def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+    def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+    def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
   end
 
   class ContextParams
@@ -177,4 +187,18 @@ module LLaMACpp
   end
 
   class Params = ContextParams
+
+  class GrammarElement
+    public
+
+    def initialize: (?type: Integer, ?value: Integer) -> void
+    def type: () -> Integer
+    def type=: (Integer) -> Integer
+    def value: () -> Integer
+    def value=: (Integer) -> Integer
+  end
+
+  class Grammar
+    def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+  end
 end
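Grammar.new mirrors llama_grammar_init: rules is an array of per-rule element arrays and start_rule_index picks the root rule. A minimal hedged sketch of a grammar that accepts only "yes" or "no", with LLAMA_GRETYPE_ALT separating the two alternates of the single rule:

# root ::= "yes" | "no"
ch  = ->(c) { LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR, value: c.ord) }
alt = LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_ALT, value: 0)
fin = LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_END, value: 0)

root    = [ch['y'], ch['e'], ch['s'], alt, ch['n'], ch['o'], fin]
grammar = LLaMACpp::Grammar.new(rules: [root], start_rule_index: 0)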
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-22 00:00:00.000000000 Z
+date: 2023-07-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: