llama_cpp 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,6 +53,10 @@
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
  #endif

+ #ifndef LLAMA_DEFAULT_RMS_EPS
+ #define LLAMA_DEFAULT_RMS_EPS 5e-6f
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -83,11 +87,13 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- uint32_t seed; // RNG seed, -1 for random
- int32_t n_ctx; // text context
- int32_t n_batch; // prompt processing batch size
- int32_t n_gpu_layers; // number of layers to store in VRAM
- int32_t main_gpu; // the GPU that is used for scratch and small tensors
+ uint32_t seed; // RNG seed, -1 for random
+ int32_t n_ctx; // text context
+ int32_t n_batch; // prompt processing batch size
+ int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+ float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors

  const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

@@ -102,6 +108,7 @@ extern "C" {

  // Keep the booleans together to avoid misalignment during copy-by-value.
  bool low_vram; // if true, reduce VRAM usage at the cost of performance
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
  bool vocab_only; // only load the vocabulary, no weights
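
The new n_gqa and rms_norm_eps fields are temporary knobs for grouped-query attention models until they move into the model hyperparameters, and mul_mat_q opts into the experimental quantized matmul kernels. A minimal sketch of how a caller might set them, assuming the library's existing llama_context_default_params() helper (not part of this diff); the n_gqa value of 8 is the setting commonly used for LLaMA-2 70B and is illustrative only:

    // Sketch only: populate the fields added in this release, starting
    // from the library's default parameters.
    struct llama_context_params params = llama_context_default_params();
    params.n_gqa        = 8;                      // grouped-query attention factor (temporary knob)
    params.rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;  // default defined at the top of llama.h
    params.mul_mat_q    = true;                   // enable the experimental mul_mat_q kernels
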
@@ -140,6 +147,40 @@ extern "C" {
  bool quantize_output_tensor; // quantize output.weight
  } llama_model_quantize_params;

+ // grammar types
+ struct llama_grammar;
+
+ // grammar element type
+ enum llama_gretype {
+ // end of rule definition
+ LLAMA_GRETYPE_END = 0,
+
+ // start of alternate definition for rule
+ LLAMA_GRETYPE_ALT = 1,
+
+ // non-terminal element: reference to rule
+ LLAMA_GRETYPE_RULE_REF = 2,
+
+ // terminal element: character (code point)
+ LLAMA_GRETYPE_CHAR = 3,
+
+ // inverse char(s) ([^a], [^a-b] [^abc])
+ LLAMA_GRETYPE_CHAR_NOT = 4,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+ // be an inclusive range ([a-z])
+ LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or
+ // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+ LLAMA_GRETYPE_CHAR_ALT = 6,
+ };
+
+ typedef struct llama_grammar_element {
+ enum llama_gretype type;
+ uint32_t value; // Unicode code point or rule ID
+ } llama_grammar_element;
+
  // performance timing information
  struct llama_timings {
  double t_start_ms;
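
A grammar is handed to the library in a pre-parsed form: each rule is a flat array of llama_grammar_element values, alternates within a rule are separated by LLAMA_GRETYPE_ALT, and every rule is closed by LLAMA_GRETYPE_END, following the element-type comments above. A hedged sketch (not part of the diff) encoding the rule root ::= "yes" | "no":

    // Sketch: one rule encoded as a flat element array. Characters are
    // stored as Unicode code points in `value`; LLAMA_GRETYPE_ALT separates
    // the two alternates and LLAMA_GRETYPE_END closes the rule.
    static const llama_grammar_element root_rule[] = {
        {LLAMA_GRETYPE_CHAR, 'y'}, {LLAMA_GRETYPE_CHAR, 'e'}, {LLAMA_GRETYPE_CHAR, 's'},
        {LLAMA_GRETYPE_ALT,  0},
        {LLAMA_GRETYPE_CHAR, 'n'}, {LLAMA_GRETYPE_CHAR, 'o'},
        {LLAMA_GRETYPE_END,  0},
    };
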
@@ -332,6 +373,15 @@ extern "C" {
  LLAMA_API llama_token llama_token_eos(); // end-of-sentence
  LLAMA_API llama_token llama_token_nl(); // next-line

+ // Grammar
+ //
+ LLAMA_API struct llama_grammar * llama_grammar_init(
+ const llama_grammar_element ** rules,
+ size_t n_rules,
+ size_t start_rule_index);
+
+ LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
  // Sampling functions

  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
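
llama_grammar_init takes an array of rule pointers plus the index of the start rule, and the returned object must be released with llama_grammar_free. A minimal sketch reusing the hypothetical root_rule array from the previous example:

    // Sketch: build a grammar whose only rule is root_rule (rule index 0).
    const llama_grammar_element * rules[] = { root_rule };
    struct llama_grammar * grammar = llama_grammar_init(rules, /*n_rules=*/1, /*start_rule_index=*/0);
    // ... use the grammar during sampling ...
    llama_grammar_free(grammar);
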
@@ -366,6 +416,9 @@ extern "C" {
  LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

+ /// @details Apply constraints from grammar
+ LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -387,6 +440,9 @@ extern "C" {
  /// @details Randomly selects a token from the candidates based on their probabilities.
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

+ /// @details Accepts the sampled token into the grammar
+ LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
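
Together with llama_sample_grammar above, this completes the loop: constrain the candidate list with the grammar before picking a token, then feed the chosen token back so the grammar's internal state advances. A hedged per-step sketch, assuming an existing ctx and grammar and the library's existing llama_n_vocab(), llama_get_logits() and llama_token_data_array (none of which appear in this diff):

    // Sketch of one sampling step under a grammar; only the two grammar
    // calls are new in this release.
    const int n_vocab = llama_n_vocab(ctx);
    const float * logits = llama_get_logits(ctx);

    llama_token_data * data = malloc(n_vocab * sizeof(llama_token_data));
    for (int i = 0; i < n_vocab; i++) {
        data[i] = (llama_token_data){ i, logits[i], 0.0f };
    }
    llama_token_data_array candidates = { data, (size_t) n_vocab, false };

    llama_sample_grammar(ctx, &candidates, grammar);          // mask tokens the grammar forbids
    llama_token token = llama_sample_token(ctx, &candidates); // pick from what remains
    llama_grammar_accept_token(ctx, grammar, token);          // advance the grammar state
    free(data);
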
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.4'
+ VERSION = '0.3.6'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-d924522'
+ LLAMA_CPP_VERSION = 'master-468ea24'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer

+ LLAMA_GRETYPE_END: Integer
+ LLAMA_GRETYPE_ALT: Integer
+ LLAMA_GRETYPE_RULE_REF: Integer
+ LLAMA_GRETYPE_CHAR: Integer
+ LLAMA_GRETYPE_CHAR_NOT: Integer
+ LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+ LLAMA_GRETYPE_CHAR_ALT: Integer
+
  def self?.backend_init: (?numa: bool) -> void
  def self?.backend_free: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -127,6 +135,8 @@ module LLaMACpp
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
  def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+ def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+ def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  end

  class ContextParams
@@ -153,6 +163,8 @@ module LLaMACpp
  def rope_freq_scale: () -> Float
  def low_vram: () -> bool
  def low_vram=: (bool) -> bool
+ def mul_mat_q: () -> bool
+ def mul_mat_q=: (bool) -> bool
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
@@ -177,4 +189,18 @@ module LLaMACpp
  end

  class Params = ContextParams
+
+ class GrammarElement
+ public
+
+ def initialize: (?type: Integer, ?value: Integer) -> void
+ def type: () -> Integer
+ def type=: (Integer) -> Integer
+ def value: () -> Integer
+ def value=: (Integer) -> Integer
+ end
+
+ class Grammar
+ def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+ end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.3.4
+ version: 0.3.6
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-07-22 00:00:00.000000000 Z
+ date: 2023-08-04 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -30,6 +30,8 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-alloc.c
+ - ext/llama_cpp/src/ggml-alloc.h
  - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
  - ext/llama_cpp/src/ggml-metal.h