llama_cpp 0.3.4 → 0.3.6

data/ext/llama_cpp/src/llama.h CHANGED
@@ -53,6 +53,10 @@
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
  #endif
 
+ #ifndef LLAMA_DEFAULT_RMS_EPS
+ #define LLAMA_DEFAULT_RMS_EPS 5e-6f
+ #endif
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -83,11 +87,13 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);
 
  struct llama_context_params {
- uint32_t seed; // RNG seed, -1 for random
- int32_t n_ctx; // text context
- int32_t n_batch; // prompt processing batch size
- int32_t n_gpu_layers; // number of layers to store in VRAM
- int32_t main_gpu; // the GPU that is used for scratch and small tensors
+ uint32_t seed; // RNG seed, -1 for random
+ int32_t n_ctx; // text context
+ int32_t n_batch; // prompt processing batch size
+ int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+ float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
 
  const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 
@@ -102,6 +108,7 @@ extern "C" {
 
  // Keep the booleans together to avoid misalignment during copy-by-value.
  bool low_vram; // if true, reduce VRAM usage at the cost of performance
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
  bool vocab_only; // only load the vocabulary, no weights
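Taken together, these header additions wire up LLaMA-2 70B support (grouped-query attention via `n_gqa`, and an RMS-norm epsilon defaulting to `LLAMA_DEFAULT_RMS_EPS = 5e-6f`) plus an opt-in flag for the experimental mul_mat_q kernels. Of these, the sig changes below expose `mul_mat_q` on `ContextParams`; a minimal sketch, assuming the usual no-argument constructor:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.mul_mat_q = true # opt in to the experimental mul_mat_q kernels
params.seed = 42        # other fields are set the same way
```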
@@ -140,6 +147,40 @@ extern "C" {
  bool quantize_output_tensor; // quantize output.weight
  } llama_model_quantize_params;
 
+ // grammar types
+ struct llama_grammar;
+
+ // grammar element type
+ enum llama_gretype {
+ // end of rule definition
+ LLAMA_GRETYPE_END = 0,
+
+ // start of alternate definition for rule
+ LLAMA_GRETYPE_ALT = 1,
+
+ // non-terminal element: reference to rule
+ LLAMA_GRETYPE_RULE_REF = 2,
+
+ // terminal element: character (code point)
+ LLAMA_GRETYPE_CHAR = 3,
+
+ // inverse char(s) ([^a], [^a-b] [^abc])
+ LLAMA_GRETYPE_CHAR_NOT = 4,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+ // be an inclusive range ([a-z])
+ LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or
+ // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+ LLAMA_GRETYPE_CHAR_ALT = 6,
+ };
+
+ typedef struct llama_grammar_element {
+ enum llama_gretype type;
+ uint32_t value; // Unicode code point or rule ID
+ } llama_grammar_element;
+
  // performance timing information
  struct llama_timings {
  double t_start_ms;
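A rule is a flat sequence of elements closed by `LLAMA_GRETYPE_END`, with `CHAR_RNG_UPPER` and `CHAR_ALT` modifying the element that precedes them. A minimal sketch using the Ruby wrappers declared in the sig changes below, encoding the rule `root ::= [a-z]`:

```ruby
# root ::= [a-z] — one rule, three elements
root_rule = [
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR,           value: 'a'.ord), # terminal: range start
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_CHAR_RNG_UPPER, value: 'z'.ord), # widens the CHAR to [a-z]
  LLaMACpp::GrammarElement.new(type: LLaMACpp::LLAMA_GRETYPE_END,            value: 0)        # closes the rule
]
```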
@@ -332,6 +373,15 @@ extern "C" {
  LLAMA_API llama_token llama_token_eos(); // end-of-sentence
  LLAMA_API llama_token llama_token_nl(); // next-line
 
+ // Grammar
+ //
+ LLAMA_API struct llama_grammar * llama_grammar_init(
+ const llama_grammar_element ** rules,
+ size_t n_rules,
+ size_t start_rule_index);
+
+ LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
  // Sampling functions
 
  /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
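On the Ruby side this pair is wrapped by `LLaMACpp::Grammar.new` (declared in the sig changes below); there is no explicit free, the underlying `llama_grammar` presumably being released when the object is garbage-collected. Continuing the sketch above:

```ruby
grammar = LLaMACpp::Grammar.new(rules: [root_rule], start_rule_index: 0)
```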
@@ -366,6 +416,9 @@ extern "C" {
  LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+ /// @details Apply constraints from grammar
+ LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
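Note the intended ordering: `llama_sample_grammar` is applied to the candidate array before a token is picked, removing candidates the grammar cannot accept, and it is paired with `llama_grammar_accept_token` once a token has actually been sampled (see the sketch after that declaration below).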
@@ -387,6 +440,9 @@ extern "C" {
  /// @details Randomly selects a token from the candidates based on their probabilities.
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+ /// @details Accepts the sampled token into the grammar
+ LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
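Together with `llama_sample_grammar` above, a per-step sampling loop through the Ruby bindings looks roughly like this (method names and keyword arguments are taken from the sig changes below; `context` and `candidates` are assumed to be an existing `LLaMACpp::Context` and a `TokenDataArray` built from its logits):

```ruby
context.sample_grammar(candidates, grammar: grammar)         # drop tokens the grammar forbids
token = context.sample_token(candidates)                     # sample from what remains
context.grammar_accept_token(grammar: grammar, token: token) # advance the grammar state
```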
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.4'
+ VERSION = '0.3.6'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-d924522'
+ LLAMA_CPP_VERSION = 'master-468ea24'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -26,6 +26,14 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
  LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
+ LLAMA_GRETYPE_END: Integer
+ LLAMA_GRETYPE_ALT: Integer
+ LLAMA_GRETYPE_RULE_REF: Integer
+ LLAMA_GRETYPE_CHAR: Integer
+ LLAMA_GRETYPE_CHAR_NOT: Integer
+ LLAMA_GRETYPE_CHAR_RNG_UPPER: Integer
+ LLAMA_GRETYPE_CHAR_ALT: Integer
+
  def self?.backend_init: (?numa: bool) -> void
  def self?.backend_free: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
@@ -127,6 +135,8 @@ module LLaMACpp
  def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
  def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
  def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
+ def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
+ def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  end
 
  class ContextParams
@@ -153,6 +163,8 @@ module LLaMACpp
  def rope_freq_scale: () -> Float
  def low_vram: () -> bool
  def low_vram=: (bool) -> bool
+ def mul_mat_q: () -> bool
+ def mul_mat_q=: (bool) -> bool
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
@@ -177,4 +189,18 @@ module LLaMACpp
  end
 
  class Params = ContextParams
+
+ class GrammarElement
+ public
+
+ def initialize: (?type: Integer, ?value: Integer) -> void
+ def type: () -> Integer
+ def type=: (Integer) -> Integer
+ def value: () -> Integer
+ def value=: (Integer) -> Integer
+ end
+
+ class Grammar
+ def initialize: (rules: Array[Array[LLaMACpp::GrammarElement]], start_rule_index: Integer) -> void
+ end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.3.4
+ version: 0.3.6
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-07-22 00:00:00.000000000 Z
+ date: 2023-08-04 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -30,6 +30,8 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-alloc.c
+ - ext/llama_cpp/src/ggml-alloc.h
  - ext/llama_cpp/src/ggml-cuda.cu
  - ext/llama_cpp/src/ggml-cuda.h
  - ext/llama_cpp/src/ggml-metal.h