llama_cpp 0.3.2 → 0.3.4

@@ -88,7 +88,13 @@ extern "C" {
     int32_t n_batch;     // prompt processing batch size
     int32_t n_gpu_layers; // number of layers to store in VRAM
     int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-    float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+    const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+    // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+    float rope_freq_base;  // RoPE base frequency
+    float rope_freq_scale; // RoPE frequency scaling factor
+
     // called with a progress value between 0 and 1, pass NULL to disable
     llama_progress_callback progress_callback;
     // context pointer passed to the progress callback
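The new RoPE fields feed the frequency scaling introduced in the referenced pull request for extended-context models. On the Ruby side they are exposed as accessors on ContextParams (see the llama_cpp.rbs changes below). A minimal sketch; llama.cpp's defaults of base 10000.0 and scale 1.0 are assumed, and halving the scale is the usual linear-scaling setting for roughly doubling a model's trained context window:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new

# Defaults are base 10000.0 and scale 1.0 (no scaling).
# Linear RoPE scaling: halving the scale stretches a model trained on a
# 2048-token window over roughly twice that context length.
params.rope_freq_base  = 10000.0
params.rope_freq_scale = 0.5
```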
@@ -148,6 +154,8 @@ extern "C" {
         int32_t n_eval;
     };

+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -158,7 +166,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend(bool numa);
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();

     LLAMA_API int64_t llama_time_us();

@@ -268,10 +278,21 @@ extern "C" {
             int   n_max_tokens,
             bool  add_bos);

+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -280,6 +301,12 @@ extern "C" {
           float * scores,
             int   capacity);

+    LLAMA_API int llama_get_vocab_from_model(
+        const struct llama_model * model,
+                      const char * * strings,
+                            float * scores,
+                              int   capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
@@ -292,7 +319,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+        const struct llama_context * ctx,
+                         llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+        const struct llama_model * model,
+                         llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
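The *_with_model / *_from_model functions above let callers inspect and tokenize with a llama_model before, or without, creating a context. In the Ruby bindings they surface as instance methods on Model (see the llama_cpp.rbs additions below). A rough sketch; Model.new(model_path:, params:) is an assumption based on earlier releases and the model path is a placeholder:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model  = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)

# Counterparts of llama_n_vocab_from_model / llama_n_ctx_from_model / llama_n_embd_from_model.
puts "vocab: #{model.n_vocab}, ctx: #{model.n_ctx}, embd: #{model.n_embd}"

# Tokenize and map ids back to strings without a Context
# (presumably backed by llama_tokenize_with_model / llama_token_to_str_with_model).
tokens = model.tokenize(text: 'Hello, World.', add_bos: true)
puts tokens.map { |id| model.token_to_str(id) }.join
```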
@@ -307,6 +340,16 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);

+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+              struct llama_context * ctx,
+            llama_token_data_array * candidates,
+              struct llama_context * guidance_ctx,
+                              float   scale);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.2'
+  VERSION = '0.3.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-481f793'
+  LLAMA_CPP_VERSION = 'master-d924522'
 end
data/lib/llama_cpp.rb CHANGED
@@ -108,4 +108,5 @@ module LLaMACpp
   end
 end

-LLaMACpp.init_backend
+LLaMACpp.backend_init
+at_exit { LLaMACpp.backend_free }
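As the hunk above shows, requiring the gem now initializes the backend under its new name and registers backend_free via at_exit, so most applications only need to drop any explicit init_backend call. A small sketch of the renamed module functions and the new device helper, based on the llama_cpp.rbs signatures below:

```ruby
require 'llama_cpp'  # 0.3.4 already calls LLaMACpp.backend_init and registers backend_free via at_exit

# Renamed module functions (old name on the left):
#   LLaMACpp.init_backend -> LLaMACpp.backend_init(numa: false)
# Call backend_init yourself only if you manage the lifecycle explicitly,
# for example to pass numa: true.

# New helper wrapping llama_max_devices().
puts LLaMACpp.max_devices

# Currently only meaningful for MPI builds; the gem already runs this at exit.
LLaMACpp.backend_free
```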
data/sig/llama_cpp.rbs CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer

-  def self?.init_backend: (?numa: bool) -> void
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -38,6 +39,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer

   class TokenData
     public
@@ -68,6 +70,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end

   class Timings
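Besides tokenization, Model#vocab above wraps the new llama_get_vocab_from_model and returns token pieces alongside their scores. A short sketch under the same assumptions as the earlier example (Model.new(model_path:, params:) and the path are placeholders):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model  = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)

# Token pieces and scores, up to `capacity` entries.
strings, scores = model.vocab(capacity: 10)
strings.zip(scores).each { |piece, score| puts "#{piece.inspect}\t#{score}" }
```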
@@ -108,6 +116,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
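Context#sample_classifier_free_guidance above binds the new llama_sample_classifier_free_guidance from the header. A hedged sketch of how it might slot into a sampling loop; the Model.new / Context.new constructors and the evaluation steps are assumptions, and `candidates` stands for a TokenDataArray built from the main context's unsorted logits:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model  = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)

context          = LLaMACpp::Context.new(model: model) # main generation context
guidance_context = LLaMACpp::Context.new(model: model) # primed with the negative prompt

# ... evaluate the prompt in `context`, the negative prompt plus the same
# generated tokens in `guidance_context`, and build `candidates` from the
# main context's logits (unsorted) ...

# scale = 1.0 disables guidance; larger values steer harder away from the
# negative prompt.
context.sample_classifier_free_guidance(candidates, guidance: guidance_context, scale: 1.5)
context.sample_top_k(candidates, k: 40)
```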
@@ -138,6 +147,10 @@ module LLaMACpp
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
+    def rope_freq_base=: (Float) -> Float
+    def rope_freq_base: () -> Float
+    def rope_freq_scale=: (Float) -> Float
+    def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
     def seed: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 0.3.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-08 00:00:00.000000000 Z
+date: 2023-07-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -35,6 +35,8 @@ files:
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
+- ext/llama_cpp/src/ggml-mpi.c
+- ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c