llama_cpp 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/llama_cpp/src/llama.h CHANGED
@@ -88,7 +88,13 @@ extern "C" {
         int32_t n_batch;      // prompt processing batch size
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
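This hunk swaps the fixed-size tensor_split array for a pointer and adds two RoPE tuning fields to llama_context_params (see the linked PR). In the Ruby bindings they surface as ContextParams accessors (rope_freq_base / rope_freq_scale, typed in the llama_cpp.rbs hunks below). A minimal sketch, with illustrative values only:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.rope_freq_base  = 10000.0 # the conventional LLaMA base frequency; model-dependent
    params.rope_freq_scale = 0.5     # illustrative: linear scaling of 0.5 roughly doubles the usable context
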
@@ -148,6 +154,8 @@ extern "C" {
         int32_t n_eval;
     };
 
+    LLAMA_API int llama_max_devices();
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
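llama_max_devices() exposes the compile-time LLAMA_MAX_DEVICES constant, which is also the expected length of the tensor_split array above. Through the new max_devices module function (typed in llama_cpp.rbs below), a quick check might look like:

    # 1 on CPU-only builds; larger when llama.cpp is compiled with multi-GPU support.
    puts LLaMACpp.max_devices
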
@@ -158,7 +166,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend(bool numa);
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();
 
     LLAMA_API int64_t llama_time_us();
 
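llama_init_backend is renamed to llama_backend_init, and a matching llama_backend_free is added for teardown (per the header comment, currently only meaningful for MPI builds). The gem mirrors the rename as backend_init / backend_free and now wires both up automatically (see the data/lib/llama_cpp.rb hunk below); managed by hand, the lifecycle would look roughly like:

    require 'llama_cpp'

    LLaMACpp.backend_init(numa: false) # once, at program start
    begin
      # ... load a model and run inference ...
    ensure
      LLaMACpp.backend_free # once, at shutdown
    end
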
@@ -268,10 +278,21 @@ extern "C" {
         int n_max_tokens,
         bool add_bos);
 
+    LLAMA_API int llama_tokenize_with_model(
+        const struct llama_model * model,
+        const char * text,
+        llama_token * tokens,
+        int n_max_tokens,
+        bool add_bos);
+
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -280,6 +301,12 @@ extern "C" {
         float * scores,
         int capacity);
 
+    LLAMA_API int llama_get_vocab_from_model(
+        const struct llama_model * model,
+        const char * * strings,
+        float * scores,
+        int capacity);
+
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
     // Can be mutated in order to change the probabilities of the next token
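The *_with_model / *_from_model variants let callers tokenize text and query model metadata from a bare llama_model, without allocating a llama_context first. On the Ruby side these surface as instance methods on Model (see the Model class additions in llama_cpp.rbs below). A sketch, assuming a Model built from a local GGML file via Model.new(model_path:, params:), whose constructor is not part of this diff:

    model = LLaMACpp::Model.new(model_path: 'path/to/model.bin', params: LLaMACpp::ContextParams.new)

    model.n_vocab                                          # vocabulary size, no context required
    tokens = model.tokenize(text: 'Hello', add_bos: true)  # Array[Integer]
    strings, scores = model.vocab(capacity: model.n_vocab) # vocabulary strings and scores
    puts model.token_to_str(tokens.first)
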
@@ -292,7 +319,13 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(
+        const struct llama_context * ctx,
+        llama_token token);
+
+    LLAMA_API const char * llama_token_to_str_with_model(
+        const struct llama_model * model,
+        llama_token token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
@@ -307,6 +340,16 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
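Classifier-free guidance contrasts the main context's logits with those of a second context primed with a negative prompt, steering sampling toward the main prompt. Mapped onto the Ruby API (Context#sample_classifier_free_guidance, typed in llama_cpp.rbs below), a sketch under the assumption that `context` and `guidance` are two Context objects over the same model and `candidates` is a TokenDataArray built from `context`'s unsorted logits:

    # `guidance` was fed the negative prompt plus a copy of every token
    # evaluated in the main context, as the header comment above prescribes.
    context.sample_classifier_free_guidance(candidates, guidance: guidance, scale: 1.5)
    # scale 1.0 disables guidance; larger values steer harder.
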
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.2'
+  VERSION = '0.3.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-481f793'
+  LLAMA_CPP_VERSION = 'master-d924522'
 end
data/lib/llama_cpp.rb CHANGED
@@ -108,4 +108,5 @@ module LLaMACpp
   end
 end
 
-LLaMACpp.init_backend
+LLaMACpp.backend_init
+at_exit { LLaMACpp.backend_free }
data/sig/llama_cpp.rbs CHANGED
@@ -26,7 +26,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
-  def self?.init_backend: (?numa: bool) -> void
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String,
                        ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
@@ -38,6 +39,7 @@ module LLaMACpp
   def self?.token_nl: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
+  def self?.max_devices: () -> Integer
 
   class TokenData
     public
@@ -68,6 +70,12 @@ module LLaMACpp
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+    def n_vocab: () -> Integer
+    def n_ctx: () -> Integer
+    def n_embd: () -> Integer
+    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def token_to_str: (Integer) -> String
+    def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   end
 
   class Timings
@@ -108,6 +116,7 @@
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
@@ -138,6 +147,10 @@
     def main_gpu: () -> Integer
     def main_gpu=: (Integer) -> Integer
     def tensor_split: () -> Array[Float]
+    def rope_freq_base=: (Float) -> Float
+    def rope_freq_base: () -> Float
+    def rope_freq_scale=: (Float) -> Float
+    def rope_freq_scale: () -> Float
     def low_vram: () -> bool
     def low_vram=: (bool) -> bool
     def seed: () -> Integer
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.2
+  version: 0.3.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-08 00:00:00.000000000 Z
+date: 2023-07-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -35,6 +35,8 @@ files:
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
+- ext/llama_cpp/src/ggml-mpi.c
+- ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c