llama_cpp 0.12.1 → 0.12.2

@@ -43,7 +43,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC    LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 3
+#define LLAMA_SESSION_VERSION 4
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -118,6 +118,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE  = 0, // single GPU
+        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
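The new llama_split_mode enum makes the multi-GPU placement strategy explicit instead of implied. A minimal C sketch of choosing a mode; pick_split_mode() is a hypothetical helper, not part of llama.h:

#include "llama.h"

// Hypothetical helper: pick a split mode from the number of available devices.
static enum llama_split_mode pick_split_mode(int n_devices) {
    if (n_devices <= 1) {
        return LLAMA_SPLIT_NONE;   // keep everything on a single GPU
    }
    // LLAMA_SPLIT_LAYER distributes whole layers (and the KV cache) across GPUs;
    // LLAMA_SPLIT_ROW splits individual tensors by rows instead.
    return LLAMA_SPLIT_LAYER;
}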
@@ -180,8 +186,16 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
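Together with the enum above, the reworked llama_model_params fields are filled in before loading a model. A minimal C sketch, assuming a multi-GPU build with two devices; llama_model_default_params() and llama_load_model_from_file() are existing llama.h entry points, while load_split_model() and the 60/40 split are illustrative only:

#include "llama.h"

// Hypothetical helper: load a model split layer-wise across two GPUs.
static struct llama_model * load_split_model(const char * path) {
    struct llama_model_params params = llama_model_default_params();

    params.n_gpu_layers = 99;                // offload as many layers as possible
    params.split_mode   = LLAMA_SPLIT_LAYER; // distribute layers and KV across GPUs
    params.main_gpu     = 0;                 // ignored for LLAMA_SPLIT_LAYER

    // proportion of the model to place on each device;
    // a real build should provide LLAMA_MAX_DEVICES entries
    static const float tensor_split[2] = { 0.6f, 0.4f };
    params.tensor_split = tensor_split;

    return llama_load_model_from_file(path, params);
}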
@@ -235,6 +249,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
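The new imatrix field lets callers pass importance-matrix data into quantization; leaving it NULL preserves the previous behaviour. A minimal C sketch, where quantize_with_imatrix() is a hypothetical wrapper and the imatrix data itself is assumed to come from elsewhere (llama.h does not define its layout):

#include "llama.h"

// Hypothetical wrapper: quantize a GGUF file, optionally with importance-matrix data.
static int quantize_with_imatrix(const char * fname_inp,
                                 const char * fname_out,
                                 void * imatrix_data) {
    llama_model_quantize_params params = llama_model_quantize_default_params();

    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
    params.imatrix = imatrix_data;              // NULL disables importance weighting

    return (int) llama_model_quantize(fname_inp, fname_out, &params);
}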
@@ -699,14 +714,21 @@ extern "C" {
                        float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_classifier_free_guidance(
+    /// @param logits Logits extracted from the original generation context.
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_apply_guidance(
+            struct llama_context * ctx,
+                           float * logits,
+                           float * logits_guidance,
+                           float   scale);
+
+    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
               struct llama_context * guidance_ctx,
-                             float   scale);
+                             float   scale),
+              "use llama_sample_apply_guidance() instead");
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(
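The replacement API operates on raw logits arrays rather than a candidates array plus a second context handle, so the deprecated call migrates roughly as follows. A minimal C sketch; llama_get_logits() is an existing llama.h entry point, while apply_guidance() and the assumption that both contexts have just been decoded are illustrative:

#include "llama.h"

// Hypothetical helper: apply guidance using the freshly decoded logits of
// the main context and of the separate guidance context.
static void apply_guidance(struct llama_context * ctx,
                           struct llama_context * guidance_ctx,
                           float scale) {
    float * logits          = llama_get_logits(ctx);
    float * logits_guidance = llama_get_logits(guidance_ctx);

    // old: llama_sample_classifier_free_guidance(ctx, &candidates, guidance_ctx, scale);
    llama_sample_apply_guidance(ctx, logits, logits_guidance, scale);
}

After the call, the caller builds its llama_token_data_array from the adjusted logits and continues sampling as before.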
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.1
+  version: 0.12.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-13 00:00:00.000000000 Z
+date: 2024-01-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: