llama_cpp 0.12.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,7 +43,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 3
+#define LLAMA_SESSION_VERSION 4
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -118,6 +118,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE  = 0, // single GPU
+        LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_ROW   = 2, // split rows across GPUs
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -180,8 +186,16 @@ extern "C" {
 
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
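
Usage note (not part of the diff): a minimal sketch of how a caller of the bundled llama.cpp C API might fill in the reworked llama_model_params fields. The model path, layer count, and 60/40 split ratio are illustrative assumptions, and the sketch assumes a build with LLAMA_MAX_DEVICES >= 2.

    #include "llama.h"

    // Sketch only: request a layer-wise split across two GPUs.
    static struct llama_model * load_split_model(void) {
        struct llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = 99;                // offload all layers that fit in VRAM
        mparams.split_mode   = LLAMA_SPLIT_LAYER; // split layers and KV across GPUs
        mparams.main_gpu     = 0;                 // ignored for LLAMA_SPLIT_LAYER

        static float split[LLAMA_MAX_DEVICES] = { 0 };
        split[0] = 0.6f; // proportion of the model on device 0
        split[1] = 0.4f; // proportion of the model on device 1
        mparams.tensor_split = split;

        return llama_load_model_from_file("model.gguf", mparams);
    }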
@@ -235,6 +249,7 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        void * imatrix;              // pointer to importance matrix data
     } llama_model_quantize_params;
 
     // grammar types
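
Usage note (not part of the diff): the layout of the data behind the new imatrix pointer is internal to llama.cpp's quantization tooling, so the sketch below simply leaves it at NULL; the file names and target ftype are illustrative assumptions.

    #include <stddef.h>
    #include "llama.h"

    // Sketch only: quantize a model without importance matrix data.
    static int quantize_without_imatrix(void) {
        llama_model_quantize_params qparams = llama_model_quantize_default_params();

        qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
        qparams.imatrix = NULL;                      // no importance matrix data

        return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
    }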
@@ -699,14 +714,21 @@ extern "C" {
                              float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_classifier_free_guidance(
+    /// @param logits Logits extracted from the original generation context.
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_apply_guidance(
+            struct llama_context * ctx,
+                           float * logits,
+                           float * logits_guidance,
+                           float   scale);
+
+    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
               struct llama_context * guidance_ctx,
-                             float   scale);
+                             float   scale),
+              "use llama_sample_apply_guidance() instead");
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(
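
Usage note (not part of the diff): a minimal sketch of migrating from the deprecated llama_sample_classifier_free_guidance() to llama_sample_apply_guidance(). Here ctx and ctx_guidance are assumed to be two contexts over the same model that were just decoded, with the guidance context holding the negative prompt plus the same generated/user tokens as the main context.

    #include "llama.h"

    // Sketch only: apply classifier-free guidance directly on raw logits.
    static void apply_cfg(struct llama_context * ctx,
                          struct llama_context * ctx_guidance,
                          float scale) {
        float * logits          = llama_get_logits(ctx);
        float * logits_guidance = llama_get_logits(ctx_guidance);

        // Adjusts `logits` in place; a scale of 1.0f leaves them unchanged.
        llama_sample_apply_guidance(ctx, logits, logits_guidance, scale);
    }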
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.12.1
+  version: 0.12.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-01-13 00:00:00.000000000 Z
+date: 2024-01-20 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: