llama_cpp 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -129,6 +129,7 @@ extern "C" {
129
129
  };
130
130
 
131
131
  enum llama_pooling_type {
132
+ LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
132
133
  LLAMA_POOLING_TYPE_NONE = 0,
133
134
  LLAMA_POOLING_TYPE_MEAN = 1,
134
135
  LLAMA_POOLING_TYPE_CLS = 2,
@@ -162,7 +163,7 @@ extern "C" {
162
163
  // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
163
164
  // - pos : the positions of the respective token in the sequence
164
165
  // - seq_id : the sequence to which the respective token belongs
165
- // - logits : if zero, the logits for the respective token will not be output
166
+ // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
166
167
  //
167
168
  typedef struct llama_batch {
168
169
  int32_t n_tokens;
@@ -172,7 +173,7 @@ extern "C" {
172
173
  llama_pos * pos;
173
174
  int32_t * n_seq_id;
174
175
  llama_seq_id ** seq_id;
175
- int8_t * logits;
176
+ int8_t * logits; // TODO: rename this to "output"
176
177
 
177
178
  // NOTE: helpers for smooth API transition - can be deprecated in the future
178
179
  // for future-proof code, use the above fields instead and ignore everything below
@@ -236,7 +237,10 @@ extern "C" {
236
237
  uint32_t n_batch; // prompt processing maximum batch size
237
238
  uint32_t n_threads; // number of threads to use for generation
238
239
  uint32_t n_threads_batch; // number of threads to use for batch processing
239
- int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
240
+
241
+ enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
242
+ enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
243
+ // (ignored if no pooling layer)
240
244
 
241
245
  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
242
246
  float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -255,10 +259,15 @@ extern "C" {
255
259
  enum ggml_type type_v; // data type for V cache
256
260
 
257
261
  // Keep the booleans together to avoid misalignment during copy-by-value.
258
- bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
259
- bool embedding; // embedding mode only
262
+ bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
263
+ bool embeddings; // if true, extract embeddings (together with logits)
260
264
  bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
261
- bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
265
+
266
+ // Abort callback
267
+ // if it returns true, execution of llama_decode() will be aborted
268
+ // currently works only with CPU execution
269
+ ggml_abort_callback abort_callback;
270
+ void * abort_callback_data;
262
271
  };
263
272
 
264
273
  // model quantization parameters
@@ -632,7 +641,10 @@ extern "C" {
632
641
  // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
633
642
  LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
634
643
 
635
- // Token logits obtained from the last call to llama_eval()
644
+ // Set abort callback
645
+ LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
646
+
647
+ // Token logits obtained from the last call to llama_decode()
636
648
  // The logits for the last token are stored in the last row
637
649
  // Logits for which llama_batch.logits[i] == 0 are undefined
638
650
  // Rows: n_tokens provided with llama_batch
@@ -643,14 +655,20 @@ extern "C" {
643
655
  // llama_get_logits(ctx) + i*n_vocab
644
656
  LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
645
657
 
646
- // Get the embeddings for the input
647
- // shape: [n_embd] (1-dimensional)
658
+ // Get all output token embeddings
659
+ // shape: [n_tokens*n_embd] (1-dimensional)
648
660
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
649
661
 
650
- // Get the embeddings for the ith sequence
662
+ // Get the embeddings for the ith token
651
663
  // llama_get_embeddings(ctx) + i*n_embd
664
+ // shape: [n_embd] (1-dimensional)
652
665
  LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
653
666
 
667
+ // Get the embeddings for a sequence id
668
+ // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
669
+ // shape: [n_embd] (1-dimensional)
670
+ LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
671
+
654
672
  //
655
673
  // Vocab
656
674
  //
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llama_cpp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.0
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshoku
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-03-02 00:00:00.000000000 Z
11
+ date: 2024-03-09 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
14
14
  email: