llama_cpp 0.16.1 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -174,6 +174,7 @@ extern "C" {
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
     };
 
     enum llama_split_mode {
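The new LLAMA_POOLING_TYPE_LAST value selects last-token pooling for sequence embeddings, alongside the existing mean and CLS pooling modes. Below is a minimal sketch (not part of the diff) of requesting it when creating an embeddings context; the model path is a placeholder and error handling is omitted.

    #include "llama.h"

    int main(void) {
        llama_backend_init();

        struct llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

        struct llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // return embeddings rather than logits
        cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;  // added in this diff: pool on the last token

        struct llama_context * ctx = llama_new_context_with_model(model, cparams);
        // ... tokenize, llama_decode(), then llama_get_embeddings_seq(ctx, 0) ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }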
@@ -293,7 +294,6 @@ extern "C" {
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type pooling_type;           // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -786,6 +786,10 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
 
+    // Set whether the model is in embeddings mode or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
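The newly exported llama_set_embeddings() lets callers toggle embeddings mode on an existing context instead of fixing it at creation time through llama_context_params.embeddings. A hedged sketch of one way it could be used follows; the helper name and the prepared batch are assumptions, not part of the library.

    #include <stddef.h>
    #include "llama.h"

    // Hypothetical helper: decode one prepared batch in embeddings mode and
    // return the pooled embedding for sequence 0, then restore logits mode.
    static const float * embed_once(struct llama_context * ctx, struct llama_batch batch) {
        llama_set_embeddings(ctx, true);            // decode now produces embeddings, not logits
        const float * emb = NULL;
        if (llama_decode(ctx, batch) == 0) {
            emb = llama_get_embeddings_seq(ctx, 0); // pooled embedding for sequence 0
        }
        llama_set_embeddings(ctx, false);           // switch back to logits for generation
        return emb;
    }

If the caller needs to copy the returned vector, its length can be read with llama_n_embd(model).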
@@ -43,8 +43,10 @@
 // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
 // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
 
+#if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wpedantic"
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
 
 #include "sgemm.h"
 #include "ggml-impl.h"