llama_cpp 0.16.1 → 0.16.2

@@ -174,6 +174,7 @@ extern "C" {
  LLAMA_POOLING_TYPE_NONE = 0,
  LLAMA_POOLING_TYPE_MEAN = 1,
  LLAMA_POOLING_TYPE_CLS = 2,
+ LLAMA_POOLING_TYPE_LAST = 3,
  };

  enum llama_split_mode {
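
The new LLAMA_POOLING_TYPE_LAST value pools a sequence's embedding from its last token. A minimal sketch of selecting it through the context parameters; the helper name and the surrounding model loading are assumptions and not part of this diff:

    #include "llama.h"

    // Hypothetical setup: build a context that returns one pooled embedding
    // per sequence, taken from the last token of that sequence.
    struct llama_context * make_last_pooled_ctx(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // extract embeddings
        cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;  // added in this release
        return llama_new_context_with_model(model, cparams);
    }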
@@ -293,7 +294,6 @@ extern "C" {

  enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
  enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
- // (ignored if no pooling layer)

  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -786,6 +786,10 @@ extern "C" {
  // Get the number of threads used for prompt and batch processing (multiple token).
  LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

+ // Set whether the model is in embeddings mode or not
+ // If true, embeddings will be returned but logits will not
+ LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
  // Set whether to use causal attention or not
  // If set to true, the model will only attend to the past tokens
  LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
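
llama_set_embeddings lets a single context be switched between text generation and embedding extraction at runtime instead of fixing the mode at context creation. A rough usage sketch; the batch construction and the helper name are assumptions, not part of the gem:

    #include "llama.h"

    // Hypothetical helper: run one embedding pass, then return to normal
    // generation on the same context. The batch is prepared by the caller.
    void embed_then_generate(struct llama_context * ctx, struct llama_batch batch) {
        llama_set_embeddings(ctx, true);              // embeddings on, logits off
        if (llama_decode(ctx, batch) == 0) {
            const float * emb = llama_get_embeddings_seq(ctx, 0);
            (void) emb;                               // one pooled vector per sequence
        }

        llama_set_embeddings(ctx, false);             // logits again for generation
        llama_decode(ctx, batch);
    }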
@@ -43,8 +43,10 @@
  // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
  // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

+ #if defined(__GNUC__)
  #pragma GCC diagnostic ignored "-Wpedantic"
  #pragma GCC diagnostic ignored "-Wignored-attributes"
+ #endif

  #include "sgemm.h"
  #include "ggml-impl.h"