llama_cpp 0.15.1 → 0.15.3

@@ -81,9 +81,10 @@ extern "C" {
   LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
   LLAMA_VOCAB_PRE_TYPE_REFACT    = 8,
   LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
-  LLAMA_VOCAB_PRE_TYPE_QWEN2     = 10,
-  LLAMA_VOCAB_PRE_TYPE_OLMO      = 11,
-  LLAMA_VOCAB_PRE_TYPE_DBRX      = 12,
+  LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
+  LLAMA_VOCAB_PRE_TYPE_QWEN2     = 11,
+  LLAMA_VOCAB_PRE_TYPE_OLMO      = 12,
+  LLAMA_VOCAB_PRE_TYPE_DBRX      = 13,
 };

 // note: these values should be synchronized with ggml_rope
@@ -242,6 +243,9 @@ extern "C" {
   // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
   const float * tensor_split;

+  // comma separated list of RPC servers to use for offloading
+  const char * rpc_servers;
+
   // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
   // If the provided progress_callback returns true, model loading continues.
   // If it returns false, model loading is immediately aborted.
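The new rpc_servers field sits next to tensor_split, so it can be filled in the same way before loading a model. A minimal sketch in C, assuming the field belongs to llama_model_params (as the surrounding tensor_split/progress_callback context suggests); the host:port addresses are placeholders:

    #include "llama.h"

    int main(void) {
        struct llama_model_params mparams = llama_model_default_params();

        // Placeholder addresses: offload work to two llama.cpp RPC servers.
        mparams.rpc_servers = "192.168.0.10:50052,192.168.0.11:50052";

        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        llama_free_model(model);
        return 0;
    }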
@@ -755,6 +759,12 @@ extern "C" {
   // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
   LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

+  // Get the number of threads used for generation of a single token.
+  LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+  // Get the number of threads used for prompt and batch processing (multiple token).
+  LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
   // Set whether to use causal attention or not
   // If set to true, the model will only attend to the past tokens
   LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
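A short sketch pairing the new getters with the existing llama_set_n_threads setter; the thread counts are arbitrary and the context is assumed to have been created elsewhere:

    #include <stdio.h>
    #include "llama.h"

    // Sketch: set the thread counts, then read them back with the new getters.
    // Assumes ctx is a valid llama_context created elsewhere.
    static void report_threads(struct llama_context * ctx) {
        llama_set_n_threads(ctx, 8, 16);   // 8 generation threads, 16 batch threads

        printf("generation threads: %u\n", llama_n_threads(ctx));
        printf("batch threads:      %u\n", llama_n_threads_batch(ctx));
    }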