llama_cpp 0.15.1 → 0.15.3

@@ -81,9 +81,10 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
         LLAMA_VOCAB_PRE_TYPE_REFACT    = 8,
         LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
-        LLAMA_VOCAB_PRE_TYPE_QWEN2     = 10,
-        LLAMA_VOCAB_PRE_TYPE_OLMO      = 11,
-        LLAMA_VOCAB_PRE_TYPE_DBRX      = 12,
+        LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2     = 11,
+        LLAMA_VOCAB_PRE_TYPE_OLMO      = 12,
+        LLAMA_VOCAB_PRE_TYPE_DBRX      = 13,
     };

     // note: these values should be synchronized with ggml_rope
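
Worth noting before moving on: inserting LLAMA_VOCAB_PRE_TYPE_STABLELM2 at value 10 renumbers QWEN2, OLMO, and DBRX, so the same raw integer names a different pre-tokenizer on either side of this upgrade. A minimal sketch of the new numbering, assuming nothing beyond the hunk above; any code or serialized state that recorded the old values needs to be rebuilt against the new header:

#include <assert.h>
#include "llama.h"

int main(void) {
    /* The + lines above: STABLELM2 takes value 10, pushing the
       later pre-tokenizer types up by one. */
    assert(LLAMA_VOCAB_PRE_TYPE_STABLELM2 == 10);
    assert(LLAMA_VOCAB_PRE_TYPE_QWEN2     == 11);
    assert(LLAMA_VOCAB_PRE_TYPE_OLMO      == 12);
    assert(LLAMA_VOCAB_PRE_TYPE_DBRX      == 13);
    return 0;
}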
@@ -242,6 +243,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;

+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.
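
The new rpc_servers field slots into llama_model_params next to tensor_split. A hedged sketch of setting it through the C API this package vendors, assuming the usual llama_model_default_params() / llama_load_model_from_file() flow; the model path and the host:port address format are illustrative assumptions (the header only says "comma separated list of RPC servers"):

#include <stddef.h>
#include "llama.h"

int main(void) {
    struct llama_model_params params = llama_model_default_params();

    /* Offload layers as before... */
    params.n_gpu_layers = 99;

    /* ...but now optionally across RPC backends. host:port entries are
       an assumption here, not something the diff spells out. */
    params.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052";

    struct llama_model * model = llama_load_model_from_file("model.gguf", params);
    if (model == NULL) {
        return 1;
    }

    llama_free_model(model);
    return 0;
}

Leaving rpc_servers unset should preserve the previous, purely local offloading behavior.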
@@ -755,6 +759,12 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple tokens).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
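
The two new getters mirror the existing llama_set_n_threads() setter shown in the hunk. A short usage sketch, assuming a model file at "model.gguf" and default model/context parameters (both placeholders):

#include <inttypes.h>
#include <stdio.h>
#include "llama.h"

int main(void) {
    struct llama_model * model =
        llama_load_model_from_file("model.gguf", llama_model_default_params());
    if (model == NULL) return 1;

    struct llama_context * ctx =
        llama_new_context_with_model(model, llama_context_default_params());
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    /* Existing setter: 4 threads for single-token generation,
       8 for prompt/batch processing. */
    llama_set_n_threads(ctx, 4, 8);

    /* New getters read the values back. */
    printf("generation threads: %" PRIu32 "\n", llama_n_threads(ctx));
    printf("batch threads:      %" PRIu32 "\n", llama_n_threads_batch(ctx));

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}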