llama_cpp 0.12.6 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,6 +64,15 @@ extern "C" {
64
64
  LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
65
65
  };
66
66
 
67
+ // note: these values should be synchronized with ggml_rope
68
+ // TODO: maybe move this enum to ggml.h (ggml_rope_type)
69
+ enum llama_rope_type {
70
+ LLAMA_ROPE_TYPE_NONE = -1,
71
+ LLAMA_ROPE_TYPE_NORM = 0,
72
+ LLAMA_ROPE_TYPE_NEOX = 2,
73
+ LLAMA_ROPE_TYPE_GLM = 4,
74
+ };
75
+
67
76
  enum llama_token_type {
68
77
  LLAMA_TOKEN_TYPE_UNDEFINED = 0,
69
78
  LLAMA_TOKEN_TYPE_NORMAL = 1,
@@ -98,24 +107,37 @@ extern "C" {
98
107
  LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
99
108
  LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
100
109
  LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
101
- LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
110
+ LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
102
111
  LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
112
+ LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
113
+ LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
114
+ LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
115
+ LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
116
+ LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
117
+ LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
118
+ LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
103
119
 
104
120
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
105
121
  };
106
122
 
107
123
  enum llama_rope_scaling_type {
108
- LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
109
- LLAMA_ROPE_SCALING_NONE = 0,
110
- LLAMA_ROPE_SCALING_LINEAR = 1,
111
- LLAMA_ROPE_SCALING_YARN = 2,
112
- LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
124
+ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
125
+ LLAMA_ROPE_SCALING_TYPE_NONE = 0,
126
+ LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
127
+ LLAMA_ROPE_SCALING_TYPE_YARN = 2,
128
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
129
+ };
130
+
131
+ enum llama_pooling_type {
132
+ LLAMA_POOLING_TYPE_NONE = 0,
133
+ LLAMA_POOLING_TYPE_MEAN = 1,
134
+ LLAMA_POOLING_TYPE_CLS = 2,
113
135
  };
114
136
 
115
137
  enum llama_split_mode {
116
- LLAMA_SPLIT_NONE = 0, // single GPU
117
- LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
118
- LLAMA_SPLIT_ROW = 2, // split rows across GPUs
138
+ LLAMA_SPLIT_MODE_NONE = 0, // single GPU
139
+ LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
140
+ LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
119
141
  };
120
142
 
121
143
  typedef struct llama_token_data {
@@ -163,9 +185,9 @@ extern "C" {
163
185
  } llama_batch;
164
186
 
165
187
  enum llama_model_kv_override_type {
166
- LLAMA_KV_OVERRIDE_INT,
167
- LLAMA_KV_OVERRIDE_FLOAT,
168
- LLAMA_KV_OVERRIDE_BOOL,
188
+ LLAMA_KV_OVERRIDE_TYPE_INT,
189
+ LLAMA_KV_OVERRIDE_TYPE_FLOAT,
190
+ LLAMA_KV_OVERRIDE_TYPE_BOOL,
169
191
  };
170
192
 
171
193
  struct llama_model_kv_override {
@@ -224,6 +246,7 @@ extern "C" {
224
246
  float yarn_beta_fast; // YaRN low correction dim
225
247
  float yarn_beta_slow; // YaRN high correction dim
226
248
  uint32_t yarn_orig_ctx; // YaRN original context size
249
+ float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
227
250
 
228
251
  ggml_backend_sched_eval_callback cb_eval;
229
252
  void * cb_eval_user_data;
@@ -232,7 +255,6 @@ extern "C" {
232
255
  enum ggml_type type_v; // data type for V cache
233
256
 
234
257
  // Keep the booleans together to avoid misalignment during copy-by-value.
235
- bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
236
258
  bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
237
259
  bool embedding; // embedding mode only
238
260
  bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
@@ -298,6 +320,12 @@ extern "C" {
298
320
  int32_t n_eval;
299
321
  };
300
322
 
323
+ // used in chat template
324
+ typedef struct llama_chat_message {
325
+ const char * role;
326
+ const char * content;
327
+ } llama_chat_message;
328
+
301
329
  // Helpers for getting default parameters
302
330
  LLAMA_API struct llama_model_params llama_model_default_params(void);
303
331
  LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -306,7 +334,10 @@ extern "C" {
306
334
  // Initialize the llama + ggml backend
307
335
  // If numa is true, use NUMA optimizations
308
336
  // Call once at the start of the program
309
- LLAMA_API void llama_backend_init(bool numa);
337
+ LLAMA_API void llama_backend_init(void);
338
+
339
+ //optional:
340
+ LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
310
341
 
311
342
  // Call once at the end of the program - currently only used for MPI
312
343
  LLAMA_API void llama_backend_free(void);
@@ -332,15 +363,13 @@ extern "C" {
332
363
  LLAMA_API bool llama_supports_mlock (void);
333
364
  LLAMA_API bool llama_supports_gpu_offload(void);
334
365
 
335
- LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
336
- LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
337
-
338
366
  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
339
367
 
340
368
  LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
341
369
  LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
342
370
 
343
371
  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
372
+ LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
344
373
 
345
374
  LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
346
375
  LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -390,14 +419,6 @@ extern "C" {
390
419
  // The model needs to be reloaded before applying a new adapter, otherwise the adapter
391
420
  // will be applied on top of the previous one
392
421
  // Returns 0 on success
393
- LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
394
- struct llama_context * ctx,
395
- const char * path_lora,
396
- float scale,
397
- const char * path_base_model,
398
- int32_t n_threads),
399
- "use llama_model_apply_lora_from_file instead");
400
-
401
422
  LLAMA_API int32_t llama_model_apply_lora_from_file(
402
423
  const struct llama_model * model,
403
424
  const char * path_lora,
@@ -495,10 +516,12 @@ extern "C" {
495
516
  llama_seq_id seq_id);
496
517
 
497
518
  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
498
- // If the KV cache is RoPEd, the KV data is updated accordingly
519
+ // If the KV cache is RoPEd, the KV data is updated accordingly:
520
+ // - lazily on next llama_decode()
521
+ // - explicitly with llama_kv_cache_update()
499
522
  // p0 < 0 : [0, p1]
500
523
  // p1 < 0 : [p0, inf)
501
- LLAMA_API void llama_kv_cache_seq_shift(
524
+ LLAMA_API void llama_kv_cache_seq_add(
502
525
  struct llama_context * ctx,
503
526
  llama_seq_id seq_id,
504
527
  llama_pos p0,
@@ -506,7 +529,9 @@ extern "C" {
506
529
  llama_pos delta);
507
530
 
508
531
  // Integer division of the positions by factor of `d > 1`
509
- // If the KV cache is RoPEd, the KV data is updated accordingly
532
+ // If the KV cache is RoPEd, the KV data is updated accordingly:
533
+ // - lazily on next llama_decode()
534
+ // - explicitly with llama_kv_cache_update()
510
535
  // p0 < 0 : [0, p1]
511
536
  // p1 < 0 : [p0, inf)
512
537
  LLAMA_API void llama_kv_cache_seq_div(
@@ -516,6 +541,20 @@ extern "C" {
516
541
  llama_pos p1,
517
542
  int d);
518
543
 
544
+ // Returns the largest position present in the KV cache for the specified sequence
545
+ LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
546
+ struct llama_context * ctx,
547
+ llama_seq_id seq_id);
548
+
549
+ // Defragment the KV cache
550
+ // This will be applied:
551
+ // - lazily on next llama_decode()
552
+ // - explicitly with llama_kv_cache_update()
553
+ LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
554
+
555
+ // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
556
+ LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
557
+
519
558
  //
520
559
  // State / sessions
521
560
  //
@@ -535,7 +574,7 @@ extern "C" {
535
574
  // Returns the number of bytes read
536
575
  LLAMA_API size_t llama_set_state_data(
537
576
  struct llama_context * ctx,
538
- uint8_t * src);
577
+ const uint8_t * src);
539
578
 
540
579
  // Save/load session file
541
580
  LLAMA_API bool llama_load_session_file(
@@ -555,27 +594,6 @@ extern "C" {
555
594
  // Decoding
556
595
  //
557
596
 
558
- // Run the llama inference to obtain the logits and probabilities for the next token(s).
559
- // tokens + n_tokens is the provided batch of new tokens to process
560
- // n_past is the number of tokens to use from previous eval calls
561
- // Returns 0 on success
562
- // DEPRECATED: use llama_decode() instead
563
- LLAMA_API DEPRECATED(int llama_eval(
564
- struct llama_context * ctx,
565
- llama_token * tokens,
566
- int32_t n_tokens,
567
- int32_t n_past),
568
- "use llama_decode() instead");
569
-
570
- // Same as llama_eval, but use float matrix input directly.
571
- // DEPRECATED: use llama_decode() instead
572
- LLAMA_API DEPRECATED(int llama_eval_embd(
573
- struct llama_context * ctx,
574
- float * embd,
575
- int32_t n_tokens,
576
- int32_t n_past),
577
- "use llama_decode() instead");
578
-
579
597
  // Return batch for single sequence of tokens starting at pos_0
580
598
  //
581
599
  // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
@@ -689,6 +707,25 @@ extern "C" {
689
707
  char * buf,
690
708
  int32_t length);
691
709
 
710
+ /// Apply chat template. Inspired by hf apply_chat_template() on python.
711
+ /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
712
+ /// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
713
+ /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
714
+ /// @param chat Pointer to a list of multiple llama_chat_message
715
+ /// @param n_msg Number of llama_chat_message in this chat
716
+ /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
717
+ /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
718
+ /// @param length The size of the allocated buffer
719
+ /// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
720
+ LLAMA_API int32_t llama_chat_apply_template(
721
+ const struct llama_model * model,
722
+ const char * tmpl,
723
+ const struct llama_chat_message * chat,
724
+ size_t n_msg,
725
+ bool add_ass,
726
+ char * buf,
727
+ int32_t length);
728
+
692
729
  //
693
730
  // Grammar
694
731
  //
@@ -730,13 +767,6 @@ extern "C" {
730
767
  float * logits_guidance,
731
768
  float scale);
732
769
 
733
- LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
734
- struct llama_context * ctx,
735
- llama_token_data_array * candidates,
736
- struct llama_context * guidance_ctx,
737
- float scale),
738
- "use llama_sample_apply_guidance() instead");
739
-
740
770
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
741
771
  LLAMA_API void llama_sample_softmax(
742
772
  struct llama_context * ctx,
@@ -790,12 +820,6 @@ extern "C" {
790
820
  llama_token_data_array * candidates,
791
821
  float temp);
792
822
 
793
- LLAMA_API DEPRECATED(void llama_sample_temperature(
794
- struct llama_context * ctx,
795
- llama_token_data_array * candidates,
796
- float temp),
797
- "use llama_sample_temp instead");
798
-
799
823
  /// @details Apply constraints from grammar
800
824
  LLAMA_API void llama_sample_grammar(
801
825
  struct llama_context * ctx,
@@ -1,6 +1,6 @@
1
1
  ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
2
2
  GF_CC_IS_GCC = 1
3
- GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
3
+ GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }')
4
4
  else
5
5
  GF_CC_IS_CLANG = 1
6
6
  ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'