llama_cpp 0.12.0 → 0.12.2

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -187,6 +187,16 @@
187
187
  # define GGML_API
188
188
  #endif
189
189
 
190
+ #ifdef GGML_MULTIPLATFORM
191
+ # if defined(_WIN32)
192
+ # define GGML_CALL
193
+ # else
194
+ # define GGML_CALL __attribute__((__ms_abi__))
195
+ # endif
196
+ #else
197
+ # define GGML_CALL
198
+ #endif
199
+
190
200
  // TODO: support for clang
191
201
  #ifdef __GNUC__
192
202
  # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -218,7 +228,9 @@
218
228
  #define GGML_MAX_PARAMS 2048
219
229
  #define GGML_MAX_CONTEXTS 64
220
230
  #define GGML_MAX_SRC 10
231
+ #ifndef GGML_MAX_NAME
221
232
  #define GGML_MAX_NAME 64
233
+ #endif
222
234
  #define GGML_MAX_OP_PARAMS 64
223
235
  #define GGML_DEFAULT_N_THREADS 4
224
236
  #define GGML_DEFAULT_GRAPH_SIZE 2048
@@ -339,6 +351,8 @@ extern "C" {
339
351
  GGML_TYPE_Q5_K = 13,
340
352
  GGML_TYPE_Q6_K = 14,
341
353
  GGML_TYPE_Q8_K = 15,
354
+ GGML_TYPE_IQ2_XXS = 16,
355
+ GGML_TYPE_IQ2_XS = 17,
342
356
  GGML_TYPE_I8,
343
357
  GGML_TYPE_I16,
344
358
  GGML_TYPE_I32,
@@ -373,6 +387,8 @@ extern "C" {
373
387
  GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
374
388
  GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
375
389
  GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
390
+ GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
391
+ GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
376
392
  };
377
393
 
378
394
  // available tensor operations:
@@ -643,41 +659,41 @@ extern "C" {
643
659
  GGML_API void ggml_print_object (const struct ggml_object * obj);
644
660
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);
645
661
 
646
- GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
647
- GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
648
- GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
649
- GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
662
+ GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
663
+ GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
664
+ GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
665
+ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
650
666
 
651
- GGML_API int ggml_blck_size(enum ggml_type type);
652
- GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
653
- GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
667
+ GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
668
+ GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
669
+ GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
654
670
 
655
671
  GGML_DEPRECATED(
656
672
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
657
673
  "use ggml_row_size() instead");
658
674
 
659
- GGML_API const char * ggml_type_name(enum ggml_type type);
660
- GGML_API const char * ggml_op_name (enum ggml_op op);
661
- GGML_API const char * ggml_op_symbol(enum ggml_op op);
675
+ GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
676
+ GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
677
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);
662
678
 
663
- GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
664
- GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
679
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
680
+ GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
665
681
 
666
- GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
682
+ GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
667
683
 
668
- GGML_API bool ggml_is_quantized(enum ggml_type type);
684
+ GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
669
685
 
670
686
  // TODO: temporary until model loading of ggml examples is refactored
671
687
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
672
688
 
673
- GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
674
- GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
675
- GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
676
- GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
677
- GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
678
- GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
679
- GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
680
- GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
689
+ GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
690
+ GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
691
+ GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
692
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
693
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
694
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
695
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
696
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
681
697
 
682
698
  GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
683
699
 
@@ -764,7 +780,7 @@ extern "C" {
764
780
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
765
781
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
766
782
 
767
- GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
783
+ GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
768
784
 
769
785
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
770
786
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
@@ -1159,22 +1175,16 @@ extern "C" {
1159
1175
  struct ggml_tensor * a,
1160
1176
  struct ggml_tensor * b);
1161
1177
 
1162
- // a -> b, in-place, return view(b)
1163
- GGML_API struct ggml_tensor * ggml_cpy_inplace(
1178
+ GGML_API struct ggml_tensor * ggml_cast(
1164
1179
  struct ggml_context * ctx,
1165
1180
  struct ggml_tensor * a,
1166
- struct ggml_tensor * b);
1181
+ enum ggml_type type);
1167
1182
 
1168
1183
  // make contiguous
1169
1184
  GGML_API struct ggml_tensor * ggml_cont(
1170
1185
  struct ggml_context * ctx,
1171
1186
  struct ggml_tensor * a);
1172
1187
 
1173
- // make contiguous, in-place
1174
- GGML_API struct ggml_tensor * ggml_cont_inplace(
1175
- struct ggml_context * ctx,
1176
- struct ggml_tensor * a);
1177
-
1178
1188
  // make contiguous, with new shape
1179
1189
  GGML_API struct ggml_tensor * ggml_cont_1d(
1180
1190
  struct ggml_context * ctx,
@@ -1413,7 +1423,7 @@ extern "C" {
1413
1423
  float beta_slow);
1414
1424
 
1415
1425
  // compute correction dims for YaRN RoPE scaling
1416
- void ggml_rope_yarn_corr_dims(
1426
+ GGML_CALL void ggml_rope_yarn_corr_dims(
1417
1427
  int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1418
1428
 
1419
1429
  // xPos RoPE, in-place, returns view(a)
@@ -1847,8 +1857,8 @@ extern "C" {
1847
1857
 
1848
1858
  // ggml_graph_plan() has to be called before ggml_graph_compute()
1849
1859
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1850
- GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1851
- GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1860
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1861
+ GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1852
1862
 
1853
1863
  // same as ggml_graph_compute() but the work data is allocated as a part of the context
1854
1864
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
@@ -2068,7 +2078,18 @@ extern "C" {
2068
2078
  GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2069
2079
  GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2070
2080
 
2071
- GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
2081
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
2082
+ int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
2083
+
2084
+ // These are needed for IQ2_XS and IQ2_XXS quantizations
2085
+ GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
2086
+ GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
2087
+
2088
+ //
2089
+ // Importance matrix
2090
+ //
2091
+ typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
2092
+ GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
2072
2093
 
2073
2094
  //
2074
2095
  // gguf