llama_cpp 0.14.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -337,24 +337,24 @@ extern "C" {
337
337
  struct ggml_object;
338
338
  struct ggml_context;
339
339
 
340
+ // NOTE: always add types at the end of the enum to keep backward compatibility
340
341
  enum ggml_type {
341
- GGML_TYPE_F32 = 0,
342
- GGML_TYPE_F16 = 1,
343
- GGML_TYPE_Q4_0 = 2,
344
- GGML_TYPE_Q4_1 = 3,
342
+ GGML_TYPE_F32 = 0,
343
+ GGML_TYPE_F16 = 1,
344
+ GGML_TYPE_Q4_0 = 2,
345
+ GGML_TYPE_Q4_1 = 3,
345
346
  // GGML_TYPE_Q4_2 = 4, support has been removed
346
- // GGML_TYPE_Q4_3 (5) support has been removed
347
- GGML_TYPE_Q5_0 = 6,
348
- GGML_TYPE_Q5_1 = 7,
349
- GGML_TYPE_Q8_0 = 8,
350
- GGML_TYPE_Q8_1 = 9,
351
- // k-quantizations
352
- GGML_TYPE_Q2_K = 10,
353
- GGML_TYPE_Q3_K = 11,
354
- GGML_TYPE_Q4_K = 12,
355
- GGML_TYPE_Q5_K = 13,
356
- GGML_TYPE_Q6_K = 14,
357
- GGML_TYPE_Q8_K = 15,
347
+ // GGML_TYPE_Q4_3 = 5, support has been removed
348
+ GGML_TYPE_Q5_0 = 6,
349
+ GGML_TYPE_Q5_1 = 7,
350
+ GGML_TYPE_Q8_0 = 8,
351
+ GGML_TYPE_Q8_1 = 9,
352
+ GGML_TYPE_Q2_K = 10,
353
+ GGML_TYPE_Q3_K = 11,
354
+ GGML_TYPE_Q4_K = 12,
355
+ GGML_TYPE_Q5_K = 13,
356
+ GGML_TYPE_Q6_K = 14,
357
+ GGML_TYPE_Q8_K = 15,
358
358
  GGML_TYPE_IQ2_XXS = 16,
359
359
  GGML_TYPE_IQ2_XS = 17,
360
360
  GGML_TYPE_IQ3_XXS = 18,
@@ -363,9 +363,11 @@ extern "C" {
363
363
  GGML_TYPE_IQ3_S = 21,
364
364
  GGML_TYPE_IQ2_S = 22,
365
365
  GGML_TYPE_IQ4_XS = 23,
366
- GGML_TYPE_I8,
367
- GGML_TYPE_I16,
368
- GGML_TYPE_I32,
366
+ GGML_TYPE_I8 = 24,
367
+ GGML_TYPE_I16 = 25,
368
+ GGML_TYPE_I32 = 26,
369
+ GGML_TYPE_I64 = 27,
370
+ GGML_TYPE_F64 = 28,
369
371
  GGML_TYPE_COUNT,
370
372
  };
371
373
 
@@ -383,20 +385,20 @@ extern "C" {
383
385
 
384
386
  // model file types
385
387
  enum ggml_ftype {
386
- GGML_FTYPE_UNKNOWN = -1,
387
- GGML_FTYPE_ALL_F32 = 0,
388
- GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
389
- GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
390
- GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
388
+ GGML_FTYPE_UNKNOWN = -1,
389
+ GGML_FTYPE_ALL_F32 = 0,
390
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
391
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
392
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
391
393
  GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
392
- GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
393
- GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
394
- GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
395
- GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
396
- GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
397
- GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
398
- GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
399
- GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
394
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
395
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
396
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
397
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
398
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
399
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
400
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
401
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
400
402
  GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
401
403
  GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
402
404
  GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
@@ -472,6 +474,8 @@ extern "C" {
472
474
  GGML_OP_FLASH_ATTN,
473
475
  GGML_OP_FLASH_FF,
474
476
  GGML_OP_FLASH_ATTN_BACK,
477
+ GGML_OP_SSM_CONV,
478
+ GGML_OP_SSM_SCAN,
475
479
  GGML_OP_WIN_PART,
476
480
  GGML_OP_WIN_UNPART,
477
481
  GGML_OP_GET_REL_POS,
@@ -1728,6 +1732,23 @@ extern "C" {
1728
1732
  struct ggml_tensor * c0,
1729
1733
  struct ggml_tensor * c1);
1730
1734
 
1735
+ GGML_API struct ggml_tensor * ggml_ssm_conv(
1736
+ struct ggml_context * ctx,
1737
+ struct ggml_tensor * s,
1738
+ struct ggml_tensor * x,
1739
+ struct ggml_tensor * c,
1740
+ struct ggml_tensor * sq);
1741
+
1742
+ GGML_API struct ggml_tensor * ggml_ssm_scan(
1743
+ struct ggml_context * ctx,
1744
+ struct ggml_tensor * s,
1745
+ struct ggml_tensor * x,
1746
+ struct ggml_tensor * dt,
1747
+ struct ggml_tensor * A,
1748
+ struct ggml_tensor * B,
1749
+ struct ggml_tensor * C,
1750
+ struct ggml_tensor * sq);
1751
+
1731
1752
  // partition into non-overlapping windows with padding if needed
1732
1753
  // example:
1733
1754
  // a: 768 64 64 1
@@ -2175,25 +2196,18 @@ extern "C" {
2175
2196
  GGML_API void ggml_quantize_init(enum ggml_type type);
2176
2197
  GGML_API void ggml_quantize_free(void);
2177
2198
 
2178
- // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
2179
- GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
2180
- GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
2181
- GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
2182
- GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
2183
- GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
2184
-
2185
- GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
2186
- GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
2187
- GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
2188
- GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2189
- GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2190
-
2191
2199
  // some quantization type cannot be used without an importance matrix
2192
2200
  GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2193
2201
 
2194
2202
  // calls ggml_quantize_init internally (i.e. can allocate memory)
2195
- GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
2196
- int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
2203
+ GGML_API size_t ggml_quantize_chunk(
2204
+ enum ggml_type type,
2205
+ const float * src,
2206
+ void * dst,
2207
+ int start,
2208
+ int nrows,
2209
+ int n_per_row,
2210
+ const float * imatrix);
2197
2211
 
2198
2212
  //
2199
2213
  // gguf