llama_cpp 0.13.0 → 0.14.1

This diff reflects the contents of the publicly released package versions as published to their respective registries, and is provided for informational purposes only.
@@ -315,6 +315,16 @@
 extern "C" {
 #endif
 
+    enum ggml_status {
+        GGML_STATUS_ALLOC_FAILED = -2,
+        GGML_STATUS_FAILED = -1,
+        GGML_STATUS_SUCCESS = 0,
+        GGML_STATUS_ABORTED = 1,
+    };
+
+    // get ggml_status name string
+    GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+
     typedef uint16_t ggml_fp16_t;
 
     // convert FP16 <-> FP32
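
The new ggml_status enum gives ggml's compute entry points a typed result code, and ggml_status_to_string turns a status into a printable name. Below is a minimal sketch of how caller code might log a failure, assuming only what the header above declares; the helper name is hypothetical.

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical helper: log any non-success status together with its name string.
    static void log_ggml_status(const char * where, enum ggml_status status) {
        if (status != GGML_STATUS_SUCCESS) {
            fprintf(stderr, "%s: %s\n", where, ggml_status_to_string(status));
        }
    }
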
@@ -327,24 +337,24 @@ extern "C" {
     struct ggml_object;
     struct ggml_context;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
-        GGML_TYPE_F32  = 0,
-        GGML_TYPE_F16  = 1,
-        GGML_TYPE_Q4_0 = 2,
-        GGML_TYPE_Q4_1 = 3,
+        GGML_TYPE_F32     = 0,
+        GGML_TYPE_F16     = 1,
+        GGML_TYPE_Q4_0    = 2,
+        GGML_TYPE_Q4_1    = 3,
         // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
-        GGML_TYPE_Q5_0 = 6,
-        GGML_TYPE_Q5_1 = 7,
-        GGML_TYPE_Q8_0 = 8,
-        GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
-        GGML_TYPE_Q2_K = 10,
-        GGML_TYPE_Q3_K = 11,
-        GGML_TYPE_Q4_K = 12,
-        GGML_TYPE_Q5_K = 13,
-        GGML_TYPE_Q6_K = 14,
-        GGML_TYPE_Q8_K = 15,
+        // GGML_TYPE_Q4_3 = 5, support has been removed
+        GGML_TYPE_Q5_0    = 6,
+        GGML_TYPE_Q5_1    = 7,
+        GGML_TYPE_Q8_0    = 8,
+        GGML_TYPE_Q8_1    = 9,
+        GGML_TYPE_Q2_K    = 10,
+        GGML_TYPE_Q3_K    = 11,
+        GGML_TYPE_Q4_K    = 12,
+        GGML_TYPE_Q5_K    = 13,
+        GGML_TYPE_Q6_K    = 14,
+        GGML_TYPE_Q8_K    = 15,
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
@@ -353,9 +363,11 @@ extern "C" {
         GGML_TYPE_IQ3_S   = 21,
         GGML_TYPE_IQ2_S   = 22,
         GGML_TYPE_IQ4_XS  = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
+        GGML_TYPE_I8      = 24,
+        GGML_TYPE_I16     = 25,
+        GGML_TYPE_I32     = 26,
+        GGML_TYPE_I64     = 27,
+        GGML_TYPE_F64     = 28,
         GGML_TYPE_COUNT,
     };
 
@@ -373,20 +385,20 @@ extern "C" {
 
     // model file types
     enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN     = -1,
-        GGML_FTYPE_ALL_F32     = 0,
-        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_UNKNOWN        = -1,
+        GGML_FTYPE_ALL_F32        = 0,
+        GGML_FTYPE_MOSTLY_F16     = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0    = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1    = 3,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0    = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0    = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1    = 9,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K    = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K    = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K    = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K    = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K    = 14, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
@@ -454,12 +466,16 @@ extern "C" {
         GGML_OP_POOL_2D,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
+        GGML_OP_ARANGE,
+        GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
+        GGML_OP_SSM_CONV,
+        GGML_OP_SSM_SCAN,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
         GGML_OP_GET_REL_POS,
@@ -1661,6 +1677,15 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+    // timesteps: [N,]
+    // return: [N, dim]
+    GGML_API struct ggml_tensor * ggml_timestep_embedding(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * timesteps,
+            int                   dim,
+            int                   max_period);
+
     // sort rows
     enum ggml_sort_order {
         GGML_SORT_ORDER_ASC,
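
ggml_timestep_embedding implements the sinusoidal timestep embedding referenced above (the CompVis diffusion utility): given a 1-D tensor of N timesteps it builds an [N, dim] result. A hedged sketch of a call follows; the context and the timesteps tensor are assumed to exist, and dim = 320, max_period = 10000 are illustrative values, not defaults from the header.

    #include "ggml.h"

    // Sketch: build the embedding node for a 1-D F32 tensor of timesteps.
    // This only records the op in the graph; it still has to be computed.
    static struct ggml_tensor * build_timestep_emb(struct ggml_context * ctx,
                                                   struct ggml_tensor  * timesteps) {
        return ggml_timestep_embedding(ctx, timesteps, /*dim=*/320, /*max_period=*/10000);
    }
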
@@ -1672,6 +1697,12 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_sort_order  order);
 
+    GGML_API struct ggml_tensor * ggml_arange(
+            struct ggml_context * ctx,
+            float                 start,
+            float                 stop,
+            float                 step);
+
     // top k elements per row
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,
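
ggml_arange builds a 1-D range tensor from start/stop/step, much like numpy.arange. A minimal sketch with illustrative bounds; as with the other builders, the result is a graph node that still needs to be computed.

    #include "ggml.h"

    // Sketch: a float32 range 0, 1, ..., 63 (the bounds are illustrative).
    static struct ggml_tensor * build_range(struct ggml_context * ctx) {
        return ggml_arange(ctx, /*start=*/0.0f, /*stop=*/64.0f, /*step=*/1.0f);
    }
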
@@ -1701,6 +1732,23 @@ extern "C" {
             struct ggml_tensor  * c0,
             struct ggml_tensor  * c1);
 
+    GGML_API struct ggml_tensor * ggml_ssm_conv(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * s,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * c,
+            struct ggml_tensor  * sq);
+
+    GGML_API struct ggml_tensor * ggml_ssm_scan(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * s,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * dt,
+            struct ggml_tensor  * A,
+            struct ggml_tensor  * B,
+            struct ggml_tensor  * C,
+            struct ggml_tensor  * sq);
+
     // partition into non-overlapping windows with padding if needed
     // example:
     // a: 768 64 64 1
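
ggml_ssm_conv and ggml_ssm_scan are the ops behind the new GGML_OP_SSM_CONV and GGML_OP_SSM_SCAN entries, added for Mamba-style state-space models. The header fixes only the argument order, not the tensor shapes, so the sketch below shows just the call pattern: every tensor is assumed to already exist with whatever shape llama.cpp's Mamba implementation expects, and the wiring is illustrative rather than the actual model graph.

    #include "ggml.h"

    // Hedged sketch: record the two SSM ops in a graph. Parameter names mirror
    // the header; shapes and the real data flow between the two ops are not
    // restated here and follow llama.cpp's Mamba implementation.
    static void record_ssm_ops(struct ggml_context * ctx,
                               struct ggml_tensor * s,  struct ggml_tensor * x,
                               struct ggml_tensor * c,  struct ggml_tensor * sq,
                               struct ggml_tensor * dt, struct ggml_tensor * A,
                               struct ggml_tensor * B,  struct ggml_tensor * C) {
        struct ggml_tensor * conv_out = ggml_ssm_conv(ctx, s, x, c, sq);
        struct ggml_tensor * scan_out = ggml_ssm_scan(ctx, s, x, dt, A, B, C, sq);
        (void) conv_out;
        (void) scan_out;
    }
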
@@ -1923,12 +1971,11 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
+    GGML_API struct ggml_cplan ggml_graph_plan         (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API enum ggml_status  ggml_graph_compute      (      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
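
ggml_graph_compute and ggml_graph_compute_with_ctx now report an enum ggml_status instead of int/void, so 0.14.x callers can check whether evaluation succeeded. A sketch, assuming the context and graph were built elsewhere and a thread count chosen only for illustration:

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml.h"

    // Sketch: run a prepared graph and check the new status return value.
    static bool compute_graph(struct ggml_context * ctx, struct ggml_cgraph * gf) {
        enum ggml_status status = ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);
        if (status != GGML_STATUS_SUCCESS) {
            fprintf(stderr, "graph compute failed: %s\n", ggml_status_to_string(status));
            return false;
        }
        return true;
    }
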
@@ -2149,25 +2196,18 @@ extern "C" {
     GGML_API void ggml_quantize_init(enum ggml_type type);
     GGML_API void ggml_quantize_free(void);
 
-    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
-    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-
     // some quantization type cannot be used without an importance matrix
     GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
 
     // calls ggml_quantize_init internally (i.e. can allocate memory)
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
-        int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+    GGML_API size_t ggml_quantize_chunk(
+            enum ggml_type   type,
+            const float    * src,
+            void           * dst,
+            int              start,
+            int              nrows,
+            int              n_per_row,
+            const float    * imatrix);
 
     //
     // gguf
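
On the quantization side, the per-type ggml_quantize_q* helpers and the int64_t * hist histogram argument are gone; ggml_quantize_chunk is now the single entry point. The sketch below quantizes an F32 matrix to Q8_0 under the new signature, sizing the output with ggml_row_size from the same header. The values are illustrative, n_per_row is assumed to be a multiple of the Q8_0 block size, and imatrix is NULL because Q8_0 does not require an importance matrix (see ggml_quantize_requires_imatrix).

    #include <stdlib.h>
    #include "ggml.h"

    // Sketch: quantize nrows rows of n_per_row floats to Q8_0 in one chunk.
    // Caller owns the returned buffer; returns NULL on allocation failure.
    static void * quantize_to_q8_0(const float * src, int nrows, int n_per_row) {
        size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
        void * dst = malloc(row_size * (size_t) nrows);
        if (dst == NULL) {
            return NULL;
        }
        // start = 0: quantize everything in a single chunk; no histogram argument anymore
        ggml_quantize_chunk(GGML_TYPE_Q8_0, src, dst, /*start=*/0, nrows, n_per_row, /*imatrix=*/NULL);
        return dst;
    }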