llama_cpp 0.13.0 → 0.14.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -315,6 +315,16 @@
315
315
  extern "C" {
316
316
  #endif
317
317
 
318
+ enum ggml_status {
319
+ GGML_STATUS_ALLOC_FAILED = -2,
320
+ GGML_STATUS_FAILED = -1,
321
+ GGML_STATUS_SUCCESS = 0,
322
+ GGML_STATUS_ABORTED = 1,
323
+ };
324
+
325
+ // get ggml_status name string
326
+ GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
327
+
318
328
  typedef uint16_t ggml_fp16_t;
319
329
 
320
330
  // convert FP16 <-> FP32
@@ -327,24 +337,24 @@ extern "C" {
327
337
  struct ggml_object;
328
338
  struct ggml_context;
329
339
 
340
+ // NOTE: always add types at the end of the enum to keep backward compatibility
330
341
  enum ggml_type {
331
- GGML_TYPE_F32 = 0,
332
- GGML_TYPE_F16 = 1,
333
- GGML_TYPE_Q4_0 = 2,
334
- GGML_TYPE_Q4_1 = 3,
342
+ GGML_TYPE_F32 = 0,
343
+ GGML_TYPE_F16 = 1,
344
+ GGML_TYPE_Q4_0 = 2,
345
+ GGML_TYPE_Q4_1 = 3,
335
346
  // GGML_TYPE_Q4_2 = 4, support has been removed
336
- // GGML_TYPE_Q4_3 (5) support has been removed
337
- GGML_TYPE_Q5_0 = 6,
338
- GGML_TYPE_Q5_1 = 7,
339
- GGML_TYPE_Q8_0 = 8,
340
- GGML_TYPE_Q8_1 = 9,
341
- // k-quantizations
342
- GGML_TYPE_Q2_K = 10,
343
- GGML_TYPE_Q3_K = 11,
344
- GGML_TYPE_Q4_K = 12,
345
- GGML_TYPE_Q5_K = 13,
346
- GGML_TYPE_Q6_K = 14,
347
- GGML_TYPE_Q8_K = 15,
347
+ // GGML_TYPE_Q4_3 = 5, support has been removed
348
+ GGML_TYPE_Q5_0 = 6,
349
+ GGML_TYPE_Q5_1 = 7,
350
+ GGML_TYPE_Q8_0 = 8,
351
+ GGML_TYPE_Q8_1 = 9,
352
+ GGML_TYPE_Q2_K = 10,
353
+ GGML_TYPE_Q3_K = 11,
354
+ GGML_TYPE_Q4_K = 12,
355
+ GGML_TYPE_Q5_K = 13,
356
+ GGML_TYPE_Q6_K = 14,
357
+ GGML_TYPE_Q8_K = 15,
348
358
  GGML_TYPE_IQ2_XXS = 16,
349
359
  GGML_TYPE_IQ2_XS = 17,
350
360
  GGML_TYPE_IQ3_XXS = 18,
@@ -353,9 +363,11 @@ extern "C" {
353
363
  GGML_TYPE_IQ3_S = 21,
354
364
  GGML_TYPE_IQ2_S = 22,
355
365
  GGML_TYPE_IQ4_XS = 23,
356
- GGML_TYPE_I8,
357
- GGML_TYPE_I16,
358
- GGML_TYPE_I32,
366
+ GGML_TYPE_I8 = 24,
367
+ GGML_TYPE_I16 = 25,
368
+ GGML_TYPE_I32 = 26,
369
+ GGML_TYPE_I64 = 27,
370
+ GGML_TYPE_F64 = 28,
359
371
  GGML_TYPE_COUNT,
360
372
  };
361
373
 
@@ -373,20 +385,20 @@ extern "C" {
373
385
 
374
386
  // model file types
375
387
  enum ggml_ftype {
376
- GGML_FTYPE_UNKNOWN = -1,
377
- GGML_FTYPE_ALL_F32 = 0,
378
- GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
379
- GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
380
- GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
388
+ GGML_FTYPE_UNKNOWN = -1,
389
+ GGML_FTYPE_ALL_F32 = 0,
390
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
391
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
392
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
381
393
  GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
382
- GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
383
- GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
384
- GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
385
- GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
386
- GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
387
- GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
388
- GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
389
- GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
394
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
395
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
396
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
397
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
398
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
399
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
400
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
401
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
390
402
  GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
391
403
  GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
392
404
  GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
@@ -454,12 +466,16 @@ extern "C" {
454
466
  GGML_OP_POOL_2D,
455
467
  GGML_OP_UPSCALE, // nearest interpolate
456
468
  GGML_OP_PAD,
469
+ GGML_OP_ARANGE,
470
+ GGML_OP_TIMESTEP_EMBEDDING,
457
471
  GGML_OP_ARGSORT,
458
472
  GGML_OP_LEAKY_RELU,
459
473
 
460
474
  GGML_OP_FLASH_ATTN,
461
475
  GGML_OP_FLASH_FF,
462
476
  GGML_OP_FLASH_ATTN_BACK,
477
+ GGML_OP_SSM_CONV,
478
+ GGML_OP_SSM_SCAN,
463
479
  GGML_OP_WIN_PART,
464
480
  GGML_OP_WIN_UNPART,
465
481
  GGML_OP_GET_REL_POS,
@@ -1661,6 +1677,15 @@ extern "C" {
1661
1677
  int p2,
1662
1678
  int p3);
1663
1679
 
1680
+ // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1681
+ // timesteps: [N,]
1682
+ // return: [N, dim]
1683
+ GGML_API struct ggml_tensor * ggml_timestep_embedding(
1684
+ struct ggml_context * ctx,
1685
+ struct ggml_tensor * timesteps,
1686
+ int dim,
1687
+ int max_period);
1688
+
1664
1689
  // sort rows
1665
1690
  enum ggml_sort_order {
1666
1691
  GGML_SORT_ORDER_ASC,
@@ -1672,6 +1697,12 @@ extern "C" {
1672
1697
  struct ggml_tensor * a,
1673
1698
  enum ggml_sort_order order);
1674
1699
 
1700
+ GGML_API struct ggml_tensor * ggml_arange(
1701
+ struct ggml_context * ctx,
1702
+ float start,
1703
+ float stop,
1704
+ float step);
1705
+
1675
1706
  // top k elements per row
1676
1707
  GGML_API struct ggml_tensor * ggml_top_k(
1677
1708
  struct ggml_context * ctx,
@@ -1701,6 +1732,23 @@ extern "C" {
1701
1732
  struct ggml_tensor * c0,
1702
1733
  struct ggml_tensor * c1);
1703
1734
 
1735
+ GGML_API struct ggml_tensor * ggml_ssm_conv(
1736
+ struct ggml_context * ctx,
1737
+ struct ggml_tensor * s,
1738
+ struct ggml_tensor * x,
1739
+ struct ggml_tensor * c,
1740
+ struct ggml_tensor * sq);
1741
+
1742
+ GGML_API struct ggml_tensor * ggml_ssm_scan(
1743
+ struct ggml_context * ctx,
1744
+ struct ggml_tensor * s,
1745
+ struct ggml_tensor * x,
1746
+ struct ggml_tensor * dt,
1747
+ struct ggml_tensor * A,
1748
+ struct ggml_tensor * B,
1749
+ struct ggml_tensor * C,
1750
+ struct ggml_tensor * sq);
1751
+
1704
1752
  // partition into non-overlapping windows with padding if needed
1705
1753
  // example:
1706
1754
  // a: 768 64 64 1
@@ -1923,12 +1971,11 @@ extern "C" {
1923
1971
 
1924
1972
  // ggml_graph_plan() has to be called before ggml_graph_compute()
1925
1973
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1926
- GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1927
- GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1928
-
1974
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1975
+ GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1929
1976
  // same as ggml_graph_compute() but the work data is allocated as a part of the context
1930
1977
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
1931
- GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
1978
+ GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
1932
1979
 
1933
1980
  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1934
1981
 
@@ -2149,25 +2196,18 @@ extern "C" {
2149
2196
  GGML_API void ggml_quantize_init(enum ggml_type type);
2150
2197
  GGML_API void ggml_quantize_free(void);
2151
2198
 
2152
- // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
2153
- GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
2154
- GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
2155
- GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
2156
- GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
2157
- GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
2158
-
2159
- GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
2160
- GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
2161
- GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
2162
- GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2163
- GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2164
-
2165
2199
  // some quantization type cannot be used without an importance matrix
2166
2200
  GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2167
2201
 
2168
2202
  // calls ggml_quantize_init internally (i.e. can allocate memory)
2169
- GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
2170
- int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
2203
+ GGML_API size_t ggml_quantize_chunk(
2204
+ enum ggml_type type,
2205
+ const float * src,
2206
+ void * dst,
2207
+ int start,
2208
+ int nrows,
2209
+ int n_per_row,
2210
+ const float * imatrix);
2171
2211
 
2172
2212
  //
2173
2213
  // gguf