llama_cpp 0.12.6 → 0.13.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -315,13 +315,7 @@
315
315
  extern "C" {
316
316
  #endif
317
317
 
318
- #if defined(__ARM_NEON) && defined(__CUDACC__)
319
- typedef half ggml_fp16_t;
320
- #elif defined(__ARM_NEON) && !defined(_MSC_VER)
321
- typedef __fp16 ggml_fp16_t;
322
- #else
323
318
  typedef uint16_t ggml_fp16_t;
324
- #endif
325
319
 
326
320
  // convert FP16 <-> FP32
327
321
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
@@ -354,6 +348,11 @@ extern "C" {
354
348
  GGML_TYPE_IQ2_XXS = 16,
355
349
  GGML_TYPE_IQ2_XS = 17,
356
350
  GGML_TYPE_IQ3_XXS = 18,
351
+ GGML_TYPE_IQ1_S = 19,
352
+ GGML_TYPE_IQ4_NL = 20,
353
+ GGML_TYPE_IQ3_S = 21,
354
+ GGML_TYPE_IQ2_S = 22,
355
+ GGML_TYPE_IQ4_XS = 23,
357
356
  GGML_TYPE_I8,
358
357
  GGML_TYPE_I16,
359
358
  GGML_TYPE_I32,
@@ -367,9 +366,9 @@ extern "C" {
367
366
  };
368
367
 
369
368
  enum ggml_backend_type {
370
- GGML_BACKEND_CPU = 0,
371
- GGML_BACKEND_GPU = 10,
372
- GGML_BACKEND_GPU_SPLIT = 20,
369
+ GGML_BACKEND_TYPE_CPU = 0,
370
+ GGML_BACKEND_TYPE_GPU = 10,
371
+ GGML_BACKEND_TYPE_GPU_SPLIT = 20,
373
372
  };
374
373
 
375
374
  // model file types
@@ -391,6 +390,11 @@ extern "C" {
391
390
  GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
392
391
  GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
393
392
  GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
393
+ GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
394
+ GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
395
+ GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
396
+ GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
397
+ GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
394
398
  };
395
399
 
396
400
  // available tensor operations:
@@ -498,9 +502,9 @@ extern "C" {
498
502
  };
499
503
 
500
504
  enum ggml_object_type {
501
- GGML_OBJECT_TENSOR,
502
- GGML_OBJECT_GRAPH,
503
- GGML_OBJECT_WORK_BUFFER
505
+ GGML_OBJECT_TYPE_TENSOR,
506
+ GGML_OBJECT_TYPE_GRAPH,
507
+ GGML_OBJECT_TYPE_WORK_BUFFER
504
508
  };
505
509
 
506
510
  enum ggml_log_level {
@@ -642,9 +646,9 @@ extern "C" {
642
646
  // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
643
647
  // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
644
648
  enum ggml_task_type {
645
- GGML_TASK_INIT = 0,
646
- GGML_TASK_COMPUTE,
647
- GGML_TASK_FINALIZE,
649
+ GGML_TASK_TYPE_INIT = 0,
650
+ GGML_TASK_TYPE_COMPUTE,
651
+ GGML_TASK_TYPE_FINALIZE,
648
652
  };
649
653
 
650
654
  struct ggml_compute_params {
@@ -658,6 +662,26 @@ extern "C" {
658
662
  void * wdata;
659
663
  };
660
664
 
665
+ // numa strategies
666
+ enum ggml_numa_strategy {
667
+ GGML_NUMA_STRATEGY_DISABLED = 0,
668
+ GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
669
+ GGML_NUMA_STRATEGY_ISOLATE = 2,
670
+ GGML_NUMA_STRATEGY_NUMACTL = 3,
671
+ GGML_NUMA_STRATEGY_MIRROR = 4,
672
+ GGML_NUMA_STRATEGY_COUNT
673
+ };
674
+
675
+ //
676
+ // GUID
677
+ //
678
+
679
+ // GUID types
680
+ typedef uint8_t ggml_guid[16];
681
+ typedef ggml_guid * ggml_guid_t;
682
+
683
+ GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
684
+
661
685
  // misc
662
686
 
663
687
  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -668,7 +692,7 @@ extern "C" {
668
692
 
669
693
  GGML_API void ggml_print_backtrace(void);
670
694
 
671
- GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
695
+ GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
672
696
  GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
673
697
 
674
698
  GGML_API void ggml_print_object (const struct ggml_object * obj);
@@ -1373,13 +1397,17 @@ extern "C" {
1373
1397
  struct ggml_context * ctx,
1374
1398
  struct ggml_tensor * a);
1375
1399
 
1376
- // fused soft_max(a*scale + mask)
1400
+ // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
1377
1401
  // mask is optional
1402
+ // pos is required when max_bias > 0.0f
1403
+ // max_bias = 0.0f for no ALiBi
1378
1404
  GGML_API struct ggml_tensor * ggml_soft_max_ext(
1379
1405
  struct ggml_context * ctx,
1380
1406
  struct ggml_tensor * a,
1381
1407
  struct ggml_tensor * mask,
1382
- float scale);
1408
+ struct ggml_tensor * pos,
1409
+ float scale,
1410
+ float max_bias);
1383
1411
 
1384
1412
  GGML_API struct ggml_tensor * ggml_soft_max_back(
1385
1413
  struct ggml_context * ctx,
@@ -1481,12 +1509,13 @@ extern "C" {
1481
1509
 
1482
1510
  // alibi position embedding
1483
1511
  // in-place, returns view(a)
1484
- GGML_API struct ggml_tensor * ggml_alibi(
1512
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
1485
1513
  struct ggml_context * ctx,
1486
1514
  struct ggml_tensor * a,
1487
1515
  int n_past,
1488
1516
  int n_head,
1489
- float bias_max);
1517
+ float bias_max),
1518
+ "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
1490
1519
 
1491
1520
  // clamp
1492
1521
  // in-place, returns view(a)
@@ -1634,8 +1663,8 @@ extern "C" {
1634
1663
 
1635
1664
  // sort rows
1636
1665
  enum ggml_sort_order {
1637
- GGML_SORT_ASC,
1638
- GGML_SORT_DESC,
1666
+ GGML_SORT_ORDER_ASC,
1667
+ GGML_SORT_ORDER_DESC,
1639
1668
  };
1640
1669
 
1641
1670
  GGML_API struct ggml_tensor * ggml_argsort(
@@ -1928,8 +1957,8 @@ extern "C" {
1928
1957
 
1929
1958
  // optimization methods
1930
1959
  enum ggml_opt_type {
1931
- GGML_OPT_ADAM,
1932
- GGML_OPT_LBFGS,
1960
+ GGML_OPT_TYPE_ADAM,
1961
+ GGML_OPT_TYPE_LBFGS,
1933
1962
  };
1934
1963
 
1935
1964
  // linesearch methods
@@ -1943,12 +1972,12 @@ extern "C" {
1943
1972
 
1944
1973
  // optimization return values
1945
1974
  enum ggml_opt_result {
1946
- GGML_OPT_OK = 0,
1947
- GGML_OPT_DID_NOT_CONVERGE,
1948
- GGML_OPT_NO_CONTEXT,
1949
- GGML_OPT_INVALID_WOLFE,
1950
- GGML_OPT_FAIL,
1951
- GGML_OPT_CANCEL,
1975
+ GGML_OPT_RESULT_OK = 0,
1976
+ GGML_OPT_RESULT_DID_NOT_CONVERGE,
1977
+ GGML_OPT_RESULT_NO_CONTEXT,
1978
+ GGML_OPT_RESULT_INVALID_WOLFE,
1979
+ GGML_OPT_RESULT_FAIL,
1980
+ GGML_OPT_RESULT_CANCEL,
1952
1981
 
1953
1982
  GGML_LINESEARCH_FAIL = -128,
1954
1983
  GGML_LINESEARCH_MINIMUM_STEP,