llama_cpp 0.12.5 → 0.12.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -315,13 +315,7 @@
  extern "C" {
  #endif

- #if defined(__ARM_NEON) && defined(__CUDACC__)
- typedef half ggml_fp16_t;
- #elif defined(__ARM_NEON) && !defined(_MSC_VER)
- typedef __fp16 ggml_fp16_t;
- #else
  typedef uint16_t ggml_fp16_t;
- #endif

  // convert FP16 <-> FP32
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
@@ -354,6 +348,8 @@ extern "C" {
  GGML_TYPE_IQ2_XXS = 16,
  GGML_TYPE_IQ2_XS = 17,
  GGML_TYPE_IQ3_XXS = 18,
+ GGML_TYPE_IQ1_S = 19,
+ GGML_TYPE_IQ4_NL = 20,
  GGML_TYPE_I8,
  GGML_TYPE_I16,
  GGML_TYPE_I32,
@@ -391,6 +387,8 @@ extern "C" {
  GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
  };

  // available tensor operations:
@@ -505,11 +503,17 @@ extern "C" {

  enum ggml_log_level {
  GGML_LOG_LEVEL_ERROR = 2,
- GGML_LOG_LEVEL_WARN = 3,
- GGML_LOG_LEVEL_INFO = 4,
+ GGML_LOG_LEVEL_WARN = 3,
+ GGML_LOG_LEVEL_INFO = 4,
  GGML_LOG_LEVEL_DEBUG = 5
  };

+ enum ggml_tensor_flag {
+ GGML_TENSOR_FLAG_INPUT = 1,
+ GGML_TENSOR_FLAG_OUTPUT = 2,
+ GGML_TENSOR_FLAG_PARAM = 4,
+ };
+
  // ggml object
  struct ggml_object {
  size_t offs;
@@ -543,7 +547,7 @@ extern "C" {
  // op params - allocated as int32_t for alignment
  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

- bool is_param;
+ int32_t flags;

  struct ggml_tensor * grad;
  struct ggml_tensor * src[GGML_MAX_SRC];
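The per-tensor bool is_param has been folded into an int32_t flags bit field, whose bits are the ggml_tensor_flag values added in the previous hunk; code that read is_param now tests GGML_TENSOR_FLAG_PARAM. A minimal sketch of that check (the helper name is illustrative):

    #include <stdbool.h>
    #include "ggml.h"

    // Was: t->is_param; the same information now lives in the flags bit field.
    static bool tensor_is_param(const struct ggml_tensor * t) {
        return (t->flags & GGML_TENSOR_FLAG_PARAM) != 0;
    }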
@@ -567,6 +571,11 @@ extern "C" {

  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+ // Abort callback
+ // If not NULL, called before ggml computation
+ // If it returns true, the computation is aborted
+ typedef bool (*ggml_abort_callback)(void * data);
+
  // the compute plan that needs to be prepared for ggml_graph_compute()
  // since https://github.com/ggerganov/ggml/issues/287
  struct ggml_cplan {
@@ -576,8 +585,8 @@ extern "C" {
  int n_threads;

  // abort ggml_graph_compute when true
- bool (*abort_callback)(void * data);
- void * abort_callback_data;
+ ggml_abort_callback abort_callback;
+ void * abort_callback_data;
  };

  enum ggml_cgraph_eval_order {
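The abort-callback members of ggml_cplan now use the named ggml_abort_callback typedef introduced above; the semantics are unchanged. A hedged sketch of wiring a callback into a plan (the flag, helper names, and thread count are illustrative, not part of the package):

    #include <stdbool.h>
    #include <stdlib.h>
    #include "ggml.h"

    // Illustrative flag polled by the callback; flip it from another thread to stop the compute.
    static volatile bool g_should_abort = false;

    // Matches the new ggml_abort_callback typedef: returning true aborts ggml_graph_compute.
    static bool my_abort_cb(void * data) {
        (void) data;
        return g_should_abort;
    }

    void run_graph(struct ggml_cgraph * gf) {
        struct ggml_cplan plan = ggml_graph_plan(gf, /*n_threads=*/4);
        if (plan.work_size > 0) {
            plan.work_data = malloc(plan.work_size);  // caller owns the work buffer
        }
        plan.abort_callback      = my_abort_cb;       // was bool (*)(void *), now ggml_abort_callback
        plan.abort_callback_data = NULL;
        ggml_graph_compute(gf, &plan);
        free(plan.work_data);
    }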
@@ -647,6 +656,16 @@ extern "C" {
  void * wdata;
  };

+ // numa strategies
+ enum ggml_numa_strategy {
+ GGML_NUMA_STRATEGY_DISABLED = 0,
+ GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+ GGML_NUMA_STRATEGY_ISOLATE = 2,
+ GGML_NUMA_STRATEGY_NUMACTL = 3,
+ GGML_NUMA_STRATEGY_MIRROR = 4,
+ GGML_NUMA_STRATEGY_COUNT
+ };
+
  // misc

  GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -657,7 +676,7 @@ extern "C" {

  GGML_API void ggml_print_backtrace(void);

- GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
+ GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
  GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

  GGML_API void ggml_print_object (const struct ggml_object * obj);
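ggml_numa_init now takes one of the ggml_numa_strategy values added in the previous hunk instead of no arguments, so existing callers have to pick a strategy explicitly. A minimal sketch of the updated call; GGML_NUMA_STRATEGY_DISABLED opts out of NUMA handling:

    #include "ggml.h"

    int main(void) {
        // Spread work across NUMA nodes; pass GGML_NUMA_STRATEGY_DISABLED to disable NUMA handling.
        ggml_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

        if (ggml_is_numa()) {
            // more than one NUMA node was detected at init time
        }
        return 0;
    }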
@@ -1362,13 +1381,17 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

- // fused soft_max(a*scale + mask)
+ // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
  // mask is optional
+ // pos is required when max_bias > 0.0f
+ // max_bias = 0.0f for no ALiBi
  GGML_API struct ggml_tensor * ggml_soft_max_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * mask,
- float scale);
+ struct ggml_tensor * pos,
+ float scale,
+ float max_bias);

  GGML_API struct ggml_tensor * ggml_soft_max_back(
  struct ggml_context * ctx,
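ggml_soft_max_ext gained a pos tensor and a max_bias argument so the ALiBi bias can be fused into the soft-max, replacing separate ggml_alibi calls. A hedged sketch of a call with ALiBi enabled; the helper, tensor names, and the max_bias value are illustrative:

    #include <math.h>
    #include "ggml.h"

    // Illustrative helper: fused scaled + masked + ALiBi-biased soft-max over attention scores.
    static struct ggml_tensor * softmax_with_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * attn,   // attention scores (F32)
            struct ggml_tensor  * mask,   // optional mask, may be NULL
            struct ggml_tensor  * pos,    // positions; required here because max_bias > 0.0f
            int                   n_embd_head) {
        return ggml_soft_max_ext(ctx, attn, mask, pos,
                                 1.0f / sqrtf((float) n_embd_head),  // scale
                                 8.0f);                              // max_bias; 0.0f disables ALiBi
    }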
@@ -1470,12 +1493,13 @@ extern "C" {

  // alibi position embedding
  // in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_alibi(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int n_past,
  int n_head,
- float bias_max);
+ float bias_max),
+ "use ggml_soft_max_ext instead (will be removed in Mar 2024)");

  // clamp
  // in-place, returns view(a)
@@ -2087,6 +2111,12 @@ extern "C" {
  ggml_opt_callback callback,
  void * callback_data);

+ //
+ // tensor flags
+ //
+ GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+ GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+
  //
  // quantization
  //
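ggml_set_input and ggml_set_output let callers tag graph tensors with the GGML_TENSOR_FLAG_INPUT/GGML_TENSOR_FLAG_OUTPUT bits added earlier, which graph-allocation code can use as keep-alive hints. A hedged sketch on a tiny mat-mul graph; the helper and tensor names are illustrative:

    #include "ggml.h"

    // Illustrative graph builder: tag the externally filled tensor as input and the result as output.
    static struct ggml_cgraph * build_graph(struct ggml_context * ctx, struct ggml_tensor * w) {
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
        ggml_set_input(x);                 // sets GGML_TENSOR_FLAG_INPUT

        struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);
        ggml_set_output(y);                // sets GGML_TENSOR_FLAG_OUTPUT

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        return gf;
    }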
@@ -2273,6 +2303,7 @@ extern "C" {
  GGML_API int ggml_cpu_has_ssse3 (void);
  GGML_API int ggml_cpu_has_sycl (void);
  GGML_API int ggml_cpu_has_vsx (void);
+ GGML_API int ggml_cpu_has_matmul_int8(void);

  //
  // Internal types and functions exposed for tests and benchmarks
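A new CPU feature probe, ggml_cpu_has_matmul_int8, reports whether the int8 matrix-multiply instructions (ARMv8.6 I8MM) were available at build time, alongside the existing probes. A small sketch printing it next to two of the existing flags:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        printf("AVX2        : %d\n", ggml_cpu_has_avx2());
        printf("NEON        : %d\n", ggml_cpu_has_neon());
        printf("MATMUL_INT8 : %d\n", ggml_cpu_has_matmul_int8());
        return 0;
    }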
@@ -2286,7 +2317,8 @@ extern "C" {
  #endif
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+ typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+ const void * GGML_RESTRICT y, size_t by, int nrc);

  typedef struct {
  const char * type_name;
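ggml_vec_dot_t now carries explicit strides (bs, bx, by) and an nrc count so a single call can produce several dot products. A hedged sketch of a plain F32 function matching the new shape; it is illustrative only, and the stride units chosen here (elements for the result, bytes for the operands) are this sketch's assumption rather than a statement about the package's kernels:

    #include <stddef.h>
    #include "ggml.h"

    // Illustrative F32 dot product in the new ggml_vec_dot_t shape:
    // produces nrc results, stepping s by bs floats and x by bx bytes per result.
    static void vec_dot_f32_example(int n, float * GGML_RESTRICT s, size_t bs,
                                    const void * GGML_RESTRICT vx, size_t bx,
                                    const void * GGML_RESTRICT vy, size_t by,
                                    int nrc) {
        (void) by;  // a single right-hand vector is reused in this sketch
        const float * y = (const float *) vy;
        for (int r = 0; r < nrc; ++r) {
            const float * x = (const float *) ((const char *) vx + r*bx);
            float sum = 0.0f;
            for (int i = 0; i < n; ++i) {
                sum += x[i]*y[i];
            }
            s[r*bs] = sum;
        }
    }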
@@ -2298,6 +2330,7 @@ extern "C" {
  ggml_from_float_t from_float_reference;
  ggml_vec_dot_t vec_dot;
  enum ggml_type vec_dot_type;
+ int64_t nrows; // number of rows to process simultaneously;
  } ggml_type_traits_t;

  GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
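ggml_type_traits_t gained an nrows field describing how many rows the type's vec_dot kernel handles per call (1 for most types). A small sketch reading the traits of one quantized type; the choice of GGML_TYPE_Q4_0 and the output format are illustrative:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        const ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
        printf("type         : %s\n",   traits.type_name);
        printf("vec_dot_type : %d\n",   (int) traits.vec_dot_type);
        printf("nrows        : %lld\n", (long long) traits.nrows);
        return 0;
    }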