llama_cpp 0.12.6 → 0.12.7

@@ -315,13 +315,7 @@
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-typedef __fp16 ggml_fp16_t;
-#else
 typedef uint16_t ggml_fp16_t;
-#endif
 
     // convert FP16 <-> FP32
     GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
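With this change, ggml_fp16_t is a plain uint16_t bit pattern on every platform; callers can no longer rely on the implicit float conversions of the old __fp16/half typedefs and must go through the conversion helpers. A minimal round-trip sketch, assuming the counterpart ggml_fp32_to_fp16 declared alongside ggml_fp16_to_fp32 in the same header:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // ggml_fp16_t is now an opaque 16-bit payload on every platform:
        // convert explicitly instead of casting or assigning through floats.
        ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f);
        float back   = ggml_fp16_to_fp32(h);
        printf("bits=0x%04x value=%f\n", (unsigned) h, back); // fp16-rounded pi
        return 0;
    }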
@@ -354,6 +348,8 @@ extern "C" {
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
+        GGML_TYPE_IQ1_S   = 19,
+        GGML_TYPE_IQ4_NL  = 20,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -391,6 +387,8 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_S   = 18, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_NL  = 19, // except 1d tensors
     };
 
     // available tensor operations:
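The two hunks above add the new IQ1_S and IQ4_NL quantization formats in parallel: each gets a tensor type and a matching "mostly" file type. A hypothetical helper (not part of ggml.h) illustrating the pairing:

    #include "ggml.h"

    // Hypothetical helper: the new tensor types and file types are
    // numbered in parallel, so mapping one to the other is a simple switch.
    static enum ggml_ftype iq_ftype_for_type(enum ggml_type t) {
        switch (t) {
            case GGML_TYPE_IQ1_S:  return GGML_FTYPE_MOSTLY_IQ1_S;
            case GGML_TYPE_IQ4_NL: return GGML_FTYPE_MOSTLY_IQ4_NL;
            default:               return GGML_FTYPE_UNKNOWN;
        }
    }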
@@ -658,6 +656,16 @@ extern "C" {
         void * wdata;
     };
 
+    // numa strategies
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT
+    };
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -668,7 +676,7 @@ extern "C" {
 
     GGML_API void ggml_print_backtrace(void);
 
-    GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
+    GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
     GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
     GGML_API void ggml_print_object (const struct ggml_object * obj);
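This is a breaking signature change: ggml_numa_init now requires one of the ggml_numa_strategy values introduced above instead of taking no arguments. A hedged usage sketch; the per-strategy semantics are inferred from the enum names, with GGML_NUMA_STRATEGY_DISABLED appearing to reproduce the old opt-out behaviour:

    #include "ggml.h"

    int main(void) {
        // Before 0.12.7: ggml_numa_init();
        // Now a strategy must be passed explicitly. DISTRIBUTE presumably
        // spreads work across NUMA nodes; DISABLED opts out entirely.
        ggml_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

        if (ggml_is_numa()) {
            // init detected more than one NUMA node
        }
        return 0;
    }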
@@ -1373,13 +1381,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask)
+    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
     // mask is optional
+    // pos is required when max_bias > 0.0f
+    // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            float                 scale);
+            struct ggml_tensor  * pos,
+            float                 scale,
+            float                 max_bias);
 
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
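Existing ggml_soft_max_ext call sites need two extra arguments. Per the updated comment, passing pos = NULL and max_bias = 0.0f keeps the previous behaviour (no ALiBi term). A minimal sketch, with an illustrative 1/sqrt(d) attention scale:

    #include <math.h>
    #include "ggml.h"

    // Illustrative wrapper: scaled, optionally masked softmax with ALiBi
    // disabled, matching the pre-0.12.7 semantics of ggml_soft_max_ext.
    static struct ggml_tensor * softmax_no_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * scores, // e.g. attention logits
            struct ggml_tensor  * mask,   // optional, may be NULL
            int                   n_embd_head) {
        const float scale = 1.0f / sqrtf((float) n_embd_head);
        return ggml_soft_max_ext(ctx, scores, mask, /*pos=*/NULL, scale, /*max_bias=*/0.0f);
    }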
@@ -1481,12 +1493,13 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_alibi(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
             int                   n_head,
-            float                 bias_max);
+            float                 bias_max),
+        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
 
     // clamp
     // in-place, returns view(a)
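The deprecation message spells out the migration path: replace ggml_alibi with the fused ggml_soft_max_ext, passing a positions tensor and a non-zero max_bias. A hedged sketch; the exact shape and layout required for pos is an assumption here, so consult the ggml sources:

    #include "ggml.h"

    // Hedged migration sketch for the deprecated ggml_alibi: the ALiBi
    // bias is now folded into the fused softmax. "pos" holds the token
    // positions (its required shape is an assumption -- see ggml sources),
    // and max_bias > 0.0f enables the per-head slopes.
    static struct ggml_tensor * softmax_with_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * scores,
            struct ggml_tensor  * mask, // optional, may be NULL
            struct ggml_tensor  * pos,  // required because max_bias > 0.0f
            float                 scale,
            float                 max_bias) {
        return ggml_soft_max_ext(ctx, scores, mask, pos, scale, max_bias);
    }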