llama_cpp 0.12.6 → 0.12.7

@@ -315,13 +315,7 @@
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 ggml_fp16_t;
-#else
     typedef uint16_t ggml_fp16_t;
-#endif
 
     // convert FP16 <-> FP32
     GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
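With the platform-specific typedef ladder removed, ggml_fp16_t is always plain uint16_t storage, so values are meant to pass through the FP16 <-> FP32 converters declared just above. A minimal sketch, assuming only those two helpers (ggml_fp32_to_fp16 is the existing counterpart of ggml_fp16_to_fp32):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f); // pack an FP32 value into 16-bit storage
    float       f = ggml_fp16_to_fp32(h);        // unpack it back to FP32
    printf("round-tripped value: %f\n", f);
    return 0;
}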
@@ -354,6 +348,8 @@ extern "C" {
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
+        GGML_TYPE_IQ1_S   = 19,
+        GGML_TYPE_IQ4_NL  = 20,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -391,6 +387,8 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ1_S   = 18, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_NL  = 19, // except 1d tensors
     };
 
     // available tensor operations:
@@ -658,6 +656,16 @@ extern "C" {
         void * wdata;
     };
 
+    // numa strategies
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT
+    };
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@@ -668,7 +676,7 @@ extern "C" {
 
     GGML_API void ggml_print_backtrace(void);
 
-    GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
+    GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
     GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
     GGML_API void ggml_print_object (const struct ggml_object * obj);
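The two hunks above introduce a NUMA placement strategy and thread it through ggml_numa_init. A minimal sketch of how a caller might opt in, assuming the existing ggml_init/ggml_free context API; only ggml_numa_init's parameter is new in this release:

#include "ggml.h"

int main(void) {
    // spread work across NUMA nodes; GGML_NUMA_STRATEGY_DISABLED keeps the old behaviour
    ggml_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    if (ggml_is_numa()) {
        // more than one NUMA node was detected at init time
    }

    ggml_free(ctx);
    return 0;
}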
@@ -1373,13 +1381,17 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask)
+    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
     // mask is optional
+    // pos is required when max_bias > 0.0f
+    // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            float                 scale);
+            struct ggml_tensor  * pos,
+            float                 scale,
+            float                 max_bias);
 
     GGML_API struct ggml_tensor * ggml_soft_max_back(
             struct ggml_context * ctx,
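ggml_soft_max_ext now folds the optional mask and the ALiBi positional bias into the fused softmax. A sketch of a wrapper around the new signature; the names kq, kq_mask, kq_pos, and n_embd_head are illustrative attention tensors, not part of this diff:

#include <math.h>
#include "ggml.h"

// kq      : [n_kv, n_tokens] attention scores (F32)
// kq_mask : optional additive mask (may be NULL)
// kq_pos  : key positions (F32); required only when max_alibi_bias > 0.0f
static struct ggml_tensor * scaled_masked_softmax(
        struct ggml_context * ctx,
        struct ggml_tensor  * kq,
        struct ggml_tensor  * kq_mask,
        struct ggml_tensor  * kq_pos,
        int                   n_embd_head,
        float                 max_alibi_bias) {
    return ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos,
            1.0f/sqrtf((float) n_embd_head), // scale
            max_alibi_bias);                 // 0.0f disables the ALiBi term
}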
@@ -1481,12 +1493,13 @@ extern "C" {
 
     // alibi position embedding
     // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_alibi(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   n_past,
             int                   n_head,
-            float                 bias_max);
+            float                 bias_max),
+        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
 
     // clamp
     // in-place, returns view(a)
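For code still calling the now-deprecated ggml_alibi, the suggested replacement is the extended softmax above. A rough before/after mapping, with kq, kq_mask, kq_pos, kq_scale, and bias_max as assumed caller-side variables:

// before (deprecated): bias the scores, then softmax separately
// kq = ggml_alibi(ctx, kq, /*n_past=*/0, n_head, bias_max);
// kq = ggml_soft_max(ctx, kq);

// after: the fused softmax applies scale, mask and the ALiBi bias in one op
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, bias_max);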