llama_cpp 0.15.0 → 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -326,14 +326,20 @@ extern "C" {
     // get ggml_status name string
     GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
 
+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
-
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
     struct ggml_context;
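
Note: ggml_bf16_t stores the high 16 bits of an IEEE-754 binary32 value, which is why the header comment hints that the bf16 -> fp32 direction could be a plain 16-bit shift. A minimal stand-alone sketch of that layout, for illustration only (the example_* names are not part of ggml, and this is not the library's actual implementation):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint16_t bits; } example_bf16_t;   // mirrors the ggml_bf16_t layout

    static float example_bf16_to_fp32(example_bf16_t h) {
        uint32_t u = (uint32_t) h.bits << 16;            // the discarded low mantissa bits come back as zero
        float f;
        memcpy(&f, &u, sizeof f);                        // bit-cast without strict-aliasing issues
        return f;
    }

    static example_bf16_t example_fp32_to_bf16(float f) {
        uint32_t u;
        memcpy(&u, &f, sizeof u);
        u += 0x7FFF + ((u >> 16) & 1);                   // round-to-nearest-even; NaN handling omitted
        return (example_bf16_t) { (uint16_t) (u >> 16) };
    }
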
@@ -370,6 +376,7 @@ extern "C" {
         GGML_TYPE_I64   = 27,
         GGML_TYPE_F64   = 28,
         GGML_TYPE_IQ1_M = 29,
+        GGML_TYPE_BF16  = 30,
         GGML_TYPE_COUNT,
     };
 
@@ -410,6 +417,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S  = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
     };
 
     // available tensor operations:
@@ -460,7 +468,6 @@ extern "C" {
         GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
@@ -512,6 +519,7 @@ extern "C" {
         GGML_UNARY_OP_TANH,
         GGML_UNARY_OP_ELU,
         GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_SIGMOID,
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
@@ -557,7 +565,8 @@ extern "C" {
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
-        enum ggml_backend_type backend;
+
+        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
 
         struct ggml_backend_buffer * buffer;
 
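
The deprecation message points at the tensor's buffer as the way to answer "where does this tensor live?". A hedged sketch of that pattern, assuming the ggml-backend helpers ggml_backend_buffer_name and ggml_backend_buffer_is_host from ggml-backend.h:

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Report a tensor's storage location via its buffer instead of the deprecated t->backend field.
    static void print_tensor_location(const struct ggml_tensor * t) {
        if (t->buffer == NULL) {
            printf("%s: no backend buffer assigned\n", t->name);
        } else {
            printf("%s: buffer '%s' (%s memory)\n", t->name,
                   ggml_backend_buffer_name(t->buffer),
                   ggml_backend_buffer_is_host(t->buffer) ? "host" : "device");
        }
    }
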
@@ -758,7 +767,8 @@ extern "C" {
     GGML_API bool ggml_is_3d  (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
 
-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
@@ -1066,6 +1076,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_sigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
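
A minimal sketch of the new op in a compute graph, assuming the usual ggml context and graph helpers (ggml_init, ggml_new_tensor_1d, ggml_new_graph, ggml_build_forward_expand, ggml_graph_compute_with_ctx); the buffer size and thread count are illustrative:

    #include "ggml.h"

    // Build and evaluate y = sigmoid(x) for a small F32 tensor.
    static void sigmoid_demo(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        // (filling x->data with input values is omitted for brevity)
        struct ggml_tensor * y = ggml_sigmoid(ctx, x);   // ggml_sigmoid_inplace(ctx, x) would return a view instead

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

        ggml_free(ctx);
    }
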
@@ -1420,15 +1438,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+    // fused soft_max(a*scale + mask*(ALiBi slope))
     // mask is optional
-    // pos is required when max_bias > 0.0f
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * mask,
-            struct ggml_tensor  * pos,
             float                 scale,
             float                 max_bias);
 
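
Call sites simply drop the pos argument; when max_bias > 0.0f the ALiBi slopes are now derived from the mask. An illustrative fragment (kq, kq_mask, and n_embd_head are assumed to exist at the call site, and sqrtf comes from <math.h>):

    // 0.15.0: ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
    // 0.15.2: the pos tensor is gone.
    struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx, kq, kq_mask,
            1.0f/sqrtf((float) n_embd_head),   // scale
            0.0f);                             // max_bias: 0.0f disables ALiBi
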
@@ -1530,16 +1546,6 @@ extern "C" {
             float                 xpos_base,
             bool                  xpos_down);
 
-    // alibi position embedding
-    // in-place, returns view(a)
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past,
-            int                   n_head,
-            float                 bias_max),
-        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
-
     // clamp
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_clamp(
@@ -1669,12 +1675,24 @@ extern "C" {
             float                 p1);
 
     // nearest interpolate
+    // multiplies ne0 and ne1 by scale factor
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             int                   scale_factor);
 
+    // nearest interpolate
+    // nearest interpolate to specified dimensions
+    // used in tortoise.cpp
+    GGML_API struct ggml_tensor * ggml_upscale_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   ne0,
+            int                   ne1,
+            int                   ne2,
+            int                   ne3);
+
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
             struct ggml_context * ctx,
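
ggml_upscale keeps its single scale factor, while ggml_upscale_ext takes explicit target sizes for all four dimensions. An illustrative fragment (a is assumed to be an existing tensor; the casts are needed because ne[] is int64_t while the new API takes int), which per the comments above should behave like ggml_upscale(ctx, a, 2):

    struct ggml_tensor * up = ggml_upscale_ext(ctx, a,
            (int) (2*a->ne[0]),   // ne0: width  * 2
            (int) (2*a->ne[1]),   // ne1: height * 2
            (int)    a->ne[2],    // ne2: unchanged
            (int)    a->ne[3]);   // ne3: unchanged
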
@@ -1736,7 +1754,8 @@ extern "C" {
             struct ggml_tensor  * k,
             struct ggml_tensor  * v,
             struct ggml_tensor  * mask,
-            float                 scale);
+            float                 scale,
+            float                 max_bias);
 
     GGML_API void ggml_flash_attn_ext_set_prec(
             struct ggml_tensor  * a,
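
ggml_flash_attn_ext gains the same max_bias parameter as ggml_soft_max_ext, so ALiBi-style attention can use the fused path. An illustrative fragment (q, k, v, kq_mask, and n_embd_head are assumed to be set up elsewhere; GGML_PREC_F32 is ggml's existing precision enum value):

    struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask,
            1.0f/sqrtf((float) n_embd_head),   // scale
            0.0f);                             // max_bias: non-zero enables ALiBi slopes

    ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);   // optionally force F32 accumulation
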