llama_cpp 0.15.0 → 0.15.2

@@ -326,14 +326,20 @@ extern "C" {
     // get ggml_status name string
     GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
 
+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
-
-    // convert FP16 <-> FP32
-    GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+    GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+    GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
     struct ggml_context;
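
Reviewer note: bfloat16 keeps the sign bit, the full 8-bit exponent, and the top 7 mantissa bits of an IEEE-754 binary32, which is what the "consider just doing << 16" remark alludes to. The sketch below only illustrates that bit-level relationship; the helper names are invented, and it truncates instead of applying whatever rounding ggml's own conversions use.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helpers, not ggml code: show how bfloat16 relates to binary32. */
    typedef struct { uint16_t bits; } bf16_bits_t;

    static bf16_bits_t fp32_to_bf16_trunc(float f) {
        uint32_t u;
        memcpy(&u, &f, sizeof u);                /* type-pun safely via memcpy */
        bf16_bits_t b = { (uint16_t)(u >> 16) }; /* keep sign, exponent, top 7 mantissa bits */
        return b;
    }

    static float bf16_to_fp32_widen(bf16_bits_t b) {
        uint32_t u = (uint32_t)b.bits << 16;     /* low 16 mantissa bits become zero */
        float f;
        memcpy(&f, &u, sizeof f);
        return f;
    }

The struct wrapper mirrors ggml_bf16_t, so the new type cannot be mixed up with ggml_fp16_t, which stays a bare uint16_t (hence the "todo: make this not an integral type" note).
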
@@ -370,6 +376,7 @@ extern "C" {
         GGML_TYPE_I64 = 27,
         GGML_TYPE_F64 = 28,
         GGML_TYPE_IQ1_M = 29,
+        GGML_TYPE_BF16 = 30,
         GGML_TYPE_COUNT,
     };
 
@@ -410,6 +417,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
     };
 
     // available tensor operations:
@@ -460,7 +468,6 @@ extern "C" {
         GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
@@ -512,6 +519,7 @@ extern "C" {
         GGML_UNARY_OP_TANH,
         GGML_UNARY_OP_ELU,
         GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_SIGMOID,
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
@@ -557,7 +565,8 @@ extern "C" {
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
-        enum ggml_backend_type backend;
+
+        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
 
         struct ggml_backend_buffer * buffer;
 
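
Migration note (an assumption based only on the deprecation message above): code that used to branch on tensor->backend should now ask the tensor's buffer where the data lives. A minimal sketch using the ggml-backend API; the exact calls available may differ by release.

    #include "ggml.h"
    #include "ggml-backend.h"
    #include <stdbool.h>

    /* Sketch only: replaces checks in the style of `t->backend == GGML_BACKEND_TYPE_CPU`.
     * Assumes ggml_backend_buffer_is_host() is present in the bundled ggml-backend.h. */
    static bool tensor_data_is_on_host(const struct ggml_tensor * t) {
        return t->buffer == NULL || ggml_backend_buffer_is_host(t->buffer);
    }
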
@@ -758,7 +767,8 @@ extern "C" {
     GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
     GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
 
-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
     // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);
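
A rough reading of the new predicate (the header does not document it, so this is an assumption): two tensors share a stride when their per-dimension byte strides nb[] match, mirroring how ggml_are_same_shape compares the ne[] extents. A hand-rolled equivalent might look like:

    #include "ggml.h"
    #include <stdbool.h>

    /* Hypothetical equivalent of ggml_are_same_stride(), for illustration only:
     * compare the per-dimension byte strides of two tensors. */
    static bool same_stride_sketch(const struct ggml_tensor * t0,
                                   const struct ggml_tensor * t1) {
        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
            if (t0->nb[i] != t1->nb[i]) {
                return false;
            }
        }
        return true;
    }
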
@@ -1066,6 +1076,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_sigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_gelu(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
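
A minimal CPU-only usage sketch for the new sigmoid op, assuming the usual ggml context-and-graph workflow; the tensor size and the 16 MiB arena are arbitrary choices for the example.

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,   // 16 MiB scratch arena (arbitrary)
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(x, 0.0f);                    // sigmoid(0) == 0.5

        struct ggml_tensor * y = ggml_sigmoid(ctx, x);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

        printf("sigmoid(0) = %f\n", ggml_get_f32_1d(y, 0));

        ggml_free(ctx);
        return 0;
    }

Like the other _inplace unary ops, ggml_sigmoid_inplace should return a view that overwrites its input instead of allocating a new tensor.
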
@@ -1420,15 +1438,13 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
-    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+    // fused soft_max(a*scale + mask*(ALiBi slope))
     // mask is optional
-    // pos is required when max_bias > 0.0f
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * mask,
-            struct ggml_tensor * pos,
             float scale,
             float max_bias);
 
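
Call-site impact of the signature change: the pos tensor is gone, and the ALiBi slope is now derived from max_bias and applied through the mask. A hedged sketch of an updated call site; kq, kq_mask, kq_scale, and max_bias are placeholder names, not code from this gem.

    #include "ggml.h"

    /* Sketch of an attention softmax after the signature change. */
    static struct ggml_tensor * attn_softmax(struct ggml_context * ctx,
                                             struct ggml_tensor * kq,
                                             struct ggml_tensor * kq_mask,
                                             float kq_scale,
                                             float max_bias) {
        // 0.15.0 header: ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, max_bias);
        // 0.15.2 header: the pos tensor is dropped; ALiBi comes from max_bias and the mask
        return ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, max_bias);
    }
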
@@ -1530,16 +1546,6 @@ extern "C" {
             float xpos_base,
             bool xpos_down);
 
-    // alibi position embedding
-    // in-place, returns view(a)
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            int n_past,
-            int n_head,
-            float bias_max),
-        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
-
     // clamp
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_clamp(
@@ -1669,12 +1675,24 @@ extern "C" {
             float p1);
 
     // nearest interpolate
+    // multiplies ne0 and ne1 by scale factor
     // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int scale_factor);
 
+    // nearest interpolate
+    // nearest interpolate to specified dimensions
+    // used in tortoise.cpp
+    GGML_API struct ggml_tensor * ggml_upscale_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int ne0,
+            int ne1,
+            int ne2,
+            int ne3);
+
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
             struct ggml_context * ctx,
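
Rough relationship between the two entry points, inferred from the comments above: ggml_upscale multiplies ne0 and ne1 by an integer factor, while ggml_upscale_ext targets explicit output dimensions. A sketch, assuming that reading:

    #include "ggml.h"

    /* Sketch: the two calls below are expected to describe the same output shape
     * when the factor is an integer (assumption from the header comments). */
    static void upscale_examples(struct ggml_context * ctx, struct ggml_tensor * a) {
        // ne0 and ne1 doubled, ne2/ne3 unchanged
        struct ggml_tensor * up_a = ggml_upscale(ctx, a, 2);

        // same target expressed as explicit dimensions
        struct ggml_tensor * up_b = ggml_upscale_ext(ctx, a,
                                                     (int) a->ne[0] * 2,
                                                     (int) a->ne[1] * 2,
                                                     (int) a->ne[2],
                                                     (int) a->ne[3]);
        (void) up_a; (void) up_b;
    }
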
@@ -1736,7 +1754,8 @@ extern "C" {
             struct ggml_tensor * k,
             struct ggml_tensor * v,
             struct ggml_tensor * mask,
-            float scale);
+            float scale,
+            float max_bias);
 
     GGML_API void ggml_flash_attn_ext_set_prec(
             struct ggml_tensor * a,
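
The flash-attention entry point gains the same max_bias knob that ggml_soft_max_ext already has. A hedged call-site sketch; the tensor names are placeholders, and passing 0.0f is assumed to keep the previous no-ALiBi behaviour, mirroring the ggml_soft_max_ext comment.

    #include "ggml.h"

    /* Sketch of the updated call; q, k, v, mask come from an attention block elsewhere. */
    static struct ggml_tensor * flash_attn_sketch(struct ggml_context * ctx,
                                                  struct ggml_tensor * q,
                                                  struct ggml_tensor * k,
                                                  struct ggml_tensor * v,
                                                  struct ggml_tensor * mask,
                                                  float scale) {
        return ggml_flash_attn_ext(ctx, q, k, v, mask, scale, /*max_bias=*/ 0.0f);
    }
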