llama_cpp 0.3.7 → 0.4.0

@@ -213,8 +213,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
  error_desc = "insufficient memory";
  break;
  }
- GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
- __func__, error_desc, size/(1024.0*1024.0));
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
  return NULL;
  }
  return aligned_memory;
@@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
1643
1642
  static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1644
1643
 
1645
1644
  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1645
+ [GGML_TYPE_I8] = {
1646
+ .type_name = "i8",
1647
+ .blck_size = 1,
1648
+ .type_size = sizeof(int8_t),
1649
+ .is_quantized = false,
1650
+ },
1651
+ [GGML_TYPE_I16] = {
1652
+ .type_name = "i16",
1653
+ .blck_size = 1,
1654
+ .type_size = sizeof(int16_t),
1655
+ .is_quantized = false,
1656
+ },
1657
+ [GGML_TYPE_I32] = {
1658
+ .type_name = "i32",
1659
+ .blck_size = 1,
1660
+ .type_size = sizeof(int32_t),
1661
+ .is_quantized = false,
1662
+ },
1646
1663
  [GGML_TYPE_F32] = {
1664
+ .type_name = "f32",
1665
+ .blck_size = 1,
1666
+ .type_size = sizeof(float),
1667
+ .is_quantized = false,
1647
1668
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
1648
1669
  .vec_dot_type = GGML_TYPE_F32,
1649
1670
  },
1650
1671
  [GGML_TYPE_F16] = {
1672
+ .type_name = "f16",
1673
+ .blck_size = 1,
1674
+ .type_size = sizeof(ggml_fp16_t),
1675
+ .is_quantized = false,
1651
1676
  .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
1652
1677
  .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
1653
1678
  .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1655
1680
  .vec_dot_type = GGML_TYPE_F16,
1656
1681
  },
1657
1682
  [GGML_TYPE_Q4_0] = {
1683
+ .type_name = "q4_0",
1684
+ .blck_size = QK4_0,
1685
+ .type_size = sizeof(block_q4_0),
1686
+ .is_quantized = true,
1658
1687
  .to_float = (ggml_to_float_t) dequantize_row_q4_0,
1659
1688
  .from_float = quantize_row_q4_0,
1660
1689
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1662
1691
  .vec_dot_type = GGML_TYPE_Q8_0,
1663
1692
  },
1664
1693
  [GGML_TYPE_Q4_1] = {
1694
+ .type_name = "q4_1",
1695
+ .blck_size = QK4_1,
1696
+ .type_size = sizeof(block_q4_1),
1697
+ .is_quantized = true,
1665
1698
  .to_float = (ggml_to_float_t) dequantize_row_q4_1,
1666
1699
  .from_float = quantize_row_q4_1,
1667
1700
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1669
1702
  .vec_dot_type = GGML_TYPE_Q8_1,
1670
1703
  },
1671
1704
  [GGML_TYPE_Q5_0] = {
1705
+ .type_name = "q5_0",
1706
+ .blck_size = QK5_0,
1707
+ .type_size = sizeof(block_q5_0),
1708
+ .is_quantized = true,
1672
1709
  .to_float = (ggml_to_float_t) dequantize_row_q5_0,
1673
1710
  .from_float = quantize_row_q5_0,
1674
1711
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1676
1713
  .vec_dot_type = GGML_TYPE_Q8_0,
1677
1714
  },
1678
1715
  [GGML_TYPE_Q5_1] = {
1716
+ .type_name = "q5_1",
1717
+ .blck_size = QK5_1,
1718
+ .type_size = sizeof(block_q5_1),
1719
+ .is_quantized = true,
1679
1720
  .to_float = (ggml_to_float_t) dequantize_row_q5_1,
1680
1721
  .from_float = quantize_row_q5_1,
1681
1722
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1683
1724
  .vec_dot_type = GGML_TYPE_Q8_1,
1684
1725
  },
1685
1726
  [GGML_TYPE_Q8_0] = {
1727
+ .type_name = "q8_0",
1728
+ .blck_size = QK8_0,
1729
+ .type_size = sizeof(block_q8_0),
1730
+ .is_quantized = true,
1686
1731
  .to_float = dequantize_row_q8_0,
1687
1732
  .from_float = quantize_row_q8_0,
1688
1733
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1690
1735
  .vec_dot_type = GGML_TYPE_Q8_0,
1691
1736
  },
1692
1737
  [GGML_TYPE_Q8_1] = {
1738
+ .type_name = "q8_1",
1739
+ .blck_size = QK8_1,
1740
+ .type_size = sizeof(block_q8_1),
1741
+ .is_quantized = true,
1693
1742
  .from_float = quantize_row_q8_1,
1694
1743
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
1695
1744
  .vec_dot_type = GGML_TYPE_Q8_1,
1696
1745
  },
1697
1746
  #ifdef GGML_USE_K_QUANTS
1698
1747
  [GGML_TYPE_Q2_K] = {
1748
+ .type_name = "q2_K",
1749
+ .blck_size = QK_K,
1750
+ .type_size = sizeof(block_q2_K),
1751
+ .is_quantized = true,
1699
1752
  .to_float = (ggml_to_float_t) dequantize_row_q2_K,
1700
1753
  .from_float = quantize_row_q2_K,
1701
1754
  .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1703
1756
  .vec_dot_type = GGML_TYPE_Q8_K,
1704
1757
  },
1705
1758
  [GGML_TYPE_Q3_K] = {
1759
+ .type_name = "q3_K",
1760
+ .blck_size = QK_K,
1761
+ .type_size = sizeof(block_q3_K),
1762
+ .is_quantized = true,
1706
1763
  .to_float = (ggml_to_float_t) dequantize_row_q3_K,
1707
1764
  .from_float = quantize_row_q3_K,
1708
1765
  .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1710
1767
  .vec_dot_type = GGML_TYPE_Q8_K,
1711
1768
  },
1712
1769
  [GGML_TYPE_Q4_K] = {
1770
+ .type_name = "q4_K",
1771
+ .blck_size = QK_K,
1772
+ .type_size = sizeof(block_q4_K),
1773
+ .is_quantized = true,
1713
1774
  .to_float = (ggml_to_float_t) dequantize_row_q4_K,
1714
1775
  .from_float = quantize_row_q4_K,
1715
1776
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1717
1778
  .vec_dot_type = GGML_TYPE_Q8_K,
1718
1779
  },
1719
1780
  [GGML_TYPE_Q5_K] = {
1781
+ .type_name = "q5_K",
1782
+ .blck_size = QK_K,
1783
+ .type_size = sizeof(block_q5_K),
1784
+ .is_quantized = true,
1720
1785
  .to_float = (ggml_to_float_t) dequantize_row_q5_K,
1721
1786
  .from_float = quantize_row_q5_K,
1722
1787
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
1724
1789
  .vec_dot_type = GGML_TYPE_Q8_K,
1725
1790
  },
1726
1791
  [GGML_TYPE_Q6_K] = {
1792
+ .type_name = "q6_K",
1793
+ .blck_size = QK_K,
1794
+ .type_size = sizeof(block_q6_K),
1795
+ .is_quantized = true,
1727
1796
  .to_float = (ggml_to_float_t) dequantize_row_q6_K,
1728
1797
  .from_float = quantize_row_q6_K,
1729
1798
  .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q8_K] = {
+ .type_name = "q8_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q8_K),
+ .is_quantized = true,
  .from_float = quantize_row_q8_K,
  }
  #endif
  };

  // For internal test use
- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
- GGML_ASSERT(i < GGML_TYPE_COUNT);
- return type_traits[i];
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+ GGML_ASSERT(type < GGML_TYPE_COUNT);
+ return type_traits[type];
  }

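The type metadata that used to live in the separate GGML_BLCK_SIZE / GGML_TYPE_SIZE / GGML_TYPE_NAME / GGML_IS_QUANTIZED tables (removed further down) is now carried by this single type_traits array. A minimal sketch of reading it back through ggml_internal_get_type_traits, assuming the ggml_type_traits_t struct in this release's ggml.h exposes the fields used in the initializers above:

    #include <stdio.h>
    #include "ggml.h"

    // print the per-type metadata consolidated in type_traits
    // (field names assumed from the designated initializers above)
    static void print_type_info(enum ggml_type t) {
        ggml_type_traits_t tt = ggml_internal_get_type_traits(t);
        printf("%s: blck_size=%d type_size=%zu quantized=%d\n",
               tt.type_name, tt.blck_size, tt.type_size, (int) tt.is_quantized);
    }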
@@ -3481,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

- static const float GELU_COEF_A = 0.044715f;
- static const float GELU_QUICK_COEF = -1.702f;
- static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ static const float GELU_COEF_A = 0.044715f;
+ static const float GELU_QUICK_COEF = -1.702f;
+ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

  inline static float ggml_gelu_f32(float x) {
  return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -3652,95 +3725,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
3652
3725
  // data types
3653
3726
  //
3654
3727
 
3655
- static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
3656
- [GGML_TYPE_F32] = 1,
3657
- [GGML_TYPE_F16] = 1,
3658
- [GGML_TYPE_Q4_0] = QK4_0,
3659
- [GGML_TYPE_Q4_1] = QK4_1,
3660
- [GGML_TYPE_Q5_0] = QK5_0,
3661
- [GGML_TYPE_Q5_1] = QK5_1,
3662
- [GGML_TYPE_Q8_0] = QK8_0,
3663
- [GGML_TYPE_Q8_1] = QK8_1,
3664
- #ifdef GGML_USE_K_QUANTS
3665
- [GGML_TYPE_Q2_K] = QK_K,
3666
- [GGML_TYPE_Q3_K] = QK_K,
3667
- [GGML_TYPE_Q4_K] = QK_K,
3668
- [GGML_TYPE_Q5_K] = QK_K,
3669
- [GGML_TYPE_Q6_K] = QK_K,
3670
- [GGML_TYPE_Q8_K] = QK_K,
3671
- #endif
3672
- [GGML_TYPE_I8] = 1,
3673
- [GGML_TYPE_I16] = 1,
3674
- [GGML_TYPE_I32] = 1,
3675
- };
3676
- static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
3677
-
3678
- static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
3679
- [GGML_TYPE_F32] = sizeof(float),
3680
- [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
3681
- [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
3682
- [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
3683
- [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
3684
- [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
3685
- [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
3686
- [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
3687
- #ifdef GGML_USE_K_QUANTS
3688
- [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
3689
- [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
3690
- [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
3691
- [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
3692
- [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
3693
- [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
3694
- #endif
3695
- [GGML_TYPE_I8] = sizeof(int8_t),
3696
- [GGML_TYPE_I16] = sizeof(int16_t),
3697
- [GGML_TYPE_I32] = sizeof(int32_t),
3698
- };
3699
- static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
3700
-
3701
-
3702
- static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
3703
- [GGML_TYPE_F32] = "f32",
3704
- [GGML_TYPE_F16] = "f16",
3705
- [GGML_TYPE_Q4_0] = "q4_0",
3706
- [GGML_TYPE_Q4_1] = "q4_1",
3707
- [GGML_TYPE_Q5_0] = "q5_0",
3708
- [GGML_TYPE_Q5_1] = "q5_1",
3709
- [GGML_TYPE_Q8_0] = "q8_0",
3710
- [GGML_TYPE_Q8_1] = "q8_1",
3711
- [GGML_TYPE_Q2_K] = "q2_K",
3712
- [GGML_TYPE_Q3_K] = "q3_K",
3713
- [GGML_TYPE_Q4_K] = "q4_K",
3714
- [GGML_TYPE_Q5_K] = "q5_K",
3715
- [GGML_TYPE_Q6_K] = "q6_K",
3716
- [GGML_TYPE_Q8_K] = "q8_K",
3717
- [GGML_TYPE_I8] = "i8",
3718
- [GGML_TYPE_I16] = "i16",
3719
- [GGML_TYPE_I32] = "i32",
3720
- };
3721
- static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
3722
-
3723
- static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3724
- [GGML_TYPE_F32] = false,
3725
- [GGML_TYPE_F16] = false,
3726
- [GGML_TYPE_Q4_0] = true,
3727
- [GGML_TYPE_Q4_1] = true,
3728
- [GGML_TYPE_Q5_0] = true,
3729
- [GGML_TYPE_Q5_1] = true,
3730
- [GGML_TYPE_Q8_0] = true,
3731
- [GGML_TYPE_Q8_1] = true,
3732
- [GGML_TYPE_Q2_K] = true,
3733
- [GGML_TYPE_Q3_K] = true,
3734
- [GGML_TYPE_Q4_K] = true,
3735
- [GGML_TYPE_Q5_K] = true,
3736
- [GGML_TYPE_Q6_K] = true,
3737
- [GGML_TYPE_Q8_K] = true,
3738
- [GGML_TYPE_I8] = false,
3739
- [GGML_TYPE_I16] = false,
3740
- [GGML_TYPE_I32] = false,
3741
- };
3742
- static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
3743
-
3744
3728
  static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3745
3729
  "NONE",
3746
3730
 
@@ -3760,10 +3744,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3760
3744
  "ARGMAX",
3761
3745
  "REPEAT",
3762
3746
  "REPEAT_BACK",
3747
+ "CONCAT",
3763
3748
  "SILU_BACK",
3764
3749
  "NORM",
3765
3750
  "RMS_NORM",
3766
3751
  "RMS_NORM_BACK",
3752
+ "GROUP_NORM",
3767
3753
 
3768
3754
  "MUL_MAT",
3769
3755
  "OUT_PROD",
@@ -3789,20 +3775,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3789
3775
  "CLAMP",
3790
3776
  "CONV_1D",
3791
3777
  "CONV_2D",
3778
+ "CONV_TRANSPOSE_2D",
3792
3779
  "POOL_1D",
3793
3780
  "POOL_2D",
3781
+ "UPSCALE",
3794
3782
 
3795
3783
  "FLASH_ATTN",
3796
3784
  "FLASH_FF",
3797
3785
  "FLASH_ATTN_BACK",
3798
3786
  "WIN_PART",
3799
3787
  "WIN_UNPART",
3788
+ "GET_REL_POS",
3789
+ "ADD_REL_POS",
3800
3790
 
3801
3791
  "UNARY",
3802
3792
 
3803
3793
  "MAP_UNARY",
3804
3794
  "MAP_BINARY",
3805
3795
 
3796
+ "MAP_CUSTOM1_F32",
3797
+ "MAP_CUSTOM2_F32",
3798
+ "MAP_CUSTOM3_F32",
3799
+
3806
3800
  "MAP_CUSTOM1",
3807
3801
  "MAP_CUSTOM2",
3808
3802
  "MAP_CUSTOM3",
@@ -3811,7 +3805,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };

- static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -3832,10 +3826,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3832
3826
  "argmax(x)",
3833
3827
  "repeat(x)",
3834
3828
  "repeat_back(x)",
3829
+ "concat(x, y)",
3835
3830
  "silu_back(x)",
3836
3831
  "norm(x)",
3837
3832
  "rms_norm(x)",
3838
3833
  "rms_norm_back(x)",
3834
+ "group_norm(x)",
3839
3835
 
3840
3836
  "X*Y",
3841
3837
  "X*Y",
@@ -3861,20 +3857,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3861
3857
  "clamp(x)",
3862
3858
  "conv_1d(x)",
3863
3859
  "conv_2d(x)",
3860
+ "conv_transpose_2d(x)",
3864
3861
  "pool_1d(x)",
3865
3862
  "pool_2d(x)",
3863
+ "upscale(x)",
3866
3864
 
3867
3865
  "flash_attn(x)",
3868
3866
  "flash_ff(x)",
3869
3867
  "flash_attn_back(x)",
3870
3868
  "win_part(x)",
3871
3869
  "win_unpart(x)",
3870
+ "get_rel_pos(x)",
3871
+ "add_rel_pos(x)",
3872
3872
 
3873
3873
  "unary(x)",
3874
3874
 
3875
3875
  "f(x)",
3876
3876
  "f(x,y)",
3877
3877
 
3878
+ "custom_f32(x)",
3879
+ "custom_f32(x,y)",
3880
+ "custom_f32(x,y,z)",
3881
+
3878
3882
  "custom(x)",
3879
3883
  "custom(x,y)",
3880
3884
  "custom(x,y,z)",
@@ -3883,7 +3887,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };

- static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -3913,8 +3917,10 @@ static void ggml_setup_op_has_task_pass(void) {
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
  p[GGML_OP_CONV_1D ] = true;
  p[GGML_OP_CONV_2D ] = true;
+ p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+ p[GGML_OP_ADD_REL_POS ] = true;
  }

  { // FINALIZE
@@ -4110,29 +4116,37 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
  //
  // is enough, but just in case, adding the second part

- return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+ }
+
+ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+ return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
  }

  size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

- return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+ return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
  }

  int ggml_blck_size(enum ggml_type type) {
- return GGML_BLCK_SIZE[type];
+ return type_traits[type].blck_size;
  }

  size_t ggml_type_size(enum ggml_type type) {
- return GGML_TYPE_SIZE[type];
+ return type_traits[type].type_size;
  }

  float ggml_type_sizef(enum ggml_type type) {
- return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+ return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
  }

  const char * ggml_type_name(enum ggml_type type) {
- return GGML_TYPE_NAME[type];
+ return type_traits[type].type_name;
+ }
+
+ bool ggml_is_quantized(enum ggml_type type) {
+ return type_traits[type].is_quantized;
  }

  const char * ggml_op_name(enum ggml_op op) {
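The accessors above now read from the same traits table, and ggml_nbytes_pad is new in this release. A short sketch of sizing a (possibly quantized) row with them; the helper name row_bytes is illustrative, not part of ggml:

    // bytes needed for one row of width ne0 in the given type (block types scale by blck_size)
    static size_t row_bytes(enum ggml_type type, int64_t ne0) {
        return (size_t) ne0 * ggml_type_size(type) / ggml_blck_size(type);
    }
    // e.g. row_bytes(GGML_TYPE_Q4_0, 4096); for whole tensors, ggml_nbytes_pad(t)
    // is ggml_nbytes(t) rounded up to GGML_MEM_ALIGN.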
@@ -4144,7 +4158,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
4144
4158
  }
4145
4159
 
4146
4160
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
4147
- return GGML_TYPE_SIZE[tensor->type];
4161
+ return ggml_type_size(tensor->type);
4148
4162
  }
4149
4163
 
4150
4164
  static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4196,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
4182
4196
  (t0->ne[3] == t1->ne[3]);
4183
4197
  }
4184
4198
 
4185
- bool ggml_is_quantized(enum ggml_type type) {
4186
- return GGML_IS_QUANTIZED[type];
4187
- }
4188
-
4189
4199
  enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4190
4200
  enum ggml_type wtype = GGML_TYPE_COUNT;
4191
4201
 
@@ -4223,8 +4233,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
4223
4233
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4224
4234
 
4225
4235
  return
4226
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4227
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
4236
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
4237
+ tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
4228
4238
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4229
4239
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4230
4240
  }
@@ -4233,7 +4243,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
4233
4243
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4234
4244
 
4235
4245
  return
4236
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4246
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
4237
4247
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4238
4248
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4239
4249
  }
@@ -4248,7 +4258,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
4248
4258
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
4249
4259
 
4250
4260
  return
4251
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
4261
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
4252
4262
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
4253
4263
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
4254
4264
  }
@@ -4567,7 +4577,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4567
4577
  size_t data_size = 0;
4568
4578
 
4569
4579
  if (data == NULL && !ctx->no_alloc) {
4570
- data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
4580
+ data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
4571
4581
  for (int i = 1; i < n_dims; i++) {
4572
4582
  data_size *= ne[i];
4573
4583
  }
@@ -4622,8 +4632,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
4622
4632
  result->ne[i] = ne[i];
4623
4633
  }
4624
4634
 
4625
- result->nb[0] = GGML_TYPE_SIZE[type];
4626
- result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
4635
+ result->nb[0] = ggml_type_size(type);
4636
+ result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
4627
4637
  for (int i = 2; i < GGML_MAX_DIMS; i++) {
4628
4638
  result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
4629
4639
  }
@@ -5545,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
5545
5555
  is_node = true;
5546
5556
  }
5547
5557
 
5548
- if (ggml_are_same_shape(a, b) && !is_node) {
5549
- return a;
5550
- }
5551
-
5552
5558
  struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
5553
5559
 
5554
5560
  result->op = GGML_OP_REPEAT;
@@ -5587,6 +5593,30 @@ struct ggml_tensor * ggml_repeat_back(
  return result;
  }

+ // ggml_concat
+
+ struct ggml_tensor* ggml_concat(
+ struct ggml_context* ctx,
+ struct ggml_tensor* a,
+ struct ggml_tensor* b) {
+ GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+
+ result->op = GGML_OP_CONCAT;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+
+ return result;
+ }
+
  // ggml_abs

  struct ggml_tensor * ggml_abs(
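A minimal usage sketch for the new ggml_concat (ctx, a and b are placeholders): it stacks b after a along dimension 2, so ne[0], ne[1] and ne[3] must match, as asserted above.

    struct ggml_tensor * ab = ggml_concat(ctx, a, b); // ab->ne[2] == a->ne[2] + b->ne[2]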
@@ -5755,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
5755
5785
  static struct ggml_tensor * ggml_norm_impl(
5756
5786
  struct ggml_context * ctx,
5757
5787
  struct ggml_tensor * a,
5788
+ float eps,
5758
5789
  bool inplace) {
5759
5790
  bool is_node = false;
5760
5791
 
@@ -5765,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(
5765
5796
 
5766
5797
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5767
5798
 
5768
- // TODO: maybe store epsilon here?
5799
+ ggml_set_op_params(result, &eps, sizeof(eps));
5769
5800
 
5770
5801
  result->op = GGML_OP_NORM;
5771
5802
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5776,16 +5807,20 @@ static struct ggml_tensor * ggml_norm_impl(

  struct ggml_tensor * ggml_norm(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_norm_impl(ctx, a, false);
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_norm_impl(ctx, a, eps, false);
  }

  struct ggml_tensor * ggml_norm_inplace(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_norm_impl(ctx, a, true);
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_norm_impl(ctx, a, eps, true);
  }

+ // ggml_rms_norm
+
  static struct ggml_tensor * ggml_rms_norm_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
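ggml_norm and ggml_norm_inplace now take an explicit epsilon, stored in op_params, instead of the 1e-5f previously hard-coded in the forward pass, so existing callers must be updated. A hedged sketch (ctx and inp are placeholders):

    struct ggml_tensor * cur = ggml_norm(ctx, inp, 1e-5f); // 1e-5f reproduces the old default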
@@ -5822,6 +5857,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
5822
5857
  return ggml_rms_norm_impl(ctx, a, eps, true);
5823
5858
  }
5824
5859
 
5860
+ // ggml_rms_norm_back
5861
+
5825
5862
  struct ggml_tensor * ggml_rms_norm_back(
5826
5863
  struct ggml_context * ctx,
5827
5864
  struct ggml_tensor * a,
@@ -5843,6 +5880,44 @@ struct ggml_tensor * ggml_rms_norm_back(
5843
5880
  return result;
5844
5881
  }
5845
5882
 
5883
+ // ggml_group_norm
5884
+
5885
+ static struct ggml_tensor * ggml_group_norm_impl(
5886
+ struct ggml_context * ctx,
5887
+ struct ggml_tensor * a,
5888
+ int n_groups,
5889
+ bool inplace) {
5890
+
5891
+ bool is_node = false;
5892
+ if (!inplace && (a->grad)) {
5893
+ GGML_ASSERT(false); // TODO: implement backward
5894
+ is_node = true;
5895
+ }
5896
+
5897
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5898
+
5899
+ result->op = GGML_OP_GROUP_NORM;
5900
+ result->op_params[0] = n_groups;
5901
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5902
+ result->src[0] = a;
5903
+ result->src[1] = NULL; // TODO: maybe store epsilon here?
5904
+
5905
+ return result;
5906
+ }
5907
+
5908
+ struct ggml_tensor * ggml_group_norm(
5909
+ struct ggml_context * ctx,
5910
+ struct ggml_tensor * a,
5911
+ int n_groups) {
5912
+ return ggml_group_norm_impl(ctx, a, n_groups, false);
5913
+ }
5914
+
5915
+ struct ggml_tensor * ggml_group_norm_inplace(
5916
+ struct ggml_context * ctx,
5917
+ struct ggml_tensor * a,
5918
+ int n_groups) {
5919
+ return ggml_group_norm_impl(ctx, a, n_groups, true);
5920
+ }
5846
5921
 
5847
5922
  // ggml_mul_mat
5848
5923
 
@@ -6711,6 +6786,8 @@ static struct ggml_tensor * ggml_rope_impl(
6711
6786
  int n_ctx,
6712
6787
  float freq_base,
6713
6788
  float freq_scale,
6789
+ float xpos_base,
6790
+ bool xpos_down,
6714
6791
  bool inplace) {
6715
6792
  GGML_ASSERT(n_past >= 0);
6716
6793
  bool is_node = false;
@@ -6721,9 +6798,11 @@ static struct ggml_tensor * ggml_rope_impl(
6721
6798
 
6722
6799
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6723
6800
 
6724
- int32_t params[6] = { n_past, n_dims, mode, n_ctx };
6801
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
6725
6802
  memcpy(params + 4, &freq_base, sizeof(float));
6726
6803
  memcpy(params + 5, &freq_scale, sizeof(float));
6804
+ memcpy(params + 6, &xpos_base, sizeof(float));
6805
+ memcpy(params + 7, &xpos_down, sizeof(bool));
6727
6806
  ggml_set_op_params(result, params, sizeof(params));
6728
6807
 
6729
6808
  result->op = GGML_OP_ROPE;
@@ -6740,7 +6819,7 @@ struct ggml_tensor * ggml_rope(
6740
6819
  int n_dims,
6741
6820
  int mode,
6742
6821
  int n_ctx) {
6743
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
6822
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
6744
6823
  }
6745
6824
 
6746
6825
  struct ggml_tensor * ggml_rope_inplace(
@@ -6750,7 +6829,7 @@ struct ggml_tensor * ggml_rope_inplace(
6750
6829
  int n_dims,
6751
6830
  int mode,
6752
6831
  int n_ctx) {
6753
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
6832
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
6754
6833
  }
6755
6834
 
6756
6835
  struct ggml_tensor * ggml_rope_custom(
@@ -6762,7 +6841,7 @@ struct ggml_tensor * ggml_rope_custom(
6762
6841
  int n_ctx,
6763
6842
  float freq_base,
6764
6843
  float freq_scale) {
6765
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
6844
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
6766
6845
  }
6767
6846
 
6768
6847
  struct ggml_tensor * ggml_rope_custom_inplace(
@@ -6774,7 +6853,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
  int n_ctx,
  float freq_base,
  float freq_scale) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
+ }
+
+ struct ggml_tensor * ggml_rope_xpos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ float base,
+ bool down) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
  }

  // ggml_rope_back
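RoPE now carries eight op_params (n_past, n_dims, mode, n_ctx, freq_base, freq_scale, xpos_base, xpos_down); the pre-existing entry points pass 0.0f/false so xPos stays disabled, while the new ggml_rope_xpos_inplace enables it, and ggml_rope_back below gains the same parameters. A sketch with illustrative values (qcur, n_past and n_rot are placeholders; 512.0f is an arbitrary xpos_base):

    struct ggml_tensor * q  = ggml_rope_custom_inplace(ctx, qcur, n_past, n_rot, 0, 0, 10000.0f, 1.0f);
    struct ggml_tensor * qx = ggml_rope_xpos_inplace(ctx, qcur, n_past, n_rot, 512.0f, false);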
@@ -6785,7 +6874,11 @@ struct ggml_tensor * ggml_rope_back(
6785
6874
  int n_past,
6786
6875
  int n_dims,
6787
6876
  int mode,
6788
- int n_ctx) {
6877
+ int n_ctx,
6878
+ float freq_base,
6879
+ float freq_scale,
6880
+ float xpos_base,
6881
+ bool xpos_down) {
6789
6882
  GGML_ASSERT(n_past >= 0);
6790
6883
  GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
6791
6884
 
@@ -6797,7 +6890,11 @@ struct ggml_tensor * ggml_rope_back(
6797
6890
 
6798
6891
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
6799
6892
 
6800
- int32_t params[] = { n_past, n_dims, mode, n_ctx };
6893
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
6894
+ memcpy(params + 4, &freq_base, sizeof(float));
6895
+ memcpy(params + 5, &freq_scale, sizeof(float));
6896
+ memcpy(params + 6, &xpos_base, sizeof(float));
6897
+ memcpy(params + 7, &xpos_down, sizeof(bool));
6801
6898
  ggml_set_op_params(result, params, sizeof(params));
6802
6899
 
6803
6900
  result->op = GGML_OP_ROPE_BACK;
@@ -6904,6 +7001,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
6904
7001
  return result;
6905
7002
  }
6906
7003
 
7004
+ // ggml_conv_1d_ph
7005
+
7006
+ struct ggml_tensor* ggml_conv_1d_ph(
7007
+ struct ggml_context * ctx,
7008
+ struct ggml_tensor * a,
7009
+ struct ggml_tensor * b,
7010
+ int s,
7011
+ int d) {
7012
+ return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
7013
+ }
7014
+
6907
7015
  // ggml_conv_2d
6908
7016
 
6909
7017
  struct ggml_tensor * ggml_conv_2d(
@@ -6944,17 +7052,59 @@ struct ggml_tensor * ggml_conv_2d(
6944
7052
 
6945
7053
  }
6946
7054
 
6947
- // ggml_conv_1d_ph
7055
+ // ggml_conv_2d_sk_p0
6948
7056
 
6949
- struct ggml_tensor * ggml_conv_1d_ph(
7057
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
6950
7058
  struct ggml_context * ctx,
6951
7059
  struct ggml_tensor * a,
6952
- struct ggml_tensor * b,
6953
- int s,
6954
- int d) {
6955
- return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
7060
+ struct ggml_tensor * b) {
7061
+ return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
7062
+ }
7063
+
7064
+ // ggml_conv_2d_s1_ph
7065
+
7066
+ struct ggml_tensor * ggml_conv_2d_s1_ph(
7067
+ struct ggml_context * ctx,
7068
+ struct ggml_tensor * a,
7069
+ struct ggml_tensor * b) {
7070
+ return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
6956
7071
  }
6957
7072
 
7073
+ // ggml_conv_transpose_2d_p0
7074
+
7075
+ static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
7076
+ return (ins - 1) * s - 2 * p + ks;
7077
+ }
7078
+
7079
+ struct ggml_tensor * ggml_conv_transpose_2d_p0(
7080
+ struct ggml_context * ctx,
7081
+ struct ggml_tensor * a,
7082
+ struct ggml_tensor * b,
7083
+ int stride) {
7084
+ GGML_ASSERT(a->ne[3] == b->ne[2]);
7085
+
7086
+ bool is_node = false;
7087
+
7088
+ if (a->grad || b->grad) {
7089
+ GGML_ASSERT(false); // TODO: implement backward
7090
+ is_node = true;
7091
+ }
7092
+
7093
+ const int64_t ne[4] = {
7094
+ ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
7095
+ ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
7096
+ a->ne[2], b->ne[3],
7097
+ };
7098
+
7099
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7100
+ result->op = GGML_OP_CONV_TRANSPOSE_2D;
7101
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7102
+ result->src[0] = a;
7103
+ result->src[1] = b;
7104
+ result->src[2] = ggml_new_i32(ctx, stride);
7105
+
7106
+ return result;
7107
+ }
6958
7108
 
6959
7109
  // ggml_pool_*
6960
7110
 
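Besides relocating ggml_conv_1d_ph, this release adds ggml_conv_2d_sk_p0 (stride equal to the kernel size, no padding), ggml_conv_2d_s1_ph (stride 1, half padding) and ggml_conv_transpose_2d_p0, whose output size follows (in - 1)*stride + kernel as computed by the helper above. A hedged sketch with placeholder tensors:

    struct ggml_tensor * patches = ggml_conv_2d_sk_p0(ctx, kernel, img);              // non-overlapping patches
    struct ggml_tensor * up      = ggml_conv_transpose_2d_p0(ctx, kernel_t, feat, 2); // stride-2 transposed conv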
@@ -7032,6 +7182,40 @@ struct ggml_tensor * ggml_pool_2d(
7032
7182
  return result;
7033
7183
  }
7034
7184
 
7185
+ // ggml_upscale
7186
+
7187
+ static struct ggml_tensor * ggml_upscale_impl(
7188
+ struct ggml_context * ctx,
7189
+ struct ggml_tensor * a,
7190
+ int scale_factor) {
7191
+ bool is_node = false;
7192
+
7193
+ if (a->grad) {
7194
+ GGML_ASSERT(false); // TODO: implement backward
7195
+ is_node = true;
7196
+ }
7197
+
7198
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
7199
+ a->ne[0] * scale_factor,
7200
+ a->ne[1] * scale_factor,
7201
+ a->ne[2], a->ne[3]);
7202
+
7203
+ result->op = GGML_OP_UPSCALE;
7204
+ result->op_params[0] = scale_factor;
7205
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7206
+ result->src[0] = a;
7207
+ result->src[1] = NULL;
7208
+
7209
+ return result;
7210
+ }
7211
+
7212
+ struct ggml_tensor * ggml_upscale(
7213
+ struct ggml_context * ctx,
7214
+ struct ggml_tensor * a,
7215
+ int scale_factor) {
7216
+ return ggml_upscale_impl(ctx, a, scale_factor);
7217
+ }
7218
+
7035
7219
  // ggml_flash_attn
7036
7220
 
7037
7221
  struct ggml_tensor * ggml_flash_attn(
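The ggml_upscale helper added above enlarges the first two dimensions by an integer factor kept in op_params[0] (the forward kernel is truncated at the end of this excerpt). A minimal sketch with a placeholder tensor x:

    struct ggml_tensor * x2 = ggml_upscale(ctx, x, 2); // ne[0] and ne[1] doubled, ne[2]/ne[3] unchanged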
@@ -7230,6 +7414,87 @@ struct ggml_tensor * ggml_win_unpart(
7230
7414
  return result;
7231
7415
  }
7232
7416
 
7417
+ // ggml_get_rel_pos
7418
+
7419
+ struct ggml_tensor * ggml_get_rel_pos(
7420
+ struct ggml_context * ctx,
7421
+ struct ggml_tensor * a,
7422
+ int qh,
7423
+ int kh) {
7424
+ GGML_ASSERT(qh == kh);
7425
+ GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
7426
+
7427
+ bool is_node = false;
7428
+
7429
+ if (a->grad) {
7430
+ GGML_ASSERT(false); // TODO: implement backward
7431
+ is_node = true;
7432
+ }
7433
+
7434
+ const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
7435
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
7436
+
7437
+ result->op = GGML_OP_GET_REL_POS;
7438
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7439
+ result->src[0] = a;
7440
+ result->src[1] = NULL;
7441
+
7442
+ return result;
7443
+ }
7444
+
7445
+ // ggml_add_rel_pos
7446
+
7447
+ static struct ggml_tensor * ggml_add_rel_pos_impl(
7448
+ struct ggml_context * ctx,
7449
+ struct ggml_tensor * a,
7450
+ struct ggml_tensor * pw,
7451
+ struct ggml_tensor * ph,
7452
+ bool inplace) {
7453
+ GGML_ASSERT(ggml_are_same_shape(pw, ph));
7454
+ GGML_ASSERT(ggml_is_contiguous(a));
7455
+ GGML_ASSERT(ggml_is_contiguous(pw));
7456
+ GGML_ASSERT(ggml_is_contiguous(ph));
7457
+ GGML_ASSERT(ph->type == GGML_TYPE_F32);
7458
+ GGML_ASSERT(pw->type == GGML_TYPE_F32);
7459
+ GGML_ASSERT(pw->ne[3] == a->ne[2]);
7460
+ GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
7461
+ GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
7462
+
7463
+ bool is_node = false;
7464
+
7465
+ if (!inplace && (a->grad || pw->grad || ph->grad)) {
7466
+ is_node = true;
7467
+ }
7468
+
7469
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7470
+ ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
7471
+
7472
+ result->op = GGML_OP_ADD_REL_POS;
7473
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7474
+ result->src[0] = a;
7475
+ result->src[1] = pw;
7476
+ result->src[2] = ph;
7477
+
7478
+ return result;
7479
+ }
7480
+
7481
+
7482
+ struct ggml_tensor * ggml_add_rel_pos(
7483
+ struct ggml_context * ctx,
7484
+ struct ggml_tensor * a,
7485
+ struct ggml_tensor * pw,
7486
+ struct ggml_tensor * ph) {
7487
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
7488
+ }
7489
+
7490
+ struct ggml_tensor * ggml_add_rel_pos_inplace(
7491
+ struct ggml_context * ctx,
7492
+ struct ggml_tensor * a,
7493
+ struct ggml_tensor * pw,
7494
+ struct ggml_tensor * ph) {
7495
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
7496
+ }
7497
+
7233
7498
  // ggml_unary
7234
7499
 
7235
7500
  static struct ggml_tensor * ggml_unary_impl(
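ggml_get_rel_pos and ggml_add_rel_pos support SAM-style decomposed relative-position attention: the first gathers relative-position embeddings (qh == kh is required and the result is F16), the second adds the projected width/height terms pw and ph to an attention matrix, optionally in place. A hedged sketch with placeholder tensors shaped per the asserts above:

    struct ggml_tensor * rw    = ggml_get_rel_pos(ctx, rel_w, W, W);            // [C, W, W] in F16
    struct ggml_tensor * attn2 = ggml_add_rel_pos_inplace(ctx, attn, pw, ph);   // attn updated in place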
@@ -7745,7 +8010,7 @@ static void ggml_compute_forward_dup_same_cont(
7745
8010
  memcpy(
7746
8011
  ((char *) dst->data + ie0*nb0),
7747
8012
  ((char *) src0->data + ie0*nb00),
7748
- (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
8013
+ (ie1 - ie0) * ggml_type_size(src0->type));
7749
8014
  }
7750
8015
 
7751
8016
  }
@@ -7779,7 +8044,7 @@ static void ggml_compute_forward_dup_f16(
7779
8044
 
7780
8045
  if (src0->type == dst->type &&
7781
8046
  ne00 == ne0 &&
7782
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
8047
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
7783
8048
  // copy by rows
7784
8049
  const size_t rs = ne00*nb00;
7785
8050
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +8102,7 @@ static void ggml_compute_forward_dup_f16(
7837
8102
  float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
7838
8103
 
7839
8104
  size_t id = 0;
7840
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
8105
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
7841
8106
  char * dst_ptr = (char *) dst->data;
7842
8107
 
7843
8108
  for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8315,7 @@ static void ggml_compute_forward_dup_f32(
8050
8315
 
8051
8316
  if (src0->type == dst->type &&
8052
8317
  ne00 == ne0 &&
8053
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
8318
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
8054
8319
  // copy by rows
8055
8320
  const size_t rs = ne00*nb00;
8056
8321
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8354,7 @@ static void ggml_compute_forward_dup_f32(
8089
8354
  ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
8090
8355
 
8091
8356
  size_t id = 0;
8092
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
8357
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
8093
8358
  char * dst_ptr = (char *) dst->data;
8094
8359
 
8095
8360
  for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8766,7 @@ static void ggml_compute_forward_add_q_f32(
8501
8766
  ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
8502
8767
 
8503
8768
  // we don't support permuted src0 or src1
8504
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
8769
+ GGML_ASSERT(nb00 == ggml_type_size(type));
8505
8770
  GGML_ASSERT(nb10 == sizeof(float));
8506
8771
 
8507
8772
  // dst cannot be transposed or permuted
@@ -8775,7 +9040,7 @@ static void ggml_compute_forward_add1_q_f32(
8775
9040
  ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
8776
9041
 
8777
9042
  // we don't support permuted src0
8778
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
9043
+ GGML_ASSERT(nb00 == ggml_type_size(type));
8779
9044
 
8780
9045
  // dst cannot be transposed or permuted
8781
9046
  GGML_ASSERT(nb0 <= nb1);
@@ -9137,6 +9402,8 @@ static void ggml_compute_forward_mul(
9137
9402
  const struct ggml_tensor * src0,
9138
9403
  const struct ggml_tensor * src1,
9139
9404
  struct ggml_tensor * dst) {
9405
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
9406
+
9140
9407
  switch (src0->type) {
9141
9408
  case GGML_TYPE_F32:
9142
9409
  {
@@ -9731,6 +9998,72 @@ static void ggml_compute_forward_repeat_back(
9731
9998
  }
9732
9999
  }
9733
10000
 
10001
+ // ggml_compute_forward_concat
10002
+
10003
+ static void ggml_compute_forward_concat_f32(
10004
+ const struct ggml_compute_params * params,
10005
+ const struct ggml_tensor * src0,
10006
+ const struct ggml_tensor * src1,
10007
+ struct ggml_tensor * dst) {
10008
+
10009
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10010
+ return;
10011
+ }
10012
+
10013
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
10014
+
10015
+ const int ith = params->ith;
10016
+
10017
+ GGML_TENSOR_BINARY_OP_LOCALS;
10018
+
10019
+ // TODO: support for transposed / permuted tensors
10020
+ GGML_ASSERT(nb0 == sizeof(float));
10021
+ GGML_ASSERT(nb00 == sizeof(float));
10022
+ GGML_ASSERT(nb10 == sizeof(float));
10023
+
10024
+ for (int i3 = 0; i3 < ne3; i3++) {
10025
+ for (int i2 = ith; i2 < ne2; i2++) {
10026
+ if (i2 < ne02) { // src0
10027
+ for (int i1 = 0; i1 < ne1; i1++) {
10028
+ for (int i0 = 0; i0 < ne0; i0++) {
10029
+ const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
10030
+
10031
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
10032
+ *y = *x;
10033
+ }
10034
+ }
10035
+ } // src1
10036
+ else {
10037
+ for (int i1 = 0; i1 < ne1; i1++) {
10038
+ for (int i0 = 0; i0 < ne0; i0++) {
10039
+ const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
10040
+
10041
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
10042
+ *y = *x;
10043
+ }
10044
+ }
10045
+ }
10046
+ }
10047
+ }
10048
+ }
10049
+
10050
+ static void ggml_compute_forward_concat(
10051
+ const struct ggml_compute_params* params,
10052
+ const struct ggml_tensor* src0,
10053
+ const struct ggml_tensor* src1,
10054
+ struct ggml_tensor* dst) {
10055
+ switch (src0->type) {
10056
+ case GGML_TYPE_F32:
10057
+ {
10058
+ ggml_compute_forward_concat_f32(params, src0, src1, dst);
10059
+ } break;
10060
+ default:
10061
+ {
10062
+ GGML_ASSERT(false);
10063
+ } break;
10064
+ }
10065
+ }
10066
+
9734
10067
  // ggml_compute_forward_abs
9735
10068
 
9736
10069
  static void ggml_compute_forward_abs_f32(
@@ -10285,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(
10285
10618
 
10286
10619
  GGML_TENSOR_UNARY_OP_LOCALS;
10287
10620
 
10288
- const float eps = 1e-5f; // TODO: make this a parameter
10621
+ float eps;
10622
+ memcpy(&eps, dst->op_params, sizeof(float));
10289
10623
 
10290
10624
  // TODO: optimize
10291
10625
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10334,6 +10668,8 @@ static void ggml_compute_forward_norm(
10334
10668
  }
10335
10669
  }
10336
10670
 
10671
+ // ggml_compute_forward_group_rms_norm
10672
+
10337
10673
  static void ggml_compute_forward_rms_norm_f32(
10338
10674
  const struct ggml_compute_params * params,
10339
10675
  const struct ggml_tensor * src0,
@@ -10398,7 +10734,6 @@ static void ggml_compute_forward_rms_norm(
10398
10734
  }
10399
10735
  }
10400
10736
 
10401
-
10402
10737
  static void ggml_compute_forward_rms_norm_back_f32(
10403
10738
  const struct ggml_compute_params * params,
10404
10739
  const struct ggml_tensor * src0,
@@ -10572,16 +10907,106 @@ static void ggml_compute_forward_rms_norm_back(
10572
10907
  }
10573
10908
  }
10574
10909
 
10575
- // ggml_compute_forward_mul_mat
10910
+ // ggml_compute_forward_group_norm
10576
10911
 
10577
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10578
- // helper function to determine if it is better to use BLAS or not
10579
- // for large matrices, BLAS is faster
10580
- static bool ggml_compute_forward_mul_mat_use_blas(
10581
- const struct ggml_tensor * src0,
10582
- const struct ggml_tensor * src1,
10583
- struct ggml_tensor * dst) {
10584
- //const int64_t ne00 = src0->ne[0];
10912
+ static void ggml_compute_forward_group_norm_f32(
10913
+ const struct ggml_compute_params * params,
10914
+ const struct ggml_tensor * src0,
10915
+ struct ggml_tensor * dst) {
10916
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
10917
+
10918
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10919
+ return;
10920
+ }
10921
+
10922
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
10923
+
10924
+ const int ith = params->ith;
10925
+ const int nth = params->nth;
10926
+
10927
+ GGML_TENSOR_UNARY_OP_LOCALS;
10928
+
10929
+ const float eps = 1e-6f; // TODO: make this a parameter
10930
+
10931
+ // TODO: optimize
10932
+
10933
+ int n_channels = src0->ne[2];
10934
+ int n_groups = dst->op_params[0];
10935
+ int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
10936
+ for (int i = ith; i < n_groups; i+=nth) {
10937
+ int start = i * n_channels_per_group;
10938
+ int end = start + n_channels_per_group;
10939
+ if (end > n_channels) {
10940
+ end = n_channels;
10941
+ }
10942
+ int step = end - start;
10943
+
10944
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
10945
+ ggml_float sum = 0.0;
10946
+ for (int64_t i02 = start; i02 < end; i02++) {
10947
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
10948
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10949
+
10950
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
10951
+ sum += (ggml_float)x[i00];
10952
+ }
10953
+ }
10954
+ }
10955
+ float mean = sum / (ne00 * ne01 * step);
10956
+ ggml_float sum2 = 0.0;
10957
+
10958
+ for (int64_t i02 = start; i02 < end; i02++) {
10959
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
10960
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10961
+
10962
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
10963
+
10964
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
10965
+ float v = x[i00] - mean;
10966
+ y[i00] = v;
10967
+ sum2 += (ggml_float)(v * v);
10968
+ }
10969
+ }
10970
+ }
10971
+ float variance = sum2 / (ne00 * ne01 * step);
10972
+ const float scale = 1.0f / sqrtf(variance + eps);
10973
+
10974
+ for (int64_t i02 = start; i02 < end; i02++) {
10975
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
10976
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
10977
+ ggml_vec_scale_f32(ne00, y, scale);
10978
+ }
10979
+ }
10980
+ }
10981
+ }
10982
+ }
10983
+
10984
+ static void ggml_compute_forward_group_norm(
10985
+ const struct ggml_compute_params * params,
10986
+ const struct ggml_tensor * src0,
10987
+ struct ggml_tensor * dst) {
10988
+ switch (src0->type) {
10989
+ case GGML_TYPE_F32:
10990
+ {
10991
+ ggml_compute_forward_group_norm_f32(params, src0, dst);
10992
+ } break;
10993
+ default:
10994
+ {
10995
+ GGML_ASSERT(false);
10996
+ } break;
10997
+ }
10998
+ }
10999
+
11000
+ // ggml_compute_forward_mul_mat
11001
+
11002
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
11003
+ // helper function to determine if it is better to use BLAS or not
11004
+ // for large matrices, BLAS is faster
11005
+ static bool ggml_compute_forward_mul_mat_use_blas(
11006
+ const struct ggml_tensor * src0,
11007
+ const struct ggml_tensor * src1,
11008
+ struct ggml_tensor * dst) {
11009
+ //const int64_t ne00 = src0->ne[0];
10585
11010
  //const int64_t ne01 = src0->ne[1];
10586
11011
 
10587
11012
  const int64_t ne10 = src1->ne[0];
@@ -10629,7 +11054,7 @@ static void ggml_compute_forward_mul_mat(
10629
11054
  GGML_ASSERT(ne3 == ne13);
10630
11055
 
10631
11056
  // we don't support permuted src0 or src1
10632
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
11057
+ GGML_ASSERT(nb00 == ggml_type_size(type));
10633
11058
  GGML_ASSERT(nb10 == sizeof(float));
10634
11059
 
10635
11060
  // dst cannot be transposed or permuted
@@ -10638,6 +11063,10 @@ static void ggml_compute_forward_mul_mat(
10638
11063
  GGML_ASSERT(nb1 <= nb2);
10639
11064
  GGML_ASSERT(nb2 <= nb3);
10640
11065
 
11066
+ // broadcast factors
11067
+ const int64_t r2 = ne12/ne02;
11068
+ const int64_t r3 = ne13/ne03;
11069
+
10641
11070
  // nb01 >= nb00 - src0 is not transposed
10642
11071
  // compute by src0 rows
10643
11072
 
@@ -10657,11 +11086,6 @@ static void ggml_compute_forward_mul_mat(
10657
11086
 
10658
11087
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
10659
11088
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
10660
- // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
10661
- // ref: https://github.com/ggerganov/ggml/pull/224
10662
- GGML_ASSERT(ne02 == ne12);
10663
- GGML_ASSERT(ne03 == ne13);
10664
-
10665
11089
  if (params->ith != 0) {
10666
11090
  return;
10667
11091
  }
@@ -10674,12 +11098,16 @@ static void ggml_compute_forward_mul_mat(
10674
11098
  return;
10675
11099
  }
10676
11100
 
10677
- for (int64_t i03 = 0; i03 < ne03; i03++) {
10678
- for (int64_t i02 = 0; i02 < ne02; i02++) {
10679
- const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
10680
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
11101
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
11102
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
11103
+ // broadcast src0 into src1 across 2nd,3rd dimension
11104
+ const int64_t i03 = i13/r3;
11105
+ const int64_t i02 = i12/r2;
11106
+
11107
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
11108
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
10681
11109
 
10682
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
11110
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
10683
11111
 
10684
11112
  if (type != GGML_TYPE_F32) {
10685
11113
  float * const wdata = params->wdata;
@@ -10687,7 +11115,7 @@ static void ggml_compute_forward_mul_mat(
10687
11115
 
10688
11116
  size_t id = 0;
10689
11117
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
10690
- to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
11118
+ to_float((const char *) x + i01*nb01, wdata + id, ne00);
10691
11119
  id += ne00;
10692
11120
  }
10693
11121
 
@@ -10712,7 +11140,7 @@ static void ggml_compute_forward_mul_mat(
10712
11140
  if (params->type == GGML_TASK_INIT) {
10713
11141
  if (src1->type != vec_dot_type) {
10714
11142
  char * wdata = params->wdata;
10715
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11143
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
10716
11144
 
10717
11145
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
10718
11146
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10732,7 +11160,7 @@ static void ggml_compute_forward_mul_mat(
10732
11160
  }
10733
11161
 
10734
11162
  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
10735
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
11163
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
10736
11164
 
10737
11165
  const int64_t nr0 = ne01; // src0 rows
10738
11166
  const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@@ -10767,10 +11195,6 @@ static void ggml_compute_forward_mul_mat(
10767
11195
  assert(ne12 % ne02 == 0);
10768
11196
  assert(ne13 % ne03 == 0);
10769
11197
 
10770
- // broadcast factors
10771
- const int64_t r2 = ne12/ne02;
10772
- const int64_t r3 = ne13/ne03;
10773
-
10774
11198
  // block-tiling attempt
10775
11199
  const int64_t blck_0 = 16;
10776
11200
  const int64_t blck_1 = 16;
@@ -11205,7 +11629,7 @@ static void ggml_compute_forward_get_rows_q(
11205
11629
 
11206
11630
  assert( dst->ne[0] == nc);
11207
11631
  assert( dst->ne[1] == nr);
11208
- assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
11632
+ assert(src0->nb[0] == ggml_type_size(type));
11209
11633
 
11210
11634
  for (int i = 0; i < nr; ++i) {
11211
11635
  const int r = ((int32_t *) src1->data)[i];
@@ -11926,7 +12350,6 @@ static void ggml_compute_forward_alibi(
11926
12350
  }
11927
12351
  }
11928
12352
 
11929
-
11930
12353
  // ggml_compute_forward_clamp
11931
12354
 
11932
12355
  static void ggml_compute_forward_clamp_f32(
@@ -12015,12 +12438,18 @@ static void ggml_compute_forward_rope_f32(
12015
12438
  float freq_base;
12016
12439
  float freq_scale;
12017
12440
 
12441
+ // these two only relevant for xPos RoPE:
12442
+ float xpos_base;
12443
+ bool xpos_down;
12444
+
12018
12445
  const int n_past = ((int32_t *) dst->op_params)[0];
12019
12446
  const int n_dims = ((int32_t *) dst->op_params)[1];
12020
12447
  const int mode = ((int32_t *) dst->op_params)[2];
12021
12448
  const int n_ctx = ((int32_t *) dst->op_params)[3];
12022
12449
  memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
12023
12450
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12451
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
12452
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
12024
12453
 
12025
12454
  assert(n_past >= 0);
12026
12455
 
@@ -12092,6 +12521,9 @@ static void ggml_compute_forward_rope_f32(
12092
12521
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12093
12522
  const float cos_theta = cosf(theta);
12094
12523
  const float sin_theta = sinf(theta);
12524
+ // zeta scaling for xPos only:
12525
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
12526
+ if (xpos_down) zeta = 1.0f / zeta;
12095
12527
 
12096
12528
  theta *= theta_scale;
12097
12529
 
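Written out, the xPos damping applied to each rotated pair above is

    zeta = ((i0 + 0.4*ne0) / (1.4*ne0)) ^ ((n_past + i2) / xpos_base)

inverted when xpos_down is set; with xpos_base left at 0.0f the factor is 1 and plain RoPE is recovered.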
@@ -12101,11 +12533,11 @@ static void ggml_compute_forward_rope_f32(
12101
12533
  const float x0 = src[0];
12102
12534
  const float x1 = src[1];
12103
12535
 
12104
- dst_data[0] = x0*cos_theta - x1*sin_theta;
12105
- dst_data[1] = x0*sin_theta + x1*cos_theta;
12536
+ dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
12537
+ dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
12106
12538
  }
12107
12539
  } else {
12108
- // TODO: this is probably wrong, but I can't figure it out ..
12540
+ // TODO: this might be wrong for ne0 != n_dims - need double check
12109
12541
  // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
12110
12542
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
12111
12543
  for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12234,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
12234
12666
  dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12235
12667
  }
12236
12668
  } else {
12237
- // TODO: this is probably wrong, but I can't figure it out ..
12669
+ // TODO: this might be wrong for ne0 != n_dims - need double check
12238
12670
  // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
12239
12671
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
12240
12672
  for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12296,9 +12728,21 @@ static void ggml_compute_forward_rope_back_f32(
12296
12728
  // dx = rope_back(dy, src1)
12297
12729
  // src0 is dy, src1 contains options
12298
12730
 
12731
+ float freq_base;
12732
+ float freq_scale;
12733
+
12734
+ // these two only relevant for xPos RoPE:
12735
+ float xpos_base;
12736
+ bool xpos_down;
12737
+
12299
12738
  const int n_past = ((int32_t *) dst->op_params)[0];
12300
12739
  const int n_dims = ((int32_t *) dst->op_params)[1];
12301
12740
  const int mode = ((int32_t *) dst->op_params)[2];
12741
+ const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
12742
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
12743
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
12744
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
12745
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
12302
12746
 
12303
12747
  assert(n_past >= 0);
12304
12748
 
@@ -12324,7 +12768,7 @@ static void ggml_compute_forward_rope_back_f32(
12324
12768
  // row index used to determine which thread to use
12325
12769
  int ir = 0;
12326
12770
 
12327
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
12771
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
12328
12772
 
12329
12773
  const bool is_neox = mode & 2;
12330
12774
 
@@ -12335,12 +12779,15 @@ static void ggml_compute_forward_rope_back_f32(
12335
12779
  if (ir++ < ir0) continue;
12336
12780
  if (ir > ir1) break;
12337
12781
 
12338
- float theta = (float)p;
12782
+ float theta = freq_scale * (float)p;
12339
12783
 
12340
12784
  if (!is_neox) {
12341
12785
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12342
12786
  const float cos_theta = cosf(theta);
12343
12787
  const float sin_theta = sinf(theta);
12788
+ // zeta scaling for xPos only:
12789
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
12790
+ if (xpos_down) zeta = 1.0f / zeta;
12344
12791
 
12345
12792
  theta *= theta_scale;
12346
12793
 
@@ -12350,8 +12797,8 @@ static void ggml_compute_forward_rope_back_f32(
12350
12797
  const float dy0 = dy[0];
12351
12798
  const float dy1 = dy[1];
12352
12799
 
12353
- dx[0] = dy0*cos_theta + dy1*sin_theta;
12354
- dx[1] = - dy0*sin_theta + dy1*cos_theta;
12800
+ dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
12801
+ dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
12355
12802
  }
12356
12803
  } else {
12357
12804
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
@@ -13044,6 +13491,108 @@ static void ggml_compute_forward_conv_2d(
13044
13491
  }
13045
13492
  }
13046
13493
 
13494
+ // ggml_compute_forward_conv_transpose_2d
13495
+
13496
+ static void ggml_compute_forward_conv_transpose_2d(
13497
+ const struct ggml_compute_params * params,
13498
+ const struct ggml_tensor * src0,
13499
+ const struct ggml_tensor * src1,
13500
+ const struct ggml_tensor * opt0,
13501
+ struct ggml_tensor * dst) {
13502
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
13503
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
13504
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
13505
+
13506
+ int64_t t0 = ggml_perf_time_us();
13507
+ UNUSED(t0);
13508
+
13509
+ GGML_TENSOR_BINARY_OP_LOCALS;
13510
+
13511
+ const int ith = params->ith;
13512
+ const int nth = params->nth;
13513
+
13514
+ const int nk = ne00*ne01*ne02*ne03;
13515
+
13516
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13517
+ GGML_ASSERT(nb10 == sizeof(float));
13518
+
13519
+ if (params->type == GGML_TASK_INIT) {
13520
+ memset(params->wdata, 0, params->wsize);
13521
+
13522
+ // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
13523
+ {
13524
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13525
+
13526
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
13527
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
13528
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
13529
+ ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
13530
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
13531
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
13532
+ dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
13533
+ }
13534
+ }
13535
+ }
13536
+ }
13537
+ }
13538
+
13539
+ // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
13540
+ {
13541
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
13542
+ for (int i12 = 0; i12 < ne12; i12++) {
13543
+ for (int i11 = 0; i11 < ne11; i11++) {
13544
+ const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
13545
+ ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
13546
+ for (int i10 = 0; i10 < ne10; i10++) {
13547
+ dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
13548
+ }
13549
+ }
13550
+ }
13551
+ }
13552
+
13553
+ return;
13554
+ }
13555
+
13556
+ if (params->type == GGML_TASK_FINALIZE) {
13557
+ return;
13558
+ }
13559
+
13560
+ const int32_t stride = ((const int32_t*)(opt0->data))[0];
13561
+
13562
+ // total patches in dst
13563
+ const int np = ne2;
13564
+
13565
+ // patches per thread
13566
+ const int dp = (np + nth - 1)/nth;
13567
+
13568
+ // patch range for this thread
13569
+ const int ip0 = dp*ith;
13570
+ const int ip1 = MIN(ip0 + dp, np);
13571
+
13572
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13573
+ ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
13574
+
13575
+ for (int i2 = ip0; i2 < ip1; i2++) { // Cout
13576
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
13577
+ ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
13578
+ for (int i11 = 0; i11 < ne11; i11++) {
13579
+ for (int i10 = 0; i10 < ne10; i10++) {
13580
+ const int i1n = i11*ne10*ne12 + i10*ne12;
13581
+ for (int i01 = 0; i01 < ne01; i01++) {
13582
+ for (int i00 = 0; i00 < ne00; i00++) {
13583
+ float v = 0;
13584
+ ggml_vec_dot_f16(ne03, &v,
13585
+ (ggml_fp16_t *) wdata_src + i1n,
13586
+ (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
13587
+
13588
+ dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
13589
+ }
13590
+ }
13591
+ }
13592
+ }
13593
+ }
13594
+ }
13595
+
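A note on the scatter index in the new ggml_compute_forward_conv_transpose_2d: dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] only stays in bounds when the destination was sized with the usual stride-only (no padding) transposed-convolution relation. The sizing itself happens where the op is constructed, outside this hunk, so the helper below is just a hedged sketch of that assumption:

    // expected output extent of a transposed convolution along one axis,
    // assuming stride-only upsampling and no additional padding
    static int conv_transpose_out_size(int in_size, int kernel_size, int stride) {
        return (in_size - 1)*stride + kernel_size;
    }
    // e.g. a 16x16 source with a 4x4 kernel and stride 2 yields a 34x34 output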
13047
13596
  // ggml_compute_forward_pool_1d_sk_p0
13048
13597
 
13049
13598
  static void ggml_compute_forward_pool_1d_sk_p0(
@@ -13202,6 +13751,60 @@ static void ggml_compute_forward_pool_2d(
13202
13751
  ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
13203
13752
  }
13204
13753
 
13754
+ // ggml_compute_forward_upscale
13755
+
13756
+ static void ggml_compute_forward_upscale_f32(
13757
+ const struct ggml_compute_params * params,
13758
+ const struct ggml_tensor * src0,
13759
+ struct ggml_tensor * dst) {
13760
+
13761
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13762
+ return;
13763
+ }
13764
+
13765
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13766
+
13767
+ const int ith = params->ith;
13768
+
13769
+ GGML_TENSOR_UNARY_OP_LOCALS;
13770
+
13771
+ const int scale_factor = dst->op_params[0];
13772
+
13773
+ // TODO: optimize
13774
+
13775
+ for (int i03 = 0; i03 < ne03; i03++) {
13776
+ for (int i02 = ith; i02 < ne02; i02++) {
13777
+ for (int m = 0; m < dst->ne[1]; m++) {
13778
+ int i01 = m / scale_factor;
13779
+ for (int n = 0; n < dst->ne[0]; n++) {
13780
+ int i00 = n / scale_factor;
13781
+
13782
+ const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
13783
+
13784
+ float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
13785
+
13786
+ *y = *x;
13787
+ }
13788
+ }
13789
+ }
13790
+ }
13791
+ }
13792
+
13793
+ static void ggml_compute_forward_upscale(
13794
+ const struct ggml_compute_params * params,
13795
+ const struct ggml_tensor * src0,
13796
+ struct ggml_tensor * dst) {
13797
+ switch (src0->type) {
13798
+ case GGML_TYPE_F32:
13799
+ {
13800
+ ggml_compute_forward_upscale_f32(params, src0, dst);
13801
+ } break;
13802
+ default:
13803
+ {
13804
+ GGML_ASSERT(false);
13805
+ } break;
13806
+ }
13807
+ }
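The new upscale op is a plain nearest-neighbour repeat: each destination coordinate maps back to the source via integer division by scale_factor, exactly as in the m / scale_factor and n / scale_factor lines above. A tiny standalone sketch of that mapping over a single 2-D plane (illustrative only, not part of the ggml API):

    // nearest-neighbour 2-D upscale of a w x h plane by an integer factor
    static void upscale_nn_2d(const float * src, int w, int h, float * dst, int scale_factor) {
        const int dw = w*scale_factor;
        const int dh = h*scale_factor;
        for (int m = 0; m < dh; ++m) {
            for (int n = 0; n < dw; ++n) {
                dst[m*dw + n] = src[(m/scale_factor)*w + (n/scale_factor)];
            }
        }
    }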
13205
13808
 
13206
13809
  // ggml_compute_forward_flash_attn
13207
13810
 
@@ -14327,42 +14930,43 @@ static void ggml_compute_forward_unary(
14327
14930
  }
14328
14931
  }
14329
14932
 
14330
- // ggml_compute_forward_map_unary
14933
+ // ggml_compute_forward_get_rel_pos
14331
14934
 
14332
- static void ggml_compute_forward_map_unary_f32(
14935
+ static void ggml_compute_forward_get_rel_pos_f16(
14333
14936
  const struct ggml_compute_params * params,
14334
14937
  const struct ggml_tensor * src0,
14335
- struct ggml_tensor * dst,
14336
- const ggml_unary_op_f32_t fun) {
14337
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
14338
-
14938
+ struct ggml_tensor * dst) {
14339
14939
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14340
14940
  return;
14341
14941
  }
14342
14942
 
14343
- const int n = ggml_nrows(src0);
14344
- const int nc = src0->ne[0];
14943
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
14345
14944
 
14346
- assert( dst->nb[0] == sizeof(float));
14347
- assert(src0->nb[0] == sizeof(float));
14945
+ GGML_TENSOR_UNARY_OP_LOCALS;
14348
14946
 
14349
- for (int i = 0; i < n; i++) {
14350
- fun(nc,
14351
- (float *) ((char *) dst->data + i*( dst->nb[1])),
14352
- (float *) ((char *) src0->data + i*(src0->nb[1])));
14947
+ const int64_t w = ne1;
14948
+
14949
+ ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
14950
+ ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data;
14951
+
14952
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14953
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14954
+ const int64_t pos = (w - i1 - 1) + i2;
14955
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14956
+ dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
14957
+ }
14958
+ }
14353
14959
  }
14354
14960
  }
14355
14961
 
14356
-
14357
- static void ggml_compute_forward_map_unary(
14962
+ static void ggml_compute_forward_get_rel_pos(
14358
14963
  const struct ggml_compute_params * params,
14359
14964
  const struct ggml_tensor * src0,
14360
- struct ggml_tensor * dst,
14361
- const ggml_unary_op_f32_t fun) {
14965
+ struct ggml_tensor * dst) {
14362
14966
  switch (src0->type) {
14363
- case GGML_TYPE_F32:
14967
+ case GGML_TYPE_F16:
14364
14968
  {
14365
- ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
14969
+ ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
14366
14970
  } break;
14367
14971
  default:
14368
14972
  {
@@ -14371,34 +14975,164 @@ static void ggml_compute_forward_map_unary(
14371
14975
  }
14372
14976
  }
14373
14977
 
14374
- // ggml_compute_forward_map_binary
14978
+ // ggml_compute_forward_add_rel_pos
14375
14979
 
14376
- static void ggml_compute_forward_map_binary_f32(
14980
+ static void ggml_compute_forward_add_rel_pos_f32(
14377
14981
  const struct ggml_compute_params * params,
14378
14982
  const struct ggml_tensor * src0,
14379
14983
  const struct ggml_tensor * src1,
14380
- struct ggml_tensor * dst,
14381
- const ggml_binary_op_f32_t fun) {
14382
- assert(params->ith == 0);
14383
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
14984
+ const struct ggml_tensor * src2,
14985
+ struct ggml_tensor * dst) {
14384
14986
 
14987
+ const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14988
+ if (!inplace && params->type == GGML_TASK_INIT) {
14989
+ memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
14990
+ return;
14991
+ }
14385
14992
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14386
14993
  return;
14387
14994
  }
14388
14995
 
14389
- const int n = ggml_nrows(src0);
14390
- const int nc = src0->ne[0];
14996
+ int64_t t0 = ggml_perf_time_us();
14997
+ UNUSED(t0);
14391
14998
 
14392
- assert( dst->nb[0] == sizeof(float));
14393
- assert(src0->nb[0] == sizeof(float));
14394
- assert(src1->nb[0] == sizeof(float));
14999
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
14395
15000
 
14396
- for (int i = 0; i < n; i++) {
14397
- fun(nc,
14398
- (float *) ((char *) dst->data + i*( dst->nb[1])),
14399
- (float *) ((char *) src0->data + i*(src0->nb[1])),
14400
- (float *) ((char *) src1->data + i*(src1->nb[1])));
14401
- }
15001
+ float * src1_data = (float *) src1->data;
15002
+ float * src2_data = (float *) src2->data;
15003
+ float * dst_data = (float *) dst->data;
15004
+
15005
+ const int64_t ne10 = src1->ne[0];
15006
+ const int64_t ne11 = src1->ne[1];
15007
+ const int64_t ne12 = src1->ne[2];
15008
+ const int64_t ne13 = src1->ne[3];
15009
+
15010
+ const int ith = params->ith;
15011
+ const int nth = params->nth;
15012
+
15013
+ // total patches in dst
15014
+ const int np = ne13;
15015
+
15016
+ // patches per thread
15017
+ const int dp = (np + nth - 1)/nth;
15018
+
15019
+ // patch range for this thread
15020
+ const int ip0 = dp*ith;
15021
+ const int ip1 = MIN(ip0 + dp, np);
15022
+
15023
+
15024
+ for (int64_t i13 = ip0; i13 < ip1; ++i13) {
15025
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
15026
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
15027
+ const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
15028
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
15029
+ const int64_t jp0 = jp1 + i10;
15030
+ const float src1_e = src1_data[jp0];
15031
+ const float src2_e = src2_data[jp0];
15032
+
15033
+ const int64_t jdh = jp0 * ne10;
15034
+ const int64_t jdw = jdh - (ne10 - 1) * i10;
15035
+
15036
+ for (int64_t j = 0; j < ne10; ++j) {
15037
+ dst_data[jdh + j ] += src2_e;
15038
+ dst_data[jdw + j*ne10] += src1_e;
15039
+ }
15040
+ }
15041
+ }
15042
+ }
15043
+ }
15044
+ }
15045
+
15046
+ static void ggml_compute_forward_add_rel_pos(
15047
+ const struct ggml_compute_params * params,
15048
+ const struct ggml_tensor * src0,
15049
+ const struct ggml_tensor * src1,
15050
+ const struct ggml_tensor * src2,
15051
+ struct ggml_tensor * dst) {
15052
+ switch (src0->type) {
15053
+ case GGML_TYPE_F32:
15054
+ {
15055
+ ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
15056
+ } break;
15057
+ default:
15058
+ {
15059
+ GGML_ASSERT(false);
15060
+ } break;
15061
+ }
15062
+ }
15063
+
15064
+ // ggml_compute_forward_map_unary
15065
+
15066
+ static void ggml_compute_forward_map_unary_f32(
15067
+ const struct ggml_compute_params * params,
15068
+ const struct ggml_tensor * src0,
15069
+ struct ggml_tensor * dst,
15070
+ const ggml_unary_op_f32_t fun) {
15071
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
15072
+
15073
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15074
+ return;
15075
+ }
15076
+
15077
+ const int n = ggml_nrows(src0);
15078
+ const int nc = src0->ne[0];
15079
+
15080
+ assert( dst->nb[0] == sizeof(float));
15081
+ assert(src0->nb[0] == sizeof(float));
15082
+
15083
+ for (int i = 0; i < n; i++) {
15084
+ fun(nc,
15085
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
15086
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
15087
+ }
15088
+ }
15089
+
15090
+
15091
+ static void ggml_compute_forward_map_unary(
15092
+ const struct ggml_compute_params * params,
15093
+ const struct ggml_tensor * src0,
15094
+ struct ggml_tensor * dst,
15095
+ const ggml_unary_op_f32_t fun) {
15096
+ switch (src0->type) {
15097
+ case GGML_TYPE_F32:
15098
+ {
15099
+ ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
15100
+ } break;
15101
+ default:
15102
+ {
15103
+ GGML_ASSERT(false);
15104
+ } break;
15105
+ }
15106
+ }
15107
+
15108
+ // ggml_compute_forward_map_binary
15109
+
15110
+ static void ggml_compute_forward_map_binary_f32(
15111
+ const struct ggml_compute_params * params,
15112
+ const struct ggml_tensor * src0,
15113
+ const struct ggml_tensor * src1,
15114
+ struct ggml_tensor * dst,
15115
+ const ggml_binary_op_f32_t fun) {
15116
+ assert(params->ith == 0);
15117
+ assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
15118
+
15119
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15120
+ return;
15121
+ }
15122
+
15123
+ const int n = ggml_nrows(src0);
15124
+ const int nc = src0->ne[0];
15125
+
15126
+ assert( dst->nb[0] == sizeof(float));
15127
+ assert(src0->nb[0] == sizeof(float));
15128
+ assert(src1->nb[0] == sizeof(float));
15129
+
15130
+ for (int i = 0; i < n; i++) {
15131
+ fun(nc,
15132
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
15133
+ (float *) ((char *) src0->data + i*(src0->nb[1])),
15134
+ (float *) ((char *) src1->data + i*(src1->nb[1])));
15135
+ }
14402
15136
  }
14403
15137
 
14404
15138
 
@@ -14879,6 +15613,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14879
15613
  {
14880
15614
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14881
15615
  } break;
15616
+ case GGML_OP_CONCAT:
15617
+ {
15618
+ ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
15619
+ } break;
14882
15620
  case GGML_OP_SILU_BACK:
14883
15621
  {
14884
15622
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14895,6 +15633,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14895
15633
  {
14896
15634
  ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
14897
15635
  } break;
15636
+ case GGML_OP_GROUP_NORM:
15637
+ {
15638
+ ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
15639
+ } break;
14898
15640
  case GGML_OP_MUL_MAT:
14899
15641
  {
14900
15642
  ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
@@ -14987,6 +15729,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14987
15729
  {
14988
15730
  ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14989
15731
  } break;
15732
+ case GGML_OP_CONV_TRANSPOSE_2D:
15733
+ {
15734
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15735
+ } break;
14990
15736
  case GGML_OP_POOL_1D:
14991
15737
  {
14992
15738
  ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
@@ -14995,6 +15741,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14995
15741
  {
14996
15742
  ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
14997
15743
  } break;
15744
+ case GGML_OP_UPSCALE:
15745
+ {
15746
+ ggml_compute_forward_upscale(params, tensor->src[0], tensor);
15747
+ } break;
14998
15748
  case GGML_OP_FLASH_ATTN:
14999
15749
  {
15000
15750
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15025,6 +15775,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15025
15775
  {
15026
15776
  ggml_compute_forward_unary(params, tensor->src[0], tensor);
15027
15777
  } break;
15778
+ case GGML_OP_GET_REL_POS:
15779
+ {
15780
+ ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
15781
+ } break;
15782
+ case GGML_OP_ADD_REL_POS:
15783
+ {
15784
+ ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15785
+ } break;
15028
15786
  case GGML_OP_MAP_UNARY:
15029
15787
  {
15030
15788
  ggml_unary_op_f32_t fun;
@@ -15288,6 +16046,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15288
16046
  inplace);
15289
16047
  }
15290
16048
  } break;
16049
+ case GGML_OP_CONCAT:
16050
+ {
16051
+ GGML_ASSERT(false); // TODO: implement
16052
+ } break;
15291
16053
  case GGML_OP_SILU_BACK:
15292
16054
  {
15293
16055
  GGML_ASSERT(false); // TODO: not implemented
@@ -15310,6 +16072,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15310
16072
  {
15311
16073
  GGML_ASSERT(false); // TODO: not implemented
15312
16074
  } break;
16075
+ case GGML_OP_GROUP_NORM:
16076
+ {
16077
+ GGML_ASSERT(false); // TODO: not implemented
16078
+ } break;
15313
16079
  case GGML_OP_MUL_MAT:
15314
16080
  {
15315
16081
  // https://cs231n.github.io/optimization-2/#staged
@@ -15584,6 +16350,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15584
16350
  const int n_dims = ((int32_t *) tensor->op_params)[1];
15585
16351
  const int mode = ((int32_t *) tensor->op_params)[2];
15586
16352
  const int n_ctx = ((int32_t *) tensor->op_params)[3];
16353
+ float freq_base;
16354
+ float freq_scale;
16355
+ float xpos_base;
16356
+ bool xpos_down;
16357
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
16358
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
16359
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
16360
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
16361
+
15587
16362
  src0->grad = ggml_add_impl(ctx,
15588
16363
  src0->grad,
15589
16364
  ggml_rope_back(ctx,
@@ -15591,7 +16366,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15591
16366
  n_past,
15592
16367
  n_dims,
15593
16368
  mode,
15594
- n_ctx),
16369
+ n_ctx,
16370
+ freq_base,
16371
+ freq_scale,
16372
+ xpos_base,
16373
+ xpos_down),
15595
16374
  inplace);
15596
16375
  }
15597
16376
  } break;
@@ -15602,14 +16381,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15602
16381
  const int n_dims = ((int32_t *) tensor->op_params)[1];
15603
16382
  const int mode = ((int32_t *) tensor->op_params)[2];
15604
16383
  const int n_ctx = ((int32_t *) tensor->op_params)[3];
16384
+ float freq_base;
16385
+ float freq_scale;
16386
+ float xpos_base;
16387
+ bool xpos_down;
16388
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
16389
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
16390
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
16391
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
16392
+
15605
16393
  src0->grad = ggml_add_impl(ctx,
15606
16394
  src0->grad,
15607
- ggml_rope(ctx,
16395
+ ggml_rope_impl(ctx,
15608
16396
  tensor->grad,
15609
16397
  n_past,
15610
16398
  n_dims,
15611
16399
  mode,
15612
- n_ctx),
16400
+ n_ctx,
16401
+ freq_base,
16402
+ freq_scale,
16403
+ xpos_base,
16404
+ xpos_down,
16405
+ false),
15613
16406
  inplace);
15614
16407
  }
15615
16408
  } break;
@@ -15629,6 +16422,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15629
16422
  {
15630
16423
  GGML_ASSERT(false); // TODO: not implemented
15631
16424
  } break;
16425
+ case GGML_OP_CONV_TRANSPOSE_2D:
16426
+ {
16427
+ GGML_ASSERT(false); // TODO: not implemented
16428
+ } break;
15632
16429
  case GGML_OP_POOL_1D:
15633
16430
  {
15634
16431
  GGML_ASSERT(false); // TODO: not implemented
@@ -15637,6 +16434,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15637
16434
  {
15638
16435
  GGML_ASSERT(false); // TODO: not implemented
15639
16436
  } break;
16437
+ case GGML_OP_UPSCALE:
16438
+ {
16439
+ GGML_ASSERT(false); // TODO: not implemented
16440
+ } break;
15640
16441
  case GGML_OP_FLASH_ATTN:
15641
16442
  {
15642
16443
  struct ggml_tensor * flash_grad = NULL;
@@ -15878,6 +16679,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15878
16679
  GGML_ASSERT(false);
15879
16680
  }
15880
16681
  } break;
16682
+ case GGML_OP_GET_REL_POS:
16683
+ case GGML_OP_ADD_REL_POS:
15881
16684
  case GGML_OP_MAP_UNARY:
15882
16685
  case GGML_OP_MAP_BINARY:
15883
16686
  case GGML_OP_MAP_CUSTOM1_F32:
@@ -16382,7 +17185,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16382
17185
 
16383
17186
  size_t cur = 0;
16384
17187
  if (ggml_is_quantized(node->type)) {
16385
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
17188
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16386
17189
  }
16387
17190
 
16388
17191
  work_size = MAX(work_size, cur);
@@ -16395,7 +17198,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16395
17198
  size_t cur = 0;
16396
17199
 
16397
17200
  if (ggml_is_quantized(node->src[0]->type)) {
16398
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
17201
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16399
17202
  }
16400
17203
 
16401
17204
  work_size = MAX(work_size, cur);
@@ -16407,7 +17210,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16407
17210
  size_t cur = 0;
16408
17211
 
16409
17212
  if (ggml_is_quantized(node->src[0]->type)) {
16410
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
17213
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16411
17214
  }
16412
17215
 
16413
17216
  work_size = MAX(work_size, cur);
@@ -16454,9 +17257,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16454
17257
  case GGML_OP_NORM:
16455
17258
  case GGML_OP_RMS_NORM:
16456
17259
  case GGML_OP_RMS_NORM_BACK:
17260
+ case GGML_OP_GROUP_NORM:
16457
17261
  {
16458
17262
  n_tasks = n_threads;
16459
17263
  } break;
17264
+ case GGML_OP_CONCAT:
16460
17265
  case GGML_OP_MUL_MAT:
16461
17266
  case GGML_OP_OUT_PROD:
16462
17267
  {
@@ -16490,12 +17295,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16490
17295
  // the threads are still spinning
16491
17296
  if (node->src[0]->type != GGML_TYPE_F32) {
16492
17297
  // here we need memory just for single 2D matrix from src0
16493
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
17298
+ cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
16494
17299
  }
16495
17300
  } else
16496
17301
  #endif
16497
17302
  if (node->src[1]->type != vec_dot_type) {
16498
- cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
17303
+ cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16499
17304
  } else {
16500
17305
  cur = 0;
16501
17306
  }
@@ -16524,6 +17329,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16524
17329
  case GGML_OP_SOFT_MAX_BACK:
16525
17330
  case GGML_OP_ROPE:
16526
17331
  case GGML_OP_ROPE_BACK:
17332
+ case GGML_OP_ADD_REL_POS:
16527
17333
  {
16528
17334
  n_tasks = n_threads;
16529
17335
  } break;
@@ -16598,6 +17404,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16598
17404
  GGML_ASSERT(false);
16599
17405
  }
16600
17406
 
17407
+ work_size = MAX(work_size, cur);
17408
+ } break;
17409
+ case GGML_OP_CONV_TRANSPOSE_2D:
17410
+ {
17411
+ n_tasks = n_threads;
17412
+
17413
+ const int64_t ne00 = node->src[0]->ne[0]; // W
17414
+ const int64_t ne01 = node->src[0]->ne[1]; // H
17415
+ const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
17416
+ const int64_t ne03 = node->src[0]->ne[3]; // Channels In
17417
+
17418
+ const int64_t ne10 = node->src[1]->ne[0]; // W
17419
+ const int64_t ne11 = node->src[1]->ne[1]; // H
17420
+ const int64_t ne12 = node->src[1]->ne[2]; // Channels In
17421
+
17422
+ size_t cur = 0;
17423
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
17424
+ cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
17425
+
16601
17426
  work_size = MAX(work_size, cur);
16602
17427
  } break;
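The GGML_OP_CONV_TRANSPOSE_2D work-size case above mirrors the INIT phase of the kernel: one fp16 copy of the permuted kernel plus one fp16 copy of the permuted source. A back-of-the-envelope example with made-up shapes, only to show the arithmetic (fp16 is 2 bytes per element):

    #include <stdio.h>
    #include <stddef.h>

    int main(void) {
        // hypothetical shapes: 4x4 kernel, 256 output channels, 512 input channels,
        // and a 32x32 source with 512 channels
        size_t wsize = 0;
        wsize += (size_t) 2*4*4*256*512; // permuted kernel copy: 4 MiB
        wsize += (size_t) 2*32*32*512;   // permuted source copy: 1 MiB
        printf("conv_transpose_2d work buffer: %zu bytes\n", wsize); // 5242880
        return 0;
    }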
16603
17428
  case GGML_OP_POOL_1D:
@@ -16605,6 +17430,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16605
17430
  {
16606
17431
  n_tasks = 1;
16607
17432
  } break;
17433
+ case GGML_OP_UPSCALE:
17434
+ {
17435
+ n_tasks = n_threads;
17436
+ } break;
16608
17437
  case GGML_OP_FLASH_ATTN:
16609
17438
  {
16610
17439
  n_tasks = n_threads;
@@ -16666,6 +17495,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16666
17495
  } break;
16667
17496
  case GGML_OP_WIN_PART:
16668
17497
  case GGML_OP_WIN_UNPART:
17498
+ case GGML_OP_GET_REL_POS:
16669
17499
  case GGML_OP_MAP_UNARY:
16670
17500
  case GGML_OP_MAP_BINARY:
16671
17501
  case GGML_OP_MAP_CUSTOM1_F32:
@@ -16783,8 +17613,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16783
17613
 
16784
17614
  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
16785
17615
  GGML_ASSERT(rc == 0);
17616
+ UNUSED(rc);
16786
17617
  }
16787
17618
  }
17619
+
16788
17620
  workers[0].ith = 0;
16789
17621
  workers[0].shared = &state_shared;
16790
17622
 
@@ -16900,7 +17732,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16900
17732
  // compute size of intermediate results
16901
17733
  // TODO: does not take into account scratch buffers !!!!
16902
17734
  for (int i = 0; i < cgraph->n_nodes; ++i) {
16903
- size_eval += ggml_nbytes(cgraph->nodes[i]);
17735
+ size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
16904
17736
  }
16905
17737
 
16906
17738
  // print
@@ -18301,8 +19133,8 @@ enum ggml_opt_result ggml_opt_resume(
18301
19133
  struct ggml_tensor * f) {
18302
19134
 
18303
19135
  // build forward + backward compute graphs
18304
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
18305
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
19136
+ struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
19137
+ struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18306
19138
 
18307
19139
  struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
18308
19140
  struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
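The gfbuf/gbbuf expressions above are a ceiling division: they request enough GGML_TYPE_I32 elements to cover sizeof(struct ggml_cgraph), since the graph structs are parked inside ordinary tensors. The same idiom written out as a small helper, for illustration only (not a ggml function):

    // number of elem_size-byte elements needed to hold nbytes bytes, rounded up
    static size_t ceil_div_elems(size_t nbytes, size_t elem_size) {
        return nbytes/elem_size + (nbytes % elem_size ? 1 : 0);
    }

    // equivalent to the expression above:
    //   ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
    //       ceil_div_elems(sizeof(struct ggml_cgraph), ggml_type_size(GGML_TYPE_I32)));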
@@ -18561,6 +19393,1005 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18561
19393
 
18562
19394
  ////////////////////////////////////////////////////////////////////////////////
18563
19395
 
19396
+ struct gguf_str {
19397
+ uint32_t n;
19398
+ char * data;
19399
+ };
19400
+
19401
+ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
19402
+ [GGUF_TYPE_UINT8] = sizeof(uint8_t),
19403
+ [GGUF_TYPE_INT8] = sizeof(int8_t),
19404
+ [GGUF_TYPE_UINT16] = sizeof(uint16_t),
19405
+ [GGUF_TYPE_INT16] = sizeof(int16_t),
19406
+ [GGUF_TYPE_UINT32] = sizeof(uint32_t),
19407
+ [GGUF_TYPE_INT32] = sizeof(int32_t),
19408
+ [GGUF_TYPE_FLOAT32] = sizeof(float),
19409
+ [GGUF_TYPE_BOOL] = sizeof(bool),
19410
+ [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
19411
+ [GGUF_TYPE_ARRAY] = 0, // undefined
19412
+ };
19413
+ static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19414
+
19415
+ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19416
+ [GGUF_TYPE_UINT8] = "u8",
19417
+ [GGUF_TYPE_INT8] = "i8",
19418
+ [GGUF_TYPE_UINT16] = "u16",
19419
+ [GGUF_TYPE_INT16] = "i16",
19420
+ [GGUF_TYPE_UINT32] = "u32",
19421
+ [GGUF_TYPE_INT32] = "i32",
19422
+ [GGUF_TYPE_FLOAT32] = "f32",
19423
+ [GGUF_TYPE_BOOL] = "bool",
19424
+ [GGUF_TYPE_STRING] = "str",
19425
+ [GGUF_TYPE_ARRAY] = "arr",
19426
+ };
19427
+ static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19428
+
19429
+ union gguf_value {
19430
+ uint8_t uint8;
19431
+ int8_t int8;
19432
+ uint16_t uint16;
19433
+ int16_t int16;
19434
+ uint32_t uint32;
19435
+ int32_t int32;
19436
+ float float32;
19437
+ bool bool_;
19438
+
19439
+ struct gguf_str str;
19440
+
19441
+ struct {
19442
+ enum gguf_type type;
19443
+
19444
+ uint32_t n;
19445
+ void * data;
19446
+ } arr;
19447
+ };
19448
+
19449
+ struct gguf_kv {
19450
+ struct gguf_str key;
19451
+
19452
+ uint32_t n_bytes; // TODO: is this actually needed?
19453
+
19454
+ enum gguf_type type;
19455
+ union gguf_value value;
19456
+ };
19457
+
19458
+ struct gguf_header {
19459
+ uint32_t magic;
19460
+ uint32_t version;
19461
+ uint32_t n_tensors;
19462
+ uint32_t n_kv;
19463
+ };
19464
+
19465
+ struct gguf_tensor_info {
19466
+ struct gguf_str name;
19467
+
19468
+ uint32_t n_dims;
19469
+ uint32_t ne[GGML_MAX_DIMS];
19470
+
19471
+ enum ggml_type type;
19472
+
19473
+ uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
19474
+
19475
+ // for writing API
19476
+ const void * data;
19477
+ size_t size;
19478
+ };
19479
+
19480
+ struct gguf_context {
19481
+ struct gguf_header header;
19482
+
19483
+ struct gguf_kv * kv;
19484
+ struct gguf_tensor_info * infos;
19485
+
19486
+ size_t alignment;
19487
+ size_t offset; // offset of `data` from beginning of file
19488
+ size_t size; // size of `data` in bytes
19489
+
19490
+ //uint8_t * padding;
19491
+ void * data;
19492
+ };
19493
+
19494
+ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
19495
+ const size_t n = fread(dst, 1, size, file);
19496
+ *offset += n;
19497
+ return n == size;
19498
+ }
19499
+
19500
+ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
19501
+ p->n = 0;
19502
+ p->data = NULL;
19503
+
19504
+ bool ok = true;
19505
+
19506
+ // TODO: how to avoid mallocs for strings?
19507
+ ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
19508
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19509
+
19510
+ return ok;
19511
+ }
19512
+
19513
+ struct gguf_context * gguf_init_empty(void) {
19514
+ struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19515
+
19516
+ ctx->header.magic = GGUF_MAGIC;
19517
+ ctx->header.version = GGUF_VERSION;
19518
+ ctx->header.n_tensors = 0;
19519
+ ctx->header.n_kv = 0;
19520
+
19521
+ ctx->kv = NULL;
19522
+ ctx->infos = NULL;
19523
+
19524
+ ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
19525
+ ctx->offset = 0;
19526
+ ctx->size = 0;
19527
+
19528
+ ctx->data = NULL;
19529
+
19530
+ return ctx;
19531
+ }
19532
+
19533
+ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
19534
+ FILE * file = fopen(fname, "rb");
19535
+ if (!file) {
19536
+ return NULL;
19537
+ }
19538
+
19539
+ // offset from start of file
19540
+ size_t offset = 0;
19541
+
19542
+ uint32_t magic = 0;
19543
+
19544
+ // check the magic before making allocations
19545
+ {
19546
+ gguf_fread_el(file, &magic, sizeof(magic), &offset);
19547
+
19548
+ if (magic != GGUF_MAGIC) {
19549
+ fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
19550
+ fclose(file);
19551
+ return NULL;
19552
+ }
19553
+ }
19554
+
19555
+ bool ok = true;
19556
+
19557
+ struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19558
+
19559
+ // read the header
19560
+ {
19561
+ ctx->header.magic = magic;
19562
+
19563
+ ctx->kv = NULL;
19564
+ ctx->infos = NULL;
19565
+ ctx->data = NULL;
19566
+
19567
+ ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
19568
+ ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19569
+ ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19570
+
19571
+ if (!ok) {
19572
+ fprintf(stderr, "%s: failed to read header\n", __func__);
19573
+ fclose(file);
19574
+ gguf_free(ctx);
19575
+ return NULL;
19576
+ }
19577
+ }
19578
+
19579
+ // read the kv pairs
19580
+ {
19581
+ ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19582
+
19583
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19584
+ struct gguf_kv * kv = &ctx->kv[i];
19585
+
19586
+ //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
19587
+
19588
+ ok = ok && gguf_fread_str(file, &kv->key, &offset);
19589
+ //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
19590
+ ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19591
+
19592
+ //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
19593
+
19594
+ switch (kv->type) {
19595
+ case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
19596
+ case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
19597
+ case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
19598
+ case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
19599
+ case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
19600
+ case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
19601
+ case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
19602
+ case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
19603
+ case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
19604
+ case GGUF_TYPE_ARRAY:
19605
+ {
19606
+ ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19607
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19608
+
19609
+ switch (kv->value.arr.type) {
19610
+ case GGUF_TYPE_UINT8:
19611
+ case GGUF_TYPE_INT8:
19612
+ case GGUF_TYPE_UINT16:
19613
+ case GGUF_TYPE_INT16:
19614
+ case GGUF_TYPE_UINT32:
19615
+ case GGUF_TYPE_INT32:
19616
+ case GGUF_TYPE_FLOAT32:
19617
+ case GGUF_TYPE_BOOL:
19618
+ {
19619
+ kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
19620
+ ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
19621
+ } break;
19622
+ case GGUF_TYPE_STRING:
19623
+ {
19624
+ kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
19625
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
19626
+ ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
19627
+ }
19628
+ } break;
19629
+ case GGUF_TYPE_ARRAY:
19630
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
19631
+ };
19632
+ } break;
19633
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
19634
+ };
19635
+
19636
+ if (!ok) {
19637
+ break;
19638
+ }
19639
+ }
19640
+
19641
+ if (!ok) {
19642
+ fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
19643
+ fclose(file);
19644
+ gguf_free(ctx);
19645
+ return NULL;
19646
+ }
19647
+ }
19648
+
19649
+ // read the tensor infos
19650
+ {
19651
+ ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19652
+
19653
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19654
+ struct gguf_tensor_info * info = &ctx->infos[i];
19655
+
19656
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
19657
+ info->ne[j] = 1;
19658
+ }
19659
+
19660
+ ok = ok && gguf_fread_str(file, &info->name, &offset);
19661
+ ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19662
+ for (uint32_t j = 0; j < info->n_dims; ++j) {
19663
+ ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19664
+ }
19665
+ ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19666
+ ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
19667
+
19668
+ if (!ok) {
19669
+ fprintf(stderr, "%s: failed to read tensor info\n", __func__);
19670
+ fclose(file);
19671
+ gguf_free(ctx);
19672
+ return NULL;
19673
+ }
19674
+ }
19675
+ }
19676
+
19677
+ ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
19678
+
19679
+ int alignment_idx = gguf_find_key(ctx, "general.alignment");
19680
+ if (alignment_idx != -1) {
19681
+ ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
19682
+ }
19683
+
19684
+ // we require the data section to be aligned, so take into account any padding
19685
+ {
19686
+ const size_t offset_pad = offset % ctx->alignment;
19687
+
19688
+ if (offset_pad != 0) {
19689
+ offset += ctx->alignment - offset_pad;
19690
+ fseek(file, offset, SEEK_SET);
19691
+ }
19692
+ }
19693
+
19694
+ // store the current file offset - this is where the data section starts
19695
+ ctx->offset = offset;
19696
+
19697
+ // compute the total size of the data section, taking into account the alignment
19698
+ {
19699
+ ctx->size = 0;
19700
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19701
+ struct gguf_tensor_info * info = &ctx->infos[i];
19702
+
19703
+ const int64_t ne =
19704
+ (int64_t) info->ne[0] *
19705
+ (int64_t) info->ne[1] *
19706
+ (int64_t) info->ne[2] *
19707
+ (int64_t) info->ne[3];
19708
+
19709
+ if (ne % ggml_blck_size(info->type) != 0) {
19710
+ fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
19711
+ __func__, info->name.data, ne, ggml_blck_size(info->type));
19712
+ fclose(file);
19713
+ gguf_free(ctx);
19714
+ return NULL;
19715
+ }
19716
+
19717
+ const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
19718
+
19719
+ ctx->size += GGML_PAD(size_cur, ctx->alignment);
19720
+ }
19721
+ }
19722
+
19723
+ // load the tensor data only if requested
19724
+ if (params.ctx != NULL) {
19725
+ // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
19726
+ // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
19727
+ // the ggml_tensor structs to the appropriate locations in the binary blob
19728
+
19729
+ // compute the exact size needed for the new ggml_context
19730
+ const size_t mem_size =
19731
+ params.no_alloc ?
19732
+ (ctx->header.n_tensors )*ggml_tensor_overhead() :
19733
+ (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
19734
+
19735
+ struct ggml_init_params pdata = {
19736
+ .mem_size = mem_size,
19737
+ .mem_buffer = NULL,
19738
+ .no_alloc = params.no_alloc,
19739
+ };
19740
+
19741
+ *params.ctx = ggml_init(pdata);
19742
+
19743
+ struct ggml_context * ctx_data = *params.ctx;
19744
+
19745
+ struct ggml_tensor * data = NULL;
19746
+
19747
+ if (params.no_alloc == false) {
19748
+ data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
19749
+
19750
+ ok = ok && data != NULL;
19751
+
19752
+ // read the binary blob with the tensor data
19753
+ ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
19754
+
19755
+ if (!ok) {
19756
+ fprintf(stderr, "%s: failed to read tensor data\n", __func__);
19757
+ fclose(file);
19758
+ ggml_free(ctx_data);
19759
+ gguf_free(ctx);
19760
+ return NULL;
19761
+ }
19762
+
19763
+ ctx->data = data->data;
19764
+ }
19765
+
19766
+ ggml_set_no_alloc(ctx_data, true);
19767
+
19768
+ // create the tensors
19769
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19770
+ const int64_t ne[GGML_MAX_DIMS] = {
19771
+ ctx->infos[i].ne[0],
19772
+ ctx->infos[i].ne[1],
19773
+ ctx->infos[i].ne[2],
19774
+ ctx->infos[i].ne[3],
19775
+ };
19776
+
19777
+ struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
19778
+
19779
+ ok = ok && cur != NULL;
19780
+
19781
+ ggml_set_name(cur, ctx->infos[i].name.data);
19782
+
19783
+ if (!ok) {
19784
+ break;
19785
+ }
19786
+
19787
+ // point the data member to the appropriate location in the binary blob using the tensor infos
19788
+ if (params.no_alloc == false) {
19789
+ //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
19790
+ cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
19791
+ }
19792
+ }
19793
+
19794
+ if (!ok) {
19795
+ fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
19796
+ fclose(file);
19797
+ ggml_free(ctx_data);
19798
+ gguf_free(ctx);
19799
+ return NULL;
19800
+ }
19801
+
19802
+ ggml_set_no_alloc(ctx_data, params.no_alloc);
19803
+ }
19804
+
19805
+ fclose(file);
19806
+
19807
+ return ctx;
19808
+ }
19809
+
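gguf_init_from_file plus the accessors that follow form the read side of the new GGUF API. Below is a hedged usage sketch, assuming the gguf_*/ggml_* declarations come from this release's ggml.h and that "model.gguf" is a file produced by the matching writer; with no_alloc set, only the metadata is read and the tensor blob is skipped:

    #include <stdio.h>
    #include <stdbool.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_context * meta = NULL;

        struct gguf_init_params params = {
            .no_alloc = true,   // create empty tensors, do not read the data blob
            .ctx      = &meta,
        };

        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to open gguf file\n");
            return 1;
        }

        for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
            printf("kv %d: %s (%s)\n", i,
                gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
        }
        for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
            printf("tensor %d: %s @ offset %zu\n", i,
                gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
        }

        gguf_free(ctx);
        if (meta) {
            ggml_free(meta);
        }
        return 0;
    }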
19810
+ void gguf_free(struct gguf_context * ctx) {
19811
+ if (ctx == NULL) {
19812
+ return;
19813
+ }
19814
+
19815
+ if (ctx->kv) {
19816
+ // free string memory - not great..
19817
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19818
+ struct gguf_kv * kv = &ctx->kv[i];
19819
+
19820
+ if (kv->key.data) {
19821
+ free(kv->key.data);
19822
+ }
19823
+
19824
+ if (kv->type == GGUF_TYPE_STRING) {
19825
+ if (kv->value.str.data) {
19826
+ free(kv->value.str.data);
19827
+ }
19828
+ }
19829
+
19830
+ if (kv->type == GGUF_TYPE_ARRAY) {
19831
+ if (kv->value.arr.data) {
19832
+ if (kv->value.arr.type == GGUF_TYPE_STRING) {
19833
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
19834
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
19835
+ if (str->data) {
19836
+ free(str->data);
19837
+ }
19838
+ }
19839
+ }
19840
+ free(kv->value.arr.data);
19841
+ }
19842
+ }
19843
+ }
19844
+
19845
+ GGML_ALIGNED_FREE(ctx->kv);
19846
+ }
19847
+
19848
+ if (ctx->infos) {
19849
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19850
+ struct gguf_tensor_info * info = &ctx->infos[i];
19851
+
19852
+ if (info->name.data) {
19853
+ free(info->name.data);
19854
+ }
19855
+ }
19856
+
19857
+ GGML_ALIGNED_FREE(ctx->infos);
19858
+ }
19859
+
19860
+ GGML_ALIGNED_FREE(ctx);
19861
+ }
19862
+
19863
+ const char * gguf_type_name(enum gguf_type type) {
19864
+ return GGUF_TYPE_NAME[type];
19865
+ }
19866
+
19867
+ int gguf_get_version(struct gguf_context * ctx) {
19868
+ return ctx->header.version;
19869
+ }
19870
+
19871
+ size_t gguf_get_alignment(struct gguf_context * ctx) {
19872
+ return ctx->alignment;
19873
+ }
19874
+
19875
+ size_t gguf_get_data_offset(struct gguf_context * ctx) {
19876
+ return ctx->offset;
19877
+ }
19878
+
19879
+ void * gguf_get_data(struct gguf_context * ctx) {
19880
+ return ctx->data;
19881
+ }
19882
+
19883
+ int gguf_get_n_kv(struct gguf_context * ctx) {
19884
+ return ctx->header.n_kv;
19885
+ }
19886
+
19887
+ int gguf_find_key(struct gguf_context * ctx, const char * key) {
19888
+ // return -1 if key not found
19889
+ int keyfound = -1;
19890
+
19891
+ const int n_kv = gguf_get_n_kv(ctx);
19892
+
19893
+ for (int i = 0; i < n_kv; ++i) {
19894
+ if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
19895
+ keyfound = i;
19896
+ break;
19897
+ }
19898
+ }
19899
+
19900
+ return keyfound;
19901
+ }
19902
+
19903
+ const char * gguf_get_key(struct gguf_context * ctx, int i) {
19904
+ return ctx->kv[i].key.data;
19905
+ }
19906
+
19907
+ enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
19908
+ return ctx->kv[i].type;
19909
+ }
19910
+
19911
+ enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
19912
+ return ctx->kv[i].value.arr.type;
19913
+ }
19914
+
19915
+ const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
19916
+ return ctx->kv[i].value.arr.data;
19917
+ }
19918
+
19919
+ const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
19920
+ struct gguf_kv * kv = &ctx->kv[key_id];
19921
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
19922
+ return str->data;
19923
+ }
19924
+
19925
+ int gguf_get_arr_n(struct gguf_context * ctx, int i) {
19926
+ return ctx->kv[i].value.arr.n;
19927
+ }
19928
+
19929
+ uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
19930
+ return ctx->kv[i].value.uint8;
19931
+ }
19932
+
19933
+ int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
19934
+ return ctx->kv[i].value.int8;
19935
+ }
19936
+
19937
+ uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
19938
+ return ctx->kv[i].value.uint16;
19939
+ }
19940
+
19941
+ int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
19942
+ return ctx->kv[i].value.int16;
19943
+ }
19944
+
19945
+ uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
19946
+ return ctx->kv[i].value.uint32;
19947
+ }
19948
+
19949
+ int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
19950
+ return ctx->kv[i].value.int32;
19951
+ }
19952
+
19953
+ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
19954
+ return ctx->kv[i].value.float32;
19955
+ }
19956
+
19957
+ bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
19958
+ return ctx->kv[i].value.bool_;
19959
+ }
19960
+
19961
+ const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
19962
+ return ctx->kv[i].value.str.data;
19963
+ }
19964
+
19965
+ int gguf_get_n_tensors(struct gguf_context * ctx) {
19966
+ return ctx->header.n_tensors;
19967
+ }
19968
+
19969
+ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
19970
+ // return -1 if tensor not found
19971
+ int tensorfound = -1;
19972
+
19973
+ const int n_tensors = gguf_get_n_tensors(ctx);
19974
+
19975
+ for (int i = 0; i < n_tensors; ++i) {
19976
+ if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
19977
+ tensorfound = i;
19978
+ break;
19979
+ }
19980
+ }
19981
+
19982
+ return tensorfound;
19983
+ }
19984
+
19985
+ size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
19986
+ return ctx->infos[i].offset;
19987
+ }
19988
+
19989
+ char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
19990
+ return ctx->infos[i].name.data;
19991
+ }
19992
+
19993
+ // returns the index
19994
+ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
19995
+ const int idx = gguf_find_key(ctx, key);
19996
+ if (idx >= 0) {
19997
+ return idx;
19998
+ }
19999
+
20000
+ const int n_kv = gguf_get_n_kv(ctx);
20001
+
20002
+ ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
20003
+ ctx->kv[n_kv].key.n = strlen(key) + 1;
20004
+ ctx->kv[n_kv].key.data = strdup(key);
20005
+ ctx->header.n_kv++;
20006
+
20007
+ return n_kv;
20008
+ }
20009
+
20010
+ void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
20011
+ const int idx = gguf_get_or_add_key(ctx, key);
20012
+
20013
+ ctx->kv[idx].type = GGUF_TYPE_UINT8;
20014
+ ctx->kv[idx].value.uint8 = val;
20015
+ }
20016
+
20017
+ void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
20018
+ const int idx = gguf_get_or_add_key(ctx, key);
20019
+
20020
+ ctx->kv[idx].type = GGUF_TYPE_INT8;
20021
+ ctx->kv[idx].value.int8 = val;
20022
+ }
20023
+
20024
+ void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
20025
+ const int idx = gguf_get_or_add_key(ctx, key);
20026
+
20027
+ ctx->kv[idx].type = GGUF_TYPE_UINT16;
20028
+ ctx->kv[idx].value.uint16 = val;
20029
+ }
20030
+
20031
+ void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
20032
+ const int idx = gguf_get_or_add_key(ctx, key);
20033
+
20034
+ ctx->kv[idx].type = GGUF_TYPE_INT16;
20035
+ ctx->kv[idx].value.int16 = val;
20036
+ }
20037
+
20038
+ void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
20039
+ const int idx = gguf_get_or_add_key(ctx, key);
20040
+
20041
+ ctx->kv[idx].type = GGUF_TYPE_UINT32;
20042
+ ctx->kv[idx].value.uint32 = val;
20043
+ }
20044
+
20045
+ void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
20046
+ const int idx = gguf_get_or_add_key(ctx, key);
20047
+
20048
+ ctx->kv[idx].type = GGUF_TYPE_INT32;
20049
+ ctx->kv[idx].value.int32 = val;
20050
+ }
20051
+
20052
+ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
20053
+ const int idx = gguf_get_or_add_key(ctx, key);
20054
+
20055
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
20056
+ ctx->kv[idx].value.float32 = val;
20057
+ }
20058
+
20059
+ void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
20060
+ const int idx = gguf_get_or_add_key(ctx, key);
20061
+
20062
+ ctx->kv[idx].type = GGUF_TYPE_BOOL;
20063
+ ctx->kv[idx].value.bool_ = val;
20064
+ }
20065
+
20066
+ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
20067
+ const int idx = gguf_get_or_add_key(ctx, key);
20068
+
20069
+ ctx->kv[idx].type = GGUF_TYPE_STRING;
20070
+ ctx->kv[idx].value.str.n = strlen(val) + 1;
20071
+ ctx->kv[idx].value.str.data = strdup(val);
20072
+ }
20073
+
20074
+ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
20075
+ const int idx = gguf_get_or_add_key(ctx, key);
20076
+
20077
+ ctx->kv[idx].type = GGUF_TYPE_ARRAY;
20078
+ ctx->kv[idx].value.arr.type = type;
20079
+ ctx->kv[idx].value.arr.n = n;
20080
+ ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
20081
+ memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
20082
+ }
20083
+
20084
+ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
20085
+ const int idx = gguf_get_or_add_key(ctx, key);
20086
+
20087
+ ctx->kv[idx].type = GGUF_TYPE_ARRAY;
20088
+ ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
20089
+ ctx->kv[idx].value.arr.n = n;
20090
+ ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20091
+ for (int i = 0; i < n; i++) {
20092
+ struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
20093
+ str->n = strlen(data[i]) + 1;
20094
+ str->data = strdup(data[i]);
20095
+ }
20096
+ }
20097
+
20098
+ // set or add KV pairs from another context
20099
+ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
20100
+ for (uint32_t i = 0; i < src->header.n_kv; i++) {
20101
+ switch (src->kv[i].type) {
20102
+ case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
20103
+ case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
20104
+ case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
20105
+ case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
20106
+ case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
20107
+ case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
20108
+ case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
20109
+ case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
20110
+ case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
20111
+ case GGUF_TYPE_ARRAY:
20112
+ {
20113
+ if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
20114
+ const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
20115
+ for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
20116
+ data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
20117
+ }
20118
+ gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
20119
+ free(data);
20120
+ } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
20121
+ GGML_ASSERT(false && "nested arrays not supported");
20122
+ } else {
20123
+ gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
20124
+ }
20125
+ } break;
20126
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20127
+ }
20128
+ }
20129
+ }
20130
+
20131
+ void gguf_add_tensor(
20132
+ struct gguf_context * ctx,
20133
+ const struct ggml_tensor * tensor) {
20134
+ const int idx = ctx->header.n_tensors;
20135
+ ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
20136
+
20137
+ ctx->infos[idx].name.n = strlen(tensor->name) + 1;
20138
+ ctx->infos[idx].name.data = strdup(tensor->name);
20139
+
20140
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
20141
+ ctx->infos[idx].ne[i] = 1;
20142
+ }
20143
+
20144
+ ctx->infos[idx].n_dims = tensor->n_dims;
20145
+ for (int i = 0; i < tensor->n_dims; i++) {
20146
+ ctx->infos[idx].ne[i] = tensor->ne[i];
20147
+ }
20148
+
20149
+ ctx->infos[idx].type = tensor->type;
20150
+ ctx->infos[idx].offset = 0;
20151
+ ctx->infos[idx].data = tensor->data;
20152
+ ctx->infos[idx].size = ggml_nbytes(tensor);
20153
+
20154
+ if (ctx->header.n_tensors > 0) {
20155
+ ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
20156
+ }
20157
+
20158
+ ctx->header.n_tensors++;
20159
+ }
20160
+
20161
+ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
20162
+ const int idx = gguf_find_tensor(ctx, name);
20163
+ if (idx < 0) {
20164
+ GGML_ASSERT(false && "tensor not found");
20165
+ }
20166
+
20167
+ ctx->infos[idx].type = type;
20168
+ }
20169
+
20170
+ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
20171
+ const int idx = gguf_find_tensor(ctx, name);
20172
+ if (idx < 0) {
20173
+ GGML_ASSERT(false && "tensor not found");
20174
+ }
20175
+
20176
+ ctx->infos[idx].data = data;
20177
+ ctx->infos[idx].size = size;
20178
+
20179
+ // update offsets
20180
+ for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
20181
+ ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
20182
+ }
20183
+ }
20184
+
20185
+ //static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
20186
+ // fwrite(&val->n, sizeof(val->n), 1, file);
20187
+ // fwrite(val->data, sizeof(char), val->n, file);
20188
+ //}
20189
+ //
20190
+ //static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
20191
+ // fwrite(val, sizeof(char), size, file);
20192
+ //}
20193
+
20194
+ struct gguf_buf {
20195
+ void * data;
20196
+ size_t size;
20197
+ size_t offset;
20198
+ };
20199
+
20200
+ static struct gguf_buf gguf_buf_init(size_t size) {
20201
+ struct gguf_buf buf = {
20202
+ /*buf.data =*/ size == 0 ? NULL : malloc(size),
20203
+ /*buf.size =*/ size,
20204
+ /*buf.offset =*/ 0,
20205
+ };
20206
+
20207
+ return buf;
20208
+ }
20209
+
20210
+ static void gguf_buf_free(struct gguf_buf buf) {
20211
+ if (buf.data) {
20212
+ free(buf.data);
20213
+ }
20214
+ }
20215
+
20216
+ static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
20217
+ if (buf->offset + size > buf->size) {
20218
+ buf->size = 1.5*(buf->offset + size);
20219
+ if (buf->data) {
20220
+ buf->data = realloc(buf->data, buf->size);
20221
+ }
20222
+ }
20223
+ }
20224
+
20225
+ static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
20226
+ gguf_buf_grow(buf, sizeof(val->n) + val->n);
20227
+
20228
+ if (buf->data) {
20229
+ memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
20230
+ }
20231
+ buf->offset += sizeof(val->n);
20232
+
20233
+ if (buf->data) {
20234
+ memcpy((char *) buf->data + buf->offset, val->data, val->n);
20235
+ }
20236
+ buf->offset += val->n;
20237
+ }
20238
+
20239
+ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
20240
+ gguf_buf_grow(buf, el_size);
20241
+
20242
+ if (buf->data) {
20243
+ memcpy((char *) buf->data + buf->offset, val, el_size);
20244
+ }
20245
+ buf->offset += el_size;
20246
+ }
20247
+
20248
+ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+     // write header
+     gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+     gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+     gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+     gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+
+     // write key-value pairs
+     for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+         struct gguf_kv * kv = &ctx->kv[i];
+
+         gguf_bwrite_str(buf, &kv->key);
+         gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
+
+         switch (kv->type) {
+             case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+             case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+             case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+             case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+             case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+             case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+             case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+             case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+             case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str); break;
+             case GGUF_TYPE_ARRAY:
+                 {
+                     gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                     gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+
+                     switch (kv->value.arr.type) {
+                         case GGUF_TYPE_UINT8:
+                         case GGUF_TYPE_INT8:
+                         case GGUF_TYPE_UINT16:
+                         case GGUF_TYPE_INT16:
+                         case GGUF_TYPE_UINT32:
+                         case GGUF_TYPE_INT32:
+                         case GGUF_TYPE_FLOAT32:
+                         case GGUF_TYPE_BOOL:
+                             {
+                                 gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                             } break;
+                         case GGUF_TYPE_STRING:
+                             {
+                                 for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                     gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                 }
+                             } break;
+                         case GGUF_TYPE_ARRAY:
+                         case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                     };
+                 } break;
+             case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+         };
+     }
+
+     // write tensor infos
+     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+         struct gguf_tensor_info * info = &ctx->infos[i];
+
+         gguf_bwrite_str(buf, &info->name);
+         gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
+         for (uint32_t j = 0; j < info->n_dims; ++j) {
+             gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
+         }
+         gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+         gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
+     }
+
+     // we require the data section to be aligned, so take into account any padding
+     {
+         const size_t offset     = buf->offset;
+         const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
+
+         if (offset_pad != offset) {
+             uint8_t pad = 0;
+             for (size_t i = 0; i < offset_pad - offset; ++i) {
+                 gguf_bwrite_el(buf, &pad, sizeof(pad));
+             }
+         }
+     }
+
+     if (only_meta) {
+         return;
+     }
+
+     size_t offset = 0;
+
+     // write tensor data
+     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+         struct gguf_tensor_info * info = &ctx->infos[i];
+
+         const size_t size     = info->size;
+         const size_t size_pad = GGML_PAD(size, ctx->alignment);
+
+         gguf_bwrite_el(buf, info->data, size);
+
+         if (size_pad != size) {
+             uint8_t pad = 0;
+             for (size_t j = 0; j < size_pad - size; ++j) {
+                 gguf_bwrite_el(buf, &pad, sizeof(pad));
+             }
+         }
+
+         GGML_ASSERT(offset == info->offset);
+
+         offset += size_pad;
+     }
+ }
+
+ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+     FILE * file = fopen(fname, "wb");
+     if (!file) {
+         GGML_ASSERT(false && "failed to open file for writing");
+     }
+
+     struct gguf_buf buf = gguf_buf_init(16*1024);
+
+     gguf_write_to_buf(ctx, &buf, only_meta);
+
+     fwrite(buf.data, 1, buf.offset, file);
+
+     gguf_buf_free(buf);
+
+     fclose(file);
+ }
+
+ size_t gguf_get_meta_size(struct gguf_context * ctx) {
+     // no allocs - only compute size
+     struct gguf_buf buf = gguf_buf_init(0);
+
+     gguf_write_to_buf(ctx, &buf, true);
+
+     return buf.offset;
+ }
+
+ void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+     struct gguf_buf buf = gguf_buf_init(16*1024);
+
+     gguf_write_to_buf(ctx, &buf, true);
+
+     memcpy(data, buf.data, buf.offset);
+
+     gguf_buf_free(buf);
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+
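Taken together, the additions above give ggml an in-memory GGUF serializer (gguf_write_to_buf plus the gguf_buf helpers) and the public gguf_write_to_file / gguf_get_meta_size / gguf_get_meta_data entry points. A minimal usage sketch follows; it is not part of the diff and assumes the gguf_* and ggml_* declarations from this release's ggml.h (gguf_init_empty, gguf_set_val_u32, gguf_add_tensor, gguf_free), with the tensor name and metadata key being made-up examples:

#include "ggml.h"

int main(void) {
    // build a small ggml context holding one tensor to serialize
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx_ggml = ggml_init(params);

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, 8);
    ggml_set_name(t, "example.weight");                 // hypothetical tensor name

    // describe the file: key-value metadata plus tensor infos
    struct gguf_context * ctx_gguf = gguf_init_empty();
    gguf_set_val_u32(ctx_gguf, "example.version", 1);   // hypothetical metadata key
    gguf_add_tensor(ctx_gguf, t);

    // gguf_write_to_buf lays out header, KV pairs, tensor infos, padding and tensor data;
    // gguf_write_to_file dumps that buffer to disk (only_meta = false includes the data section)
    gguf_write_to_file(ctx_gguf, "example.gguf", /*only_meta =*/ false);

    gguf_free(ctx_gguf);
    ggml_free(ctx_ggml);
    return 0;
}

With only_meta = true the same path emits just the header, key-value section and tensor infos, which is what gguf_get_meta_size and gguf_get_meta_data rely on.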
  int ggml_cpu_has_avx(void) {
  #if defined(__AVX__)
      return 1;