llama_cpp 0.3.7 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -213,8 +213,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
  error_desc = "insufficient memory";
  break;
  }
- GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
- __func__, error_desc, size/(1024.0*1024.0));
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
  return NULL;
  }
  return aligned_memory;
@@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
  static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);

  static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+ [GGML_TYPE_I8] = {
+ .type_name = "i8",
+ .blck_size = 1,
+ .type_size = sizeof(int8_t),
+ .is_quantized = false,
+ },
+ [GGML_TYPE_I16] = {
+ .type_name = "i16",
+ .blck_size = 1,
+ .type_size = sizeof(int16_t),
+ .is_quantized = false,
+ },
+ [GGML_TYPE_I32] = {
+ .type_name = "i32",
+ .blck_size = 1,
+ .type_size = sizeof(int32_t),
+ .is_quantized = false,
+ },
  [GGML_TYPE_F32] = {
+ .type_name = "f32",
+ .blck_size = 1,
+ .type_size = sizeof(float),
+ .is_quantized = false,
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
  .vec_dot_type = GGML_TYPE_F32,
  },
  [GGML_TYPE_F16] = {
+ .type_name = "f16",
+ .blck_size = 1,
+ .type_size = sizeof(ggml_fp16_t),
+ .is_quantized = false,
  .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
  .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
  .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_F16,
  },
  [GGML_TYPE_Q4_0] = {
+ .type_name = "q4_0",
+ .blck_size = QK4_0,
+ .type_size = sizeof(block_q4_0),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q4_0,
  .from_float = quantize_row_q4_0,
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q4_1] = {
+ .type_name = "q4_1",
+ .blck_size = QK4_1,
+ .type_size = sizeof(block_q4_1),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q4_1,
  .from_float = quantize_row_q4_1,
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  [GGML_TYPE_Q5_0] = {
+ .type_name = "q5_0",
+ .blck_size = QK5_0,
+ .type_size = sizeof(block_q5_0),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q5_0,
  .from_float = quantize_row_q5_0,
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q5_1] = {
+ .type_name = "q5_1",
+ .blck_size = QK5_1,
+ .type_size = sizeof(block_q5_1),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q5_1,
  .from_float = quantize_row_q5_1,
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  [GGML_TYPE_Q8_0] = {
+ .type_name = "q8_0",
+ .blck_size = QK8_0,
+ .type_size = sizeof(block_q8_0),
+ .is_quantized = true,
  .to_float = dequantize_row_q8_0,
  .from_float = quantize_row_q8_0,
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_0,
  },
  [GGML_TYPE_Q8_1] = {
+ .type_name = "q8_1",
+ .blck_size = QK8_1,
+ .type_size = sizeof(block_q8_1),
+ .is_quantized = true,
  .from_float = quantize_row_q8_1,
  .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
  .vec_dot_type = GGML_TYPE_Q8_1,
  },
  #ifdef GGML_USE_K_QUANTS
  [GGML_TYPE_Q2_K] = {
+ .type_name = "q2_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q2_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q2_K,
  .from_float = quantize_row_q2_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q3_K] = {
+ .type_name = "q3_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q3_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q3_K,
  .from_float = quantize_row_q3_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q4_K] = {
+ .type_name = "q4_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q4_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q4_K,
  .from_float = quantize_row_q4_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q5_K] = {
+ .type_name = "q5_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q5_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q5_K,
  .from_float = quantize_row_q5_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q6_K] = {
+ .type_name = "q6_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q6_K),
+ .is_quantized = true,
  .to_float = (ggml_to_float_t) dequantize_row_q6_K,
  .from_float = quantize_row_q6_K,
  .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
  .vec_dot_type = GGML_TYPE_Q8_K,
  },
  [GGML_TYPE_Q8_K] = {
+ .type_name = "q8_K",
+ .blck_size = QK_K,
+ .type_size = sizeof(block_q8_K),
+ .is_quantized = true,
  .from_float = quantize_row_q8_K,
  }
  #endif
  };

  // For internal test use
- ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
- GGML_ASSERT(i < GGML_TYPE_COUNT);
- return type_traits[i];
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+ GGML_ASSERT(type < GGML_TYPE_COUNT);
+ return type_traits[type];
  }


@@ -3481,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

- static const float GELU_COEF_A = 0.044715f;
- static const float GELU_QUICK_COEF = -1.702f;
- static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ static const float GELU_COEF_A = 0.044715f;
+ static const float GELU_QUICK_COEF = -1.702f;
+ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

  inline static float ggml_gelu_f32(float x) {
  return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -3652,95 +3725,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
  // data types
  //

- static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = 1,
- [GGML_TYPE_F16] = 1,
- [GGML_TYPE_Q4_0] = QK4_0,
- [GGML_TYPE_Q4_1] = QK4_1,
- [GGML_TYPE_Q5_0] = QK5_0,
- [GGML_TYPE_Q5_1] = QK5_1,
- [GGML_TYPE_Q8_0] = QK8_0,
- [GGML_TYPE_Q8_1] = QK8_1,
- #ifdef GGML_USE_K_QUANTS
- [GGML_TYPE_Q2_K] = QK_K,
- [GGML_TYPE_Q3_K] = QK_K,
- [GGML_TYPE_Q4_K] = QK_K,
- [GGML_TYPE_Q5_K] = QK_K,
- [GGML_TYPE_Q6_K] = QK_K,
- [GGML_TYPE_Q8_K] = QK_K,
- #endif
- [GGML_TYPE_I8] = 1,
- [GGML_TYPE_I16] = 1,
- [GGML_TYPE_I32] = 1,
- };
- static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
- static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = sizeof(float),
- [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
- [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
- [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
- [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
- [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
- [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
- [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
- #ifdef GGML_USE_K_QUANTS
- [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
- [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
- [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
- [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
- [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
- [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
- #endif
- [GGML_TYPE_I8] = sizeof(int8_t),
- [GGML_TYPE_I16] = sizeof(int16_t),
- [GGML_TYPE_I32] = sizeof(int32_t),
- };
- static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-
- static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = "f32",
- [GGML_TYPE_F16] = "f16",
- [GGML_TYPE_Q4_0] = "q4_0",
- [GGML_TYPE_Q4_1] = "q4_1",
- [GGML_TYPE_Q5_0] = "q5_0",
- [GGML_TYPE_Q5_1] = "q5_1",
- [GGML_TYPE_Q8_0] = "q8_0",
- [GGML_TYPE_Q8_1] = "q8_1",
- [GGML_TYPE_Q2_K] = "q2_K",
- [GGML_TYPE_Q3_K] = "q3_K",
- [GGML_TYPE_Q4_K] = "q4_K",
- [GGML_TYPE_Q5_K] = "q5_K",
- [GGML_TYPE_Q6_K] = "q6_K",
- [GGML_TYPE_Q8_K] = "q8_K",
- [GGML_TYPE_I8] = "i8",
- [GGML_TYPE_I16] = "i16",
- [GGML_TYPE_I32] = "i32",
- };
- static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
- static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
- [GGML_TYPE_F32] = false,
- [GGML_TYPE_F16] = false,
- [GGML_TYPE_Q4_0] = true,
- [GGML_TYPE_Q4_1] = true,
- [GGML_TYPE_Q5_0] = true,
- [GGML_TYPE_Q5_1] = true,
- [GGML_TYPE_Q8_0] = true,
- [GGML_TYPE_Q8_1] = true,
- [GGML_TYPE_Q2_K] = true,
- [GGML_TYPE_Q3_K] = true,
- [GGML_TYPE_Q4_K] = true,
- [GGML_TYPE_Q5_K] = true,
- [GGML_TYPE_Q6_K] = true,
- [GGML_TYPE_Q8_K] = true,
- [GGML_TYPE_I8] = false,
- [GGML_TYPE_I16] = false,
- [GGML_TYPE_I32] = false,
- };
- static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
  static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "NONE",

@@ -3760,10 +3744,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "ARGMAX",
  "REPEAT",
  "REPEAT_BACK",
+ "CONCAT",
  "SILU_BACK",
  "NORM",
  "RMS_NORM",
  "RMS_NORM_BACK",
+ "GROUP_NORM",

  "MUL_MAT",
  "OUT_PROD",
@@ -3789,20 +3775,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CLAMP",
  "CONV_1D",
  "CONV_2D",
+ "CONV_TRANSPOSE_2D",
  "POOL_1D",
  "POOL_2D",
+ "UPSCALE",

  "FLASH_ATTN",
  "FLASH_FF",
  "FLASH_ATTN_BACK",
  "WIN_PART",
  "WIN_UNPART",
+ "GET_REL_POS",
+ "ADD_REL_POS",

  "UNARY",

  "MAP_UNARY",
  "MAP_BINARY",

+ "MAP_CUSTOM1_F32",
+ "MAP_CUSTOM2_F32",
+ "MAP_CUSTOM3_F32",
+
  "MAP_CUSTOM1",
  "MAP_CUSTOM2",
  "MAP_CUSTOM3",
@@ -3811,7 +3805,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };

- static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -3832,10 +3826,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "argmax(x)",
  "repeat(x)",
  "repeat_back(x)",
+ "concat(x, y)",
  "silu_back(x)",
  "norm(x)",
  "rms_norm(x)",
  "rms_norm_back(x)",
+ "group_norm(x)",

  "X*Y",
  "X*Y",
@@ -3861,20 +3857,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "clamp(x)",
  "conv_1d(x)",
  "conv_2d(x)",
+ "conv_transpose_2d(x)",
  "pool_1d(x)",
  "pool_2d(x)",
+ "upscale(x)",

  "flash_attn(x)",
  "flash_ff(x)",
  "flash_attn_back(x)",
  "win_part(x)",
  "win_unpart(x)",
+ "get_rel_pos(x)",
+ "add_rel_pos(x)",

  "unary(x)",

  "f(x)",
  "f(x,y)",

+ "custom_f32(x)",
+ "custom_f32(x,y)",
+ "custom_f32(x,y,z)",
+
  "custom(x)",
  "custom(x,y)",
  "custom(x,y,z)",
@@ -3883,7 +3887,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };

- static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -3913,8 +3917,10 @@ static void ggml_setup_op_has_task_pass(void) {
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
  p[GGML_OP_CONV_1D ] = true;
  p[GGML_OP_CONV_2D ] = true;
+ p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+ p[GGML_OP_ADD_REL_POS ] = true;
  }

  { // FINALIZE
@@ -4110,29 +4116,37 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
  //
  // is enough, but just in case, adding the second part

- return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+ }
+
+ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+ return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
  }

  size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

- return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+ return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
  }

  int ggml_blck_size(enum ggml_type type) {
- return GGML_BLCK_SIZE[type];
+ return type_traits[type].blck_size;
  }

  size_t ggml_type_size(enum ggml_type type) {
- return GGML_TYPE_SIZE[type];
+ return type_traits[type].type_size;
  }

  float ggml_type_sizef(enum ggml_type type) {
- return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+ return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
  }

  const char * ggml_type_name(enum ggml_type type) {
- return GGML_TYPE_NAME[type];
+ return type_traits[type].type_name;
+ }
+
+ bool ggml_is_quantized(enum ggml_type type) {
+ return type_traits[type].is_quantized;
  }

  const char * ggml_op_name(enum ggml_op op) {
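For context: with this change the per-type metadata lives in the type_traits table and is reached through the accessors shown above. The sketch below is an editorial illustration of how client code can query it; it is not part of the package diff, and it only assumes the ggml.h header shipped with this release.

#include <stdio.h>
#include "ggml.h"

// Print the block size, bytes per block, and quantization flag for a type,
// using the accessors that now read from the type_traits table.
static void print_type_info(enum ggml_type type) {
    printf("%-6s blck_size=%d type_size=%zu quantized=%s\n",
           ggml_type_name(type),
           ggml_blck_size(type),
           ggml_type_size(type),
           ggml_is_quantized(type) ? "yes" : "no");
}

// e.g. print_type_info(GGML_TYPE_Q4_0);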
@@ -4144,7 +4158,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
  }

  size_t ggml_element_size(const struct ggml_tensor * tensor) {
- return GGML_TYPE_SIZE[tensor->type];
+ return ggml_type_size(tensor->type);
  }

  static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4196,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
  (t0->ne[3] == t1->ne[3]);
  }

- bool ggml_is_quantized(enum ggml_type type) {
- return GGML_IS_QUANTIZED[type];
- }
-
  enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  enum ggml_type wtype = GGML_TYPE_COUNT;

@@ -4223,8 +4233,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
+ tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }
@@ -4233,7 +4243,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }
@@ -4248,7 +4258,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

  return
- tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
  tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  }
@@ -4567,7 +4577,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  size_t data_size = 0;

  if (data == NULL && !ctx->no_alloc) {
- data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+ data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
  for (int i = 1; i < n_dims; i++) {
  data_size *= ne[i];
  }
@@ -4622,8 +4632,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  result->ne[i] = ne[i];
  }

- result->nb[0] = GGML_TYPE_SIZE[type];
- result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+ result->nb[0] = ggml_type_size(type);
+ result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
  for (int i = 2; i < GGML_MAX_DIMS; i++) {
  result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
  }
@@ -5545,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
  is_node = true;
  }

- if (ggml_are_same_shape(a, b) && !is_node) {
- return a;
- }
-
  struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);

  result->op = GGML_OP_REPEAT;
@@ -5587,6 +5593,30 @@ struct ggml_tensor * ggml_repeat_back(
  return result;
  }

+ // ggml_concat
+
+ struct ggml_tensor* ggml_concat(
+ struct ggml_context* ctx,
+ struct ggml_tensor* a,
+ struct ggml_tensor* b) {
+ GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+
+ result->op = GGML_OP_CONCAT;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+
+ return result;
+ }
+
  // ggml_abs

  struct ggml_tensor * ggml_abs(
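For context, the new ggml_concat() added in the hunk above joins two tensors along the third dimension (ne[2]) and, per its assert, requires the other dimensions to match. The sketch below is an editorial illustration, not part of the diff; the shapes are made up, and only functions visible in this release's API are used.

#include "ggml.h"

// Concatenate two F32 feature maps along dim 2: [64,64,8,1] + [64,64,4,1] -> [64,64,12,1].
static struct ggml_tensor * concat_example(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params); // caller is responsible for ggml_free(ctx)

    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 8);
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 4);
    return ggml_concat(ctx, a, b); // result->ne = {64, 64, 12, 1}
}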
@@ -5755,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
  static struct ggml_tensor * ggml_norm_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
+ float eps,
  bool inplace) {
  bool is_node = false;

@@ -5765,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

- // TODO: maybe store epsilon here?
+ ggml_set_op_params(result, &eps, sizeof(eps));

  result->op = GGML_OP_NORM;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5776,16 +5807,20 @@ static struct ggml_tensor * ggml_norm_impl(

  struct ggml_tensor * ggml_norm(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_norm_impl(ctx, a, false);
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_norm_impl(ctx, a, eps, false);
  }

  struct ggml_tensor * ggml_norm_inplace(
  struct ggml_context * ctx,
- struct ggml_tensor * a) {
- return ggml_norm_impl(ctx, a, true);
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_norm_impl(ctx, a, eps, true);
  }

+ // ggml_rms_norm
+
  static struct ggml_tensor * ggml_rms_norm_impl(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -5822,6 +5857,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
  return ggml_rms_norm_impl(ctx, a, eps, true);
  }

+ // ggml_rms_norm_back
+
  struct ggml_tensor * ggml_rms_norm_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -5843,6 +5880,44 @@ struct ggml_tensor * ggml_rms_norm_back(
  return result;
  }

+ // ggml_group_norm
+
+ static struct ggml_tensor * ggml_group_norm_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups,
+ bool inplace) {
+
+ bool is_node = false;
+ if (!inplace && (a->grad)) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_GROUP_NORM;
+ result->op_params[0] = n_groups;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = NULL; // TODO: maybe store epsilon here?
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_group_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups) {
+ return ggml_group_norm_impl(ctx, a, n_groups, false);
+ }
+
+ struct ggml_tensor * ggml_group_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups) {
+ return ggml_group_norm_impl(ctx, a, n_groups, true);
+ }

  // ggml_mul_mat
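For context, ggml_norm()/ggml_norm_inplace() now take the epsilon explicitly (it is stored in op_params), and ggml_group_norm() is new; its compute path later in this diff still hard-codes eps = 1e-6f. The call sketch below is editorial, not part of the diff, and assumes a ggml_context * ctx created as in the earlier sketch; sizes are illustrative.

// x: F32 activations, e.g. 128 features x 64 x 64 channels.
struct ggml_tensor * x  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 64, 64);

// Layer norm: eps is now an explicit argument instead of a hidden constant.
struct ggml_tensor * ln = ggml_norm(ctx, x, 1e-5f);

// Group norm over 32 channel groups along ne[2]; n_groups is stored in op_params[0].
struct ggml_tensor * gn = ggml_group_norm(ctx, x, 32);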

@@ -6711,6 +6786,8 @@ static struct ggml_tensor * ggml_rope_impl(
  int n_ctx,
  float freq_base,
  float freq_scale,
+ float xpos_base,
+ bool xpos_down,
  bool inplace) {
  GGML_ASSERT(n_past >= 0);
  bool is_node = false;
@@ -6721,9 +6798,11 @@ static struct ggml_tensor * ggml_rope_impl(

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

- int32_t params[6] = { n_past, n_dims, mode, n_ctx };
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
  memcpy(params + 4, &freq_base, sizeof(float));
  memcpy(params + 5, &freq_scale, sizeof(float));
+ memcpy(params + 6, &xpos_base, sizeof(float));
+ memcpy(params + 7, &xpos_down, sizeof(bool));
  ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_ROPE;
@@ -6740,7 +6819,7 @@ struct ggml_tensor * ggml_rope(
  int n_dims,
  int mode,
  int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
  }

  struct ggml_tensor * ggml_rope_inplace(
@@ -6750,7 +6829,7 @@ struct ggml_tensor * ggml_rope_inplace(
  int n_dims,
  int mode,
  int n_ctx) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
  }

  struct ggml_tensor * ggml_rope_custom(
@@ -6762,7 +6841,7 @@ struct ggml_tensor * ggml_rope_custom(
  int n_ctx,
  float freq_base,
  float freq_scale) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
  }

  struct ggml_tensor * ggml_rope_custom_inplace(
@@ -6774,7 +6853,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
  int n_ctx,
  float freq_base,
  float freq_scale) {
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
+ }
+
+ struct ggml_tensor * ggml_rope_xpos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ float base,
+ bool down) {
+ return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
  }

  // ggml_rope_back
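For context, ggml_rope_impl() now carries xpos_base/xpos_down in op_params[6..7]; the existing ggml_rope*() wrappers pass 0.0f/false, so their behavior is unchanged, while the new ggml_rope_xpos_inplace() opts in to xPos scaling. The sketch below is editorial, not part of the diff; the base value 512.0f and the shapes are illustrative only, and ctx is assumed from the earlier sketch.

// q: F32 query tensor, head_dim = 128, 32 heads, 8 tokens (shapes illustrative).
struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 32, 8);

// xPos RoPE, applied in place; mode and n_ctx are fixed to 0 inside this wrapper.
struct ggml_tensor * qx = ggml_rope_xpos_inplace(ctx, q, /*n_past=*/0, /*n_dims=*/128,
                                                 /*base=*/512.0f, /*down=*/false);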
@@ -6785,7 +6874,11 @@ struct ggml_tensor * ggml_rope_back(
  int n_past,
  int n_dims,
  int mode,
- int n_ctx) {
+ int n_ctx,
+ float freq_base,
+ float freq_scale,
+ float xpos_base,
+ bool xpos_down) {
  GGML_ASSERT(n_past >= 0);
  GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");

@@ -6797,7 +6890,11 @@ struct ggml_tensor * ggml_rope_back(

  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

- int32_t params[] = { n_past, n_dims, mode, n_ctx };
+ int32_t params[8] = { n_past, n_dims, mode, n_ctx };
+ memcpy(params + 4, &freq_base, sizeof(float));
+ memcpy(params + 5, &freq_scale, sizeof(float));
+ memcpy(params + 6, &xpos_base, sizeof(float));
+ memcpy(params + 7, &xpos_down, sizeof(bool));
  ggml_set_op_params(result, params, sizeof(params));

  result->op = GGML_OP_ROPE_BACK;
@@ -6904,6 +7001,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
  return result;
  }

+ // ggml_conv_1d_ph
+
+ struct ggml_tensor* ggml_conv_1d_ph(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s,
+ int d) {
+ return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+ }
+
  // ggml_conv_2d

  struct ggml_tensor * ggml_conv_2d(
@@ -6944,17 +7052,59 @@ struct ggml_tensor * ggml_conv_2d(

  }

- // ggml_conv_1d_ph
+ // ggml_conv_2d_sk_p0

- struct ggml_tensor * ggml_conv_1d_ph(
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s,
- int d) {
- return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+ struct ggml_tensor * b) {
+ return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+ }
+
+ // ggml_conv_2d_s1_ph
+
+ struct ggml_tensor * ggml_conv_2d_s1_ph(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+ return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
  }

+ // ggml_conv_transpose_2d_p0
+
+ static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+ return (ins - 1) * s - 2 * p + ks;
+ }
+
+ struct ggml_tensor * ggml_conv_transpose_2d_p0(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int stride) {
+ GGML_ASSERT(a->ne[3] == b->ne[2]);
+
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[4] = {
+ ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
+ ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
+ a->ne[2], b->ne[3],
+ };
+
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+ result->op = GGML_OP_CONV_TRANSPOSE_2D;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = b;
+ result->src[2] = ggml_new_i32(ctx, stride);
+
+ return result;
+ }

  // ggml_pool_*
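For context, ggml_conv_transpose_2d_p0() above takes an F16 kernel a of shape (Kw, Kh, Cout, Cin), an F32 input b whose channel count matches Cin, and produces an F32 output of (in - 1)*stride + k per spatial dimension with zero padding. The shape sketch below is editorial, not part of the diff; concrete sizes are made up and ctx is assumed from the earlier sketch.

// kernel: 4x4, 3 output channels, 8 input channels (F16, as the compute path asserts)
struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 4, 4, 3, 8);
// input: 16x16 with 8 channels (F32)
struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 16, 8);
// output: ne = { (16-1)*2 + 4, (16-1)*2 + 4, 3, 1 } = { 34, 34, 3, 1 }
struct ggml_tensor * y = ggml_conv_transpose_2d_p0(ctx, k, x, /*stride=*/2);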

@@ -7032,6 +7182,40 @@ struct ggml_tensor * ggml_pool_2d(
  return result;
  }

+ // ggml_upscale
+
+ static struct ggml_tensor * ggml_upscale_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int scale_factor) {
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+ a->ne[0] * scale_factor,
+ a->ne[1] * scale_factor,
+ a->ne[2], a->ne[3]);
+
+ result->op = GGML_OP_UPSCALE;
+ result->op_params[0] = scale_factor;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = NULL;
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_upscale(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int scale_factor) {
+ return ggml_upscale_impl(ctx, a, scale_factor);
+ }
+
  // ggml_flash_attn

  struct ggml_tensor * ggml_flash_attn(
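For context, ggml_upscale() above scales the first two dimensions by an integer factor (stored in op_params[0]) and keeps the remaining dimensions; backward is not implemented yet. A minimal sketch, editorial only, assuming the same ctx as before:

// 32x32 feature map with 4 channels, upscaled 2x -> 64x64 with 4 channels.
struct ggml_tensor * f  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 32, 32, 4);
struct ggml_tensor * up = ggml_upscale(ctx, f, 2);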
@@ -7230,6 +7414,87 @@ struct ggml_tensor * ggml_win_unpart(
  return result;
  }

+ // ggml_get_rel_pos
+
+ struct ggml_tensor * ggml_get_rel_pos(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int qh,
+ int kh) {
+ GGML_ASSERT(qh == kh);
+ GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
+
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
+
+ result->op = GGML_OP_GET_REL_POS;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = NULL;
+
+ return result;
+ }
+
+ // ggml_add_rel_pos
+
+ static struct ggml_tensor * ggml_add_rel_pos_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pw,
+ struct ggml_tensor * ph,
+ bool inplace) {
+ GGML_ASSERT(ggml_are_same_shape(pw, ph));
+ GGML_ASSERT(ggml_is_contiguous(a));
+ GGML_ASSERT(ggml_is_contiguous(pw));
+ GGML_ASSERT(ggml_is_contiguous(ph));
+ GGML_ASSERT(ph->type == GGML_TYPE_F32);
+ GGML_ASSERT(pw->type == GGML_TYPE_F32);
+ GGML_ASSERT(pw->ne[3] == a->ne[2]);
+ GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
+ GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
+
+ bool is_node = false;
+
+ if (!inplace && (a->grad || pw->grad || ph->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+ ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
+
+ result->op = GGML_OP_ADD_REL_POS;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = pw;
+ result->src[2] = ph;
+
+ return result;
+ }
+
+
+ struct ggml_tensor * ggml_add_rel_pos(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pw,
+ struct ggml_tensor * ph) {
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
+ }
+
+ struct ggml_tensor * ggml_add_rel_pos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pw,
+ struct ggml_tensor * ph) {
+ return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
+ }
+
  // gmml_unary

  static struct ggml_tensor * ggml_unary_impl(
@@ -7745,7 +8010,7 @@ static void ggml_compute_forward_dup_same_cont(
  memcpy(
  ((char *) dst->data + ie0*nb0),
  ((char *) src0->data + ie0*nb00),
- (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+ (ie1 - ie0) * ggml_type_size(src0->type));
  }

  }
@@ -7779,7 +8044,7 @@ static void ggml_compute_forward_dup_f16(

  if (src0->type == dst->type &&
  ne00 == ne0 &&
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
  // copy by rows
  const size_t rs = ne00*nb00;
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +8102,7 @@ static void ggml_compute_forward_dup_f16(
  float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

  size_t id = 0;
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
  char * dst_ptr = (char *) dst->data;

  for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8315,7 @@ static void ggml_compute_forward_dup_f32(

  if (src0->type == dst->type &&
  ne00 == ne0 &&
- nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
  // copy by rows
  const size_t rs = ne00*nb00;
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8354,7 @@ static void ggml_compute_forward_dup_f32(
  ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;

  size_t id = 0;
- size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
  char * dst_ptr = (char *) dst->data;

  for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8766,7 @@ static void ggml_compute_forward_add_q_f32(
  ggml_from_float_t const quantize_row_q = type_traits[type].from_float;

  // we don't support permuted src0 or src1
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+ GGML_ASSERT(nb00 == ggml_type_size(type));
  GGML_ASSERT(nb10 == sizeof(float));

  // dst cannot be transposed or permuted
@@ -8775,7 +9040,7 @@ static void ggml_compute_forward_add1_q_f32(
  ggml_from_float_t const quantize_row_q = type_traits[type].from_float;

  // we don't support permuted src0
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+ GGML_ASSERT(nb00 == ggml_type_size(type));

  // dst cannot be transposed or permuted
  GGML_ASSERT(nb0 <= nb1);
@@ -9137,6 +9402,8 @@ static void ggml_compute_forward_mul(
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
  switch (src0->type) {
  case GGML_TYPE_F32:
  {
@@ -9731,6 +9998,72 @@ static void ggml_compute_forward_repeat_back(
  }
  }

+ // ggml_compute_forward_concat
+
+ static void ggml_compute_forward_concat_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+
+ GGML_TENSOR_BINARY_OP_LOCALS;
+
+ // TODO: support for transposed / permuted tensors
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+ GGML_ASSERT(nb10 == sizeof(float));
+
+ for (int i3 = 0; i3 < ne3; i3++) {
+ for (int i2 = ith; i2 < ne2; i2++) {
+ if (i2 < ne02) { // src0
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
+
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+ *y = *x;
+ }
+ }
+ } // src1
+ else {
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
+
+ float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+ *y = *x;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ static void ggml_compute_forward_concat(
+ const struct ggml_compute_params* params,
+ const struct ggml_tensor* src0,
+ const struct ggml_tensor* src1,
+ struct ggml_tensor* dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_concat_f32(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
  // ggml_compute_forward_abs

  static void ggml_compute_forward_abs_f32(
@@ -10285,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(

  GGML_TENSOR_UNARY_OP_LOCALS;

- const float eps = 1e-5f; // TODO: make this a parameter
+ float eps;
+ memcpy(&eps, dst->op_params, sizeof(float));

  // TODO: optimize
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10334,6 +10668,8 @@ static void ggml_compute_forward_norm(
  }
  }

+ // ggml_compute_forward_group_rms_norm
+
  static void ggml_compute_forward_rms_norm_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
@@ -10398,7 +10734,6 @@ static void ggml_compute_forward_rms_norm(
  }
  }

-
  static void ggml_compute_forward_rms_norm_back_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
@@ -10572,16 +10907,106 @@ static void ggml_compute_forward_rms_norm_back(
  }
  }

- // ggml_compute_forward_mul_mat
+ // ggml_compute_forward_group_norm

- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- // helper function to determine if it is better to use BLAS or not
- // for large matrices, BLAS is faster
- static bool ggml_compute_forward_mul_mat_use_blas(
- const struct ggml_tensor * src0,
- const struct ggml_tensor * src1,
- struct ggml_tensor * dst) {
- //const int64_t ne00 = src0->ne[0];
+ static void ggml_compute_forward_group_norm_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ GGML_TENSOR_UNARY_OP_LOCALS;
+
+ const float eps = 1e-6f; // TODO: make this a parameter
+
+ // TODO: optimize
+
+ int n_channels = src0->ne[2];
+ int n_groups = dst->op_params[0];
+ int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+ for (int i = ith; i < n_groups; i+=nth) {
+ int start = i * n_channels_per_group;
+ int end = start + n_channels_per_group;
+ if (end > n_channels) {
+ end = n_channels;
+ }
+ int step = end - start;
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ ggml_float sum = 0.0;
+ for (int64_t i02 = start; i02 < end; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ sum += (ggml_float)x[i00];
+ }
+ }
+ }
+ float mean = sum / (ne00 * ne01 * step);
+ ggml_float sum2 = 0.0;
+
+ for (int64_t i02 = start; i02 < end; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ float v = x[i00] - mean;
+ y[i00] = v;
+ sum2 += (ggml_float)(v * v);
+ }
+ }
+ }
+ float variance = sum2 / (ne00 * ne01 * step);
+ const float scale = 1.0f / sqrtf(variance + eps);
+
+ for (int64_t i02 = start; i02 < end; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+ ggml_vec_scale_f32(ne00, y, scale);
+ }
+ }
+ }
+ }
+ }
+
+ static void ggml_compute_forward_group_norm(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_group_norm_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
+ // ggml_compute_forward_mul_mat
+
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+ // helper function to determine if it is better to use BLAS or not
+ // for large matrices, BLAS is faster
+ static bool ggml_compute_forward_mul_mat_use_blas(
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ //const int64_t ne00 = src0->ne[0];
  //const int64_t ne01 = src0->ne[1];

  const int64_t ne10 = src1->ne[0];
@@ -10629,7 +11054,7 @@ static void ggml_compute_forward_mul_mat(
  GGML_ASSERT(ne3 == ne13);

  // we don't support permuted src0 or src1
- GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+ GGML_ASSERT(nb00 == ggml_type_size(type));
  GGML_ASSERT(nb10 == sizeof(float));

  // dst cannot be transposed or permuted
@@ -10638,6 +11063,10 @@ static void ggml_compute_forward_mul_mat(
  GGML_ASSERT(nb1 <= nb2);
  GGML_ASSERT(nb2 <= nb3);

+ // broadcast factors
+ const int64_t r2 = ne12/ne02;
+ const int64_t r3 = ne13/ne03;
+
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows

@@ -10657,11 +11086,6 @@ static void ggml_compute_forward_mul_mat(

  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
- // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
- // ref: https://github.com/ggerganov/ggml/pull/224
- GGML_ASSERT(ne02 == ne12);
- GGML_ASSERT(ne03 == ne13);
-
  if (params->ith != 0) {
  return;
  }
@@ -10674,12 +11098,16 @@ static void ggml_compute_forward_mul_mat(
  return;
  }

- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ // broadcast src0 into src1 across 2nd,3rd dimension
+ const int64_t i03 = i13/r3;
+ const int64_t i02 = i12/r2;
+
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);

- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

  if (type != GGML_TYPE_F32) {
  float * const wdata = params->wdata;
@@ -10687,7 +11115,7 @@ static void ggml_compute_forward_mul_mat(

  size_t id = 0;
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
- to_float((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
+ to_float((const char *) x + i01*nb01, wdata + id, ne00);
  id += ne00;
  }

@@ -10712,7 +11140,7 @@ static void ggml_compute_forward_mul_mat(
  if (params->type == GGML_TASK_INIT) {
  if (src1->type != vec_dot_type) {
  char * wdata = params->wdata;
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

  for (int64_t i13 = 0; i13 < ne13; ++i13) {
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10732,7 +11160,7 @@ static void ggml_compute_forward_mul_mat(
  }

  const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
- const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+ const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

  const int64_t nr0 = ne01; // src0 rows
  const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@@ -10767,10 +11195,6 @@ static void ggml_compute_forward_mul_mat(
  assert(ne12 % ne02 == 0);
  assert(ne13 % ne03 == 0);

- // broadcast factors
- const int64_t r2 = ne12/ne02;
- const int64_t r3 = ne13/ne03;
-
  // block-tiling attempt
  const int64_t blck_0 = 16;
  const int64_t blck_1 = 16;
@@ -11205,7 +11629,7 @@ static void ggml_compute_forward_get_rows_q(

  assert( dst->ne[0] == nc);
  assert( dst->ne[1] == nr);
- assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
+ assert(src0->nb[0] == ggml_type_size(type));

  for (int i = 0; i < nr; ++i) {
  const int r = ((int32_t *) src1->data)[i];
@@ -11926,7 +12350,6 @@ static void ggml_compute_forward_alibi(
  }
  }

-
  // ggml_compute_forward_clamp

  static void ggml_compute_forward_clamp_f32(
@@ -12015,12 +12438,18 @@ static void ggml_compute_forward_rope_f32(
  float freq_base;
  float freq_scale;

+ // these two only relevant for xPos RoPE:
+ float xpos_base;
+ bool xpos_down;
+
  const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
  const int n_ctx = ((int32_t *) dst->op_params)[3];
  memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
  memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));

  assert(n_past >= 0);

@@ -12092,6 +12521,9 @@ static void ggml_compute_forward_rope_f32(
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
  const float cos_theta = cosf(theta);
  const float sin_theta = sinf(theta);
+ // zeta scaling for xPos only:
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+ if (xpos_down) zeta = 1.0f / zeta;

  theta *= theta_scale;

@@ -12101,11 +12533,11 @@ static void ggml_compute_forward_rope_f32(
  const float x0 = src[0];
  const float x1 = src[1];

- dst_data[0] = x0*cos_theta - x1*sin_theta;
- dst_data[1] = x0*sin_theta + x1*cos_theta;
+ dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
+ dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
  }
  } else {
- // TODO: this is probably wrong, but I can't figure it out ..
+ // TODO: this might be wrong for ne0 != n_dims - need double check
  // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
  for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12234,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
  dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
  }
  } else {
- // TODO: this is probably wrong, but I can't figure it out ..
+ // TODO: this might be wrong for ne0 != n_dims - need double check
  // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
  for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12296,9 +12728,21 @@ static void ggml_compute_forward_rope_back_f32(
  // dx = rope_back(dy, src1)
  // src0 is dy, src1 contains options

+ float freq_base;
+ float freq_scale;
+
+ // these two only relevant for xPos RoPE:
+ float xpos_base;
+ bool xpos_down;
+
  const int n_past = ((int32_t *) dst->op_params)[0];
  const int n_dims = ((int32_t *) dst->op_params)[1];
  const int mode = ((int32_t *) dst->op_params)[2];
+ const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));

  assert(n_past >= 0);

@@ -12324,7 +12768,7 @@ static void ggml_compute_forward_rope_back_f32(
  // row index used to determine which thread to use
  int ir = 0;

- const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);

  const bool is_neox = mode & 2;

@@ -12335,12 +12779,15 @@ static void ggml_compute_forward_rope_back_f32(
  if (ir++ < ir0) continue;
  if (ir > ir1) break;

- float theta = (float)p;
+ float theta = freq_scale * (float)p;

  if (!is_neox) {
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
  const float cos_theta = cosf(theta);
  const float sin_theta = sinf(theta);
+ // zeta scaling for xPos only:
+ float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+ if (xpos_down) zeta = 1.0f / zeta;

  theta *= theta_scale;

@@ -12350,8 +12797,8 @@ static void ggml_compute_forward_rope_back_f32(
  const float dy0 = dy[0];
  const float dy1 = dy[1];

- dx[0] = dy0*cos_theta + dy1*sin_theta;
- dx[1] = - dy0*sin_theta + dy1*cos_theta;
+ dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
+ dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
  }
  } else {
  for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
@@ -13044,6 +13491,108 @@ static void ggml_compute_forward_conv_2d(
13044
13491
  }
13045
13492
  }
13046
13493
 
13494
+ // ggml_compute_forward_conv_transpose_2d
13495
+
13496
+ static void ggml_compute_forward_conv_transpose_2d(
13497
+ const struct ggml_compute_params * params,
13498
+ const struct ggml_tensor * src0,
13499
+ const struct ggml_tensor * src1,
13500
+ const struct ggml_tensor * opt0,
13501
+ struct ggml_tensor * dst) {
13502
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
13503
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
13504
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
13505
+
13506
+ int64_t t0 = ggml_perf_time_us();
13507
+ UNUSED(t0);
13508
+
13509
+ GGML_TENSOR_BINARY_OP_LOCALS;
13510
+
13511
+ const int ith = params->ith;
13512
+ const int nth = params->nth;
13513
+
13514
+ const int nk = ne00*ne01*ne02*ne03;
13515
+
13516
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13517
+ GGML_ASSERT(nb10 == sizeof(float));
13518
+
13519
+ if (params->type == GGML_TASK_INIT) {
13520
+ memset(params->wdata, 0, params->wsize);
13521
+
13522
+ // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
13523
+ {
13524
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13525
+
13526
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
13527
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
13528
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
13529
+ ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
13530
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
13531
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
13532
+ dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
13533
+ }
13534
+ }
13535
+ }
13536
+ }
13537
+ }
13538
+
13539
+ // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
13540
+ {
13541
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
13542
+ for (int i12 = 0; i12 < ne12; i12++) {
13543
+ for (int i11 = 0; i11 < ne11; i11++) {
13544
+ const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
13545
+ ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
13546
+ for (int i10 = 0; i10 < ne10; i10++) {
13547
+ dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
13548
+ }
13549
+ }
13550
+ }
13551
+ }
13552
+
13553
+ return;
13554
+ }
13555
+
13556
+ if (params->type == GGML_TASK_FINALIZE) {
13557
+ return;
13558
+ }
13559
+
13560
+ const int32_t stride = ((const int32_t*)(opt0->data))[0];
13561
+
13562
+ // total patches in dst
13563
+ const int np = ne2;
13564
+
13565
+ // patches per thread
13566
+ const int dp = (np + nth - 1)/nth;
13567
+
13568
+ // patch range for this thread
13569
+ const int ip0 = dp*ith;
13570
+ const int ip1 = MIN(ip0 + dp, np);
13571
+
13572
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13573
+ ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
13574
+
13575
+ for (int i2 = ip0; i2 < ip1; i2++) { // Cout
13576
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
13577
+ ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
13578
+ for (int i11 = 0; i11 < ne11; i11++) {
13579
+ for (int i10 = 0; i10 < ne10; i10++) {
13580
+ const int i1n = i11*ne10*ne12 + i10*ne12;
13581
+ for (int i01 = 0; i01 < ne01; i01++) {
13582
+ for (int i00 = 0; i00 < ne00; i00++) {
13583
+ float v = 0;
13584
+ ggml_vec_dot_f16(ne03, &v,
13585
+ (ggml_fp16_t *) wdata_src + i1n,
13586
+ (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
13587
+
13588
+ dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
13589
+ }
13590
+ }
13591
+ }
13592
+ }
13593
+ }
13594
+ }
13595
+
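Note: the new kernel above runs on repacked fp16 work buffers, but the underlying operation is an ordinary transposed 2-D convolution: every input pixel scatters a stride-spaced copy of the kernel into the output, which is what the `dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v` line does with `v` being the dot product over input channels. A minimal single-channel float reference for intuition only, not the ggml code path:

    // naive single-channel transposed 2-D convolution, stride >= 1, no padding;
    // dst must be zero-initialized with size OH x OW where
    // OH = (H-1)*stride + KH and OW = (W-1)*stride + KW
    static void conv_transpose_2d_ref(
            const float * src, int W, int H,
            const float * ker, int KW, int KH,
            float * dst, int stride) {
        const int OW = (W - 1)*stride + KW;
        for (int ih = 0; ih < H; ih++) {
            for (int iw = 0; iw < W; iw++) {
                for (int kh = 0; kh < KH; kh++) {
                    for (int kw = 0; kw < KW; kw++) {
                        // each input pixel scatters the whole kernel into the output
                        dst[(ih*stride + kh)*OW + (iw*stride + kw)] += src[ih*W + iw] * ker[kh*KW + kw];
                    }
                }
            }
        }
    }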
13047
13596
  // ggml_compute_forward_pool_1d_sk_p0
13048
13597
 
13049
13598
  static void ggml_compute_forward_pool_1d_sk_p0(
@@ -13202,6 +13751,60 @@ static void ggml_compute_forward_pool_2d(
13202
13751
  ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
13203
13752
  }
13204
13753
 
13754
+ // ggml_compute_forward_upscale
13755
+
13756
+ static void ggml_compute_forward_upscale_f32(
13757
+ const struct ggml_compute_params * params,
13758
+ const struct ggml_tensor * src0,
13759
+ struct ggml_tensor * dst) {
13760
+
13761
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
13762
+ return;
13763
+ }
13764
+
13765
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13766
+
13767
+ const int ith = params->ith;
13768
+
13769
+ GGML_TENSOR_UNARY_OP_LOCALS;
13770
+
13771
+ const int scale_factor = dst->op_params[0];
13772
+
13773
+ // TODO: optimize
13774
+
13775
+ for (int i03 = 0; i03 < ne03; i03++) {
13776
+ for (int i02 = ith; i02 < ne02; i02++) {
13777
+ for (int m = 0; m < dst->ne[1]; m++) {
13778
+ int i01 = m / scale_factor;
13779
+ for (int n = 0; n < dst->ne[0]; n++) {
13780
+ int i00 = n / scale_factor;
13781
+
13782
+ const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
13783
+
13784
+ float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
13785
+
13786
+ *y = *x;
13787
+ }
13788
+ }
13789
+ }
13790
+ }
13791
+ }
13792
+
13793
+ static void ggml_compute_forward_upscale(
13794
+ const struct ggml_compute_params * params,
13795
+ const struct ggml_tensor * src0,
13796
+ struct ggml_tensor * dst) {
13797
+ switch (src0->type) {
13798
+ case GGML_TYPE_F32:
13799
+ {
13800
+ ggml_compute_forward_upscale_f32(params, src0, dst);
13801
+ } break;
13802
+ default:
13803
+ {
13804
+ GGML_ASSERT(false);
13805
+ } break;
13806
+ }
13807
+ }
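Note: GGML_OP_UPSCALE, as implemented above, is plain nearest-neighbour upscaling by an integer factor: each destination coordinate is divided by the scale factor to pick the source element. The same mapping as a tiny single-threaded sketch:

    // nearest-neighbour upscale of a W x H float image by an integer factor
    static void upscale_nn(const float * src, int W, int H, float * dst, int scale) {
        for (int m = 0; m < H*scale; m++) {
            const int i1 = m / scale;           // source row
            for (int n = 0; n < W*scale; n++) {
                const int i0 = n / scale;       // source column
                dst[m*(W*scale) + n] = src[i1*W + i0];
            }
        }
    }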
13205
13808
 
13206
13809
  // ggml_compute_forward_flash_attn
13207
13810
 
@@ -14327,42 +14930,43 @@ static void ggml_compute_forward_unary(
14327
14930
  }
14328
14931
  }
14329
14932
 
14330
- // ggml_compute_forward_map_unary
14933
+ // ggml_compute_forward_get_rel_pos
14331
14934
 
14332
- static void ggml_compute_forward_map_unary_f32(
14935
+ static void ggml_compute_forward_get_rel_pos_f16(
14333
14936
  const struct ggml_compute_params * params,
14334
14937
  const struct ggml_tensor * src0,
14335
- struct ggml_tensor * dst,
14336
- const ggml_unary_op_f32_t fun) {
14337
- GGML_ASSERT(ggml_are_same_shape(src0, dst));
14338
-
14938
+ struct ggml_tensor * dst) {
14339
14939
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14340
14940
  return;
14341
14941
  }
14342
14942
 
14343
- const int n = ggml_nrows(src0);
14344
- const int nc = src0->ne[0];
14943
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
14345
14944
 
14346
- assert( dst->nb[0] == sizeof(float));
14347
- assert(src0->nb[0] == sizeof(float));
14945
+ GGML_TENSOR_UNARY_OP_LOCALS;
14348
14946
 
14349
- for (int i = 0; i < n; i++) {
14350
- fun(nc,
14351
- (float *) ((char *) dst->data + i*( dst->nb[1])),
14352
- (float *) ((char *) src0->data + i*(src0->nb[1])));
14947
+ const int64_t w = ne1;
14948
+
14949
+ ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
14950
+ ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data;
14951
+
14952
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14953
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14954
+ const int64_t pos = (w - i1 - 1) + i2;
14955
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14956
+ dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
14957
+ }
14958
+ }
14353
14959
  }
14354
14960
  }
14355
14961
 
14356
-
14357
- static void ggml_compute_forward_map_unary(
14962
+ static void ggml_compute_forward_get_rel_pos(
14358
14963
  const struct ggml_compute_params * params,
14359
14964
  const struct ggml_tensor * src0,
14360
- struct ggml_tensor * dst,
14361
- const ggml_unary_op_f32_t fun) {
14965
+ struct ggml_tensor * dst) {
14362
14966
  switch (src0->type) {
14363
- case GGML_TYPE_F32:
14967
+ case GGML_TYPE_F16:
14364
14968
  {
14365
- ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
14969
+ ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
14366
14970
  } break;
14367
14971
  default:
14368
14972
  {
@@ -14371,34 +14975,164 @@ static void ggml_compute_forward_map_unary(
14371
14975
  }
14372
14976
  }
14373
14977
 
14374
- // ggml_compute_forward_map_binary
14978
+ // ggml_compute_forward_add_rel_pos
14375
14979
 
14376
- static void ggml_compute_forward_map_binary_f32(
14980
+ static void ggml_compute_forward_add_rel_pos_f32(
14377
14981
  const struct ggml_compute_params * params,
14378
14982
  const struct ggml_tensor * src0,
14379
14983
  const struct ggml_tensor * src1,
14380
- struct ggml_tensor * dst,
14381
- const ggml_binary_op_f32_t fun) {
14382
- assert(params->ith == 0);
14383
- assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
14984
+ const struct ggml_tensor * src2,
14985
+ struct ggml_tensor * dst) {
14384
14986
 
14987
+ const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
14988
+ if (!inplace && params->type == GGML_TASK_INIT) {
14989
+ memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
14990
+ return;
14991
+ }
14385
14992
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14386
14993
  return;
14387
14994
  }
14388
14995
 
14389
- const int n = ggml_nrows(src0);
14390
- const int nc = src0->ne[0];
14996
+ int64_t t0 = ggml_perf_time_us();
14997
+ UNUSED(t0);
14391
14998
 
14392
- assert( dst->nb[0] == sizeof(float));
14393
- assert(src0->nb[0] == sizeof(float));
14394
- assert(src1->nb[0] == sizeof(float));
14999
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
14395
15000
 
14396
- for (int i = 0; i < n; i++) {
14397
- fun(nc,
14398
- (float *) ((char *) dst->data + i*( dst->nb[1])),
14399
- (float *) ((char *) src0->data + i*(src0->nb[1])),
14400
- (float *) ((char *) src1->data + i*(src1->nb[1])));
14401
- }
15001
+ float * src1_data = (float *) src1->data;
15002
+ float * src2_data = (float *) src2->data;
15003
+ float * dst_data = (float *) dst->data;
15004
+
15005
+ const int64_t ne10 = src1->ne[0];
15006
+ const int64_t ne11 = src1->ne[1];
15007
+ const int64_t ne12 = src1->ne[2];
15008
+ const int64_t ne13 = src1->ne[3];
15009
+
15010
+ const int ith = params->ith;
15011
+ const int nth = params->nth;
15012
+
15013
+ // total patches in dst
15014
+ const int np = ne13;
15015
+
15016
+ // patches per thread
15017
+ const int dp = (np + nth - 1)/nth;
15018
+
15019
+ // patch range for this thread
15020
+ const int ip0 = dp*ith;
15021
+ const int ip1 = MIN(ip0 + dp, np);
15022
+
15023
+
15024
+ for (int64_t i13 = ip0; i13 < ip1; ++i13) {
15025
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
15026
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
15027
+ const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
15028
+ for (int64_t i10 = 0; i10 < ne10; ++i10) {
15029
+ const int64_t jp0 = jp1 + i10;
15030
+ const float src1_e = src1_data[jp0];
15031
+ const float src2_e = src2_data[jp0];
15032
+
15033
+ const int64_t jdh = jp0 * ne10;
15034
+ const int64_t jdw = jdh - (ne10 - 1) * i10;
15035
+
15036
+ for (int64_t j = 0; j < ne10; ++j) {
15037
+ dst_data[jdh + j ] += src2_e;
15038
+ dst_data[jdw + j*ne10] += src1_e;
15039
+ }
15040
+ }
15041
+ }
15042
+ }
15043
+ }
15044
+ }
15045
+
15046
+ static void ggml_compute_forward_add_rel_pos(
15047
+ const struct ggml_compute_params * params,
15048
+ const struct ggml_tensor * src0,
15049
+ const struct ggml_tensor * src1,
15050
+ const struct ggml_tensor * src2,
15051
+ struct ggml_tensor * dst) {
15052
+ switch (src0->type) {
15053
+ case GGML_TYPE_F32:
15054
+ {
15055
+ ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
15056
+ } break;
15057
+ default:
15058
+ {
15059
+ GGML_ASSERT(false);
15060
+ } break;
15061
+ }
15062
+ }
15063
+
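Note: both SAM-related ops follow the referenced image_encoder.py. In get_rel_pos, `pos = (w - i1 - 1) + i2` rewrites to `i2 - i1 + (w - 1)`, i.e. the relative offset between two window coordinates shifted to be non-negative, and that row of the relative-position table is gathered; add_rel_pos then adds the resulting terms into the attention scores. A rough scalar sketch of the gather step, assuming a square w x w window so the table has 2*w - 1 rows (illustrative only, the interpretation of i2/i1 as query/key index is an assumption):

    // gather relative-position embeddings: for query index q and key index k,
    // pick table row (q - k + w - 1), matching `pos = (w - i1 - 1) + i2` above
    static void get_rel_pos_ref(const float * table, // (2*w - 1) x C
                                float * out,         // w x w x C
                                int w, int C) {
        for (int q = 0; q < w; q++) {
            for (int k = 0; k < w; k++) {
                const int pos = q - k + (w - 1);
                for (int c = 0; c < C; c++) {
                    out[(q*w + k)*C + c] = table[pos*C + c];
                }
            }
        }
    }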
15064
+ // ggml_compute_forward_map_unary
15065
+
15066
+ static void ggml_compute_forward_map_unary_f32(
15067
+ const struct ggml_compute_params * params,
15068
+ const struct ggml_tensor * src0,
15069
+ struct ggml_tensor * dst,
15070
+ const ggml_unary_op_f32_t fun) {
15071
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
15072
+
15073
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15074
+ return;
15075
+ }
15076
+
15077
+ const int n = ggml_nrows(src0);
15078
+ const int nc = src0->ne[0];
15079
+
15080
+ assert( dst->nb[0] == sizeof(float));
15081
+ assert(src0->nb[0] == sizeof(float));
15082
+
15083
+ for (int i = 0; i < n; i++) {
15084
+ fun(nc,
15085
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
15086
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
15087
+ }
15088
+ }
15089
+
15090
+
15091
+ static void ggml_compute_forward_map_unary(
15092
+ const struct ggml_compute_params * params,
15093
+ const struct ggml_tensor * src0,
15094
+ struct ggml_tensor * dst,
15095
+ const ggml_unary_op_f32_t fun) {
15096
+ switch (src0->type) {
15097
+ case GGML_TYPE_F32:
15098
+ {
15099
+ ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
15100
+ } break;
15101
+ default:
15102
+ {
15103
+ GGML_ASSERT(false);
15104
+ } break;
15105
+ }
15106
+ }
15107
+
15108
+ // ggml_compute_forward_map_binary
15109
+
15110
+ static void ggml_compute_forward_map_binary_f32(
15111
+ const struct ggml_compute_params * params,
15112
+ const struct ggml_tensor * src0,
15113
+ const struct ggml_tensor * src1,
15114
+ struct ggml_tensor * dst,
15115
+ const ggml_binary_op_f32_t fun) {
15116
+ assert(params->ith == 0);
15117
+ assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
15118
+
15119
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15120
+ return;
15121
+ }
15122
+
15123
+ const int n = ggml_nrows(src0);
15124
+ const int nc = src0->ne[0];
15125
+
15126
+ assert( dst->nb[0] == sizeof(float));
15127
+ assert(src0->nb[0] == sizeof(float));
15128
+ assert(src1->nb[0] == sizeof(float));
15129
+
15130
+ for (int i = 0; i < n; i++) {
15131
+ fun(nc,
15132
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
15133
+ (float *) ((char *) src0->data + i*(src0->nb[1])),
15134
+ (float *) ((char *) src1->data + i*(src1->nb[1])));
15135
+ }
14402
15136
  }
14403
15137
 
14404
15138
 
@@ -14879,6 +15613,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14879
15613
  {
14880
15614
  ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
14881
15615
  } break;
15616
+ case GGML_OP_CONCAT:
15617
+ {
15618
+ ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
15619
+ } break;
14882
15620
  case GGML_OP_SILU_BACK:
14883
15621
  {
14884
15622
  ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14895,6 +15633,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14895
15633
  {
14896
15634
  ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
14897
15635
  } break;
15636
+ case GGML_OP_GROUP_NORM:
15637
+ {
15638
+ ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
15639
+ } break;
14898
15640
  case GGML_OP_MUL_MAT:
14899
15641
  {
14900
15642
  ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
@@ -14987,6 +15729,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14987
15729
  {
14988
15730
  ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14989
15731
  } break;
15732
+ case GGML_OP_CONV_TRANSPOSE_2D:
15733
+ {
15734
+ ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15735
+ } break;
14990
15736
  case GGML_OP_POOL_1D:
14991
15737
  {
14992
15738
  ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
@@ -14995,6 +15741,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14995
15741
  {
14996
15742
  ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
14997
15743
  } break;
15744
+ case GGML_OP_UPSCALE:
15745
+ {
15746
+ ggml_compute_forward_upscale(params, tensor->src[0], tensor);
15747
+ } break;
14998
15748
  case GGML_OP_FLASH_ATTN:
14999
15749
  {
15000
15750
  const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15025,6 +15775,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15025
15775
  {
15026
15776
  ggml_compute_forward_unary(params, tensor->src[0], tensor);
15027
15777
  } break;
15778
+ case GGML_OP_GET_REL_POS:
15779
+ {
15780
+ ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
15781
+ } break;
15782
+ case GGML_OP_ADD_REL_POS:
15783
+ {
15784
+ ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
15785
+ } break;
15028
15786
  case GGML_OP_MAP_UNARY:
15029
15787
  {
15030
15788
  ggml_unary_op_f32_t fun;
@@ -15288,6 +16046,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15288
16046
  inplace);
15289
16047
  }
15290
16048
  } break;
16049
+ case GGML_OP_CONCAT:
16050
+ {
16051
+ GGML_ASSERT(false); // TODO: implement
16052
+ } break;
15291
16053
  case GGML_OP_SILU_BACK:
15292
16054
  {
15293
16055
  GGML_ASSERT(false); // TODO: not implemented
@@ -15310,6 +16072,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15310
16072
  {
15311
16073
  GGML_ASSERT(false); // TODO: not implemented
15312
16074
  } break;
16075
+ case GGML_OP_GROUP_NORM:
16076
+ {
16077
+ GGML_ASSERT(false); // TODO: not implemented
16078
+ } break;
15313
16079
  case GGML_OP_MUL_MAT:
15314
16080
  {
15315
16081
  // https://cs231n.github.io/optimization-2/#staged
@@ -15584,6 +16350,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15584
16350
  const int n_dims = ((int32_t *) tensor->op_params)[1];
15585
16351
  const int mode = ((int32_t *) tensor->op_params)[2];
15586
16352
  const int n_ctx = ((int32_t *) tensor->op_params)[3];
16353
+ float freq_base;
16354
+ float freq_scale;
16355
+ float xpos_base;
16356
+ bool xpos_down;
16357
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
16358
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
16359
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
16360
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
16361
+
15587
16362
  src0->grad = ggml_add_impl(ctx,
15588
16363
  src0->grad,
15589
16364
  ggml_rope_back(ctx,
@@ -15591,7 +16366,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15591
16366
  n_past,
15592
16367
  n_dims,
15593
16368
  mode,
15594
- n_ctx),
16369
+ n_ctx,
16370
+ freq_base,
16371
+ freq_scale,
16372
+ xpos_base,
16373
+ xpos_down),
15595
16374
  inplace);
15596
16375
  }
15597
16376
  } break;
@@ -15602,14 +16381,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15602
16381
  const int n_dims = ((int32_t *) tensor->op_params)[1];
15603
16382
  const int mode = ((int32_t *) tensor->op_params)[2];
15604
16383
  const int n_ctx = ((int32_t *) tensor->op_params)[3];
16384
+ float freq_base;
16385
+ float freq_scale;
16386
+ float xpos_base;
16387
+ bool xpos_down;
16388
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
16389
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
16390
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
16391
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
16392
+
15605
16393
  src0->grad = ggml_add_impl(ctx,
15606
16394
  src0->grad,
15607
- ggml_rope(ctx,
16395
+ ggml_rope_impl(ctx,
15608
16396
  tensor->grad,
15609
16397
  n_past,
15610
16398
  n_dims,
15611
16399
  mode,
15612
- n_ctx),
16400
+ n_ctx,
16401
+ freq_base,
16402
+ freq_scale,
16403
+ xpos_base,
16404
+ xpos_down,
16405
+ false),
15613
16406
  inplace);
15614
16407
  }
15615
16408
  } break;
@@ -15629,6 +16422,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15629
16422
  {
15630
16423
  GGML_ASSERT(false); // TODO: not implemented
15631
16424
  } break;
16425
+ case GGML_OP_CONV_TRANSPOSE_2D:
16426
+ {
16427
+ GGML_ASSERT(false); // TODO: not implemented
16428
+ } break;
15632
16429
  case GGML_OP_POOL_1D:
15633
16430
  {
15634
16431
  GGML_ASSERT(false); // TODO: not implemented
@@ -15637,6 +16434,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15637
16434
  {
15638
16435
  GGML_ASSERT(false); // TODO: not implemented
15639
16436
  } break;
16437
+ case GGML_OP_UPSCALE:
16438
+ {
16439
+ GGML_ASSERT(false); // TODO: not implemented
16440
+ } break;
15640
16441
  case GGML_OP_FLASH_ATTN:
15641
16442
  {
15642
16443
  struct ggml_tensor * flash_grad = NULL;
@@ -15878,6 +16679,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15878
16679
  GGML_ASSERT(false);
15879
16680
  }
15880
16681
  } break;
16682
+ case GGML_OP_GET_REL_POS:
16683
+ case GGML_OP_ADD_REL_POS:
15881
16684
  case GGML_OP_MAP_UNARY:
15882
16685
  case GGML_OP_MAP_BINARY:
15883
16686
  case GGML_OP_MAP_CUSTOM1_F32:
@@ -16382,7 +17185,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16382
17185
 
16383
17186
  size_t cur = 0;
16384
17187
  if (ggml_is_quantized(node->type)) {
16385
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
17188
+ cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16386
17189
  }
16387
17190
 
16388
17191
  work_size = MAX(work_size, cur);
@@ -16395,7 +17198,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16395
17198
  size_t cur = 0;
16396
17199
 
16397
17200
  if (ggml_is_quantized(node->src[0]->type)) {
16398
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
17201
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16399
17202
  }
16400
17203
 
16401
17204
  work_size = MAX(work_size, cur);
@@ -16407,7 +17210,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16407
17210
  size_t cur = 0;
16408
17211
 
16409
17212
  if (ggml_is_quantized(node->src[0]->type)) {
16410
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
17213
+ cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16411
17214
  }
16412
17215
 
16413
17216
  work_size = MAX(work_size, cur);
@@ -16454,9 +17257,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16454
17257
  case GGML_OP_NORM:
16455
17258
  case GGML_OP_RMS_NORM:
16456
17259
  case GGML_OP_RMS_NORM_BACK:
17260
+ case GGML_OP_GROUP_NORM:
16457
17261
  {
16458
17262
  n_tasks = n_threads;
16459
17263
  } break;
17264
+ case GGML_OP_CONCAT:
16460
17265
  case GGML_OP_MUL_MAT:
16461
17266
  case GGML_OP_OUT_PROD:
16462
17267
  {
@@ -16490,12 +17295,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16490
17295
  // the threads are still spinning
16491
17296
  if (node->src[0]->type != GGML_TYPE_F32) {
16492
17297
  // here we need memory just for single 2D matrix from src0
16493
- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
17298
+ cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
16494
17299
  }
16495
17300
  } else
16496
17301
  #endif
16497
17302
  if (node->src[1]->type != vec_dot_type) {
16498
- cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
17303
+ cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16499
17304
  } else {
16500
17305
  cur = 0;
16501
17306
  }
@@ -16524,6 +17329,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16524
17329
  case GGML_OP_SOFT_MAX_BACK:
16525
17330
  case GGML_OP_ROPE:
16526
17331
  case GGML_OP_ROPE_BACK:
17332
+ case GGML_OP_ADD_REL_POS:
16527
17333
  {
16528
17334
  n_tasks = n_threads;
16529
17335
  } break;
@@ -16598,6 +17404,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16598
17404
  GGML_ASSERT(false);
16599
17405
  }
16600
17406
 
17407
+ work_size = MAX(work_size, cur);
17408
+ } break;
17409
+ case GGML_OP_CONV_TRANSPOSE_2D:
17410
+ {
17411
+ n_tasks = n_threads;
17412
+
17413
+ const int64_t ne00 = node->src[0]->ne[0]; // W
17414
+ const int64_t ne01 = node->src[0]->ne[1]; // H
17415
+ const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
17416
+ const int64_t ne03 = node->src[0]->ne[3]; // Channels In
17417
+
17418
+ const int64_t ne10 = node->src[1]->ne[0]; // W
17419
+ const int64_t ne11 = node->src[1]->ne[1]; // H
17420
+ const int64_t ne12 = node->src[1]->ne[2]; // Channels In
17421
+
17422
+ size_t cur = 0;
17423
+ cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
17424
+ cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
17425
+
16601
17426
  work_size = MAX(work_size, cur);
16602
17427
  } break;
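Note: the CONV_TRANSPOSE_2D work size above simply covers the two fp16 copies made during GGML_TASK_INIT, the permuted kernel (ne00*ne01*ne02*ne03 elements) plus the permuted source (ne10*ne11*ne12 elements). For hypothetical shapes, a 3x3 kernel with 64 output and 32 input channels and a 16x16x32 input needs 2*(3*3*64*32) + 2*(16*16*32) = 36864 + 16384 = 53248 bytes of scratch.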
16603
17428
  case GGML_OP_POOL_1D:
@@ -16605,6 +17430,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16605
17430
  {
16606
17431
  n_tasks = 1;
16607
17432
  } break;
17433
+ case GGML_OP_UPSCALE:
17434
+ {
17435
+ n_tasks = n_threads;
17436
+ } break;
16608
17437
  case GGML_OP_FLASH_ATTN:
16609
17438
  {
16610
17439
  n_tasks = n_threads;
@@ -16666,6 +17495,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16666
17495
  } break;
16667
17496
  case GGML_OP_WIN_PART:
16668
17497
  case GGML_OP_WIN_UNPART:
17498
+ case GGML_OP_GET_REL_POS:
16669
17499
  case GGML_OP_MAP_UNARY:
16670
17500
  case GGML_OP_MAP_BINARY:
16671
17501
  case GGML_OP_MAP_CUSTOM1_F32:
@@ -16783,8 +17613,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16783
17613
 
16784
17614
  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
16785
17615
  GGML_ASSERT(rc == 0);
17616
+ UNUSED(rc);
16786
17617
  }
16787
17618
  }
17619
+
16788
17620
  workers[0].ith = 0;
16789
17621
  workers[0].shared = &state_shared;
16790
17622
 
@@ -16900,7 +17732,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16900
17732
  // compute size of intermediate results
16901
17733
  // TODO: does not take into account scratch buffers !!!!
16902
17734
  for (int i = 0; i < cgraph->n_nodes; ++i) {
16903
- size_eval += ggml_nbytes(cgraph->nodes[i]);
17735
+ size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
16904
17736
  }
16905
17737
 
16906
17738
  // print
@@ -18301,8 +19133,8 @@ enum ggml_opt_result ggml_opt_resume(
18301
19133
  struct ggml_tensor * f) {
18302
19134
 
18303
19135
  // build forward + backward compute graphs
18304
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
18305
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
19136
+ struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
19137
+ struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18306
19138
 
18307
19139
  struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
18308
19140
  struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
@@ -18561,6 +19393,1005 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18561
19393
 
18562
19394
  ////////////////////////////////////////////////////////////////////////////////
18563
19395
 
19396
+ struct gguf_str {
19397
+ uint32_t n;
19398
+ char * data;
19399
+ };
19400
+
19401
+ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
19402
+ [GGUF_TYPE_UINT8] = sizeof(uint8_t),
19403
+ [GGUF_TYPE_INT8] = sizeof(int8_t),
19404
+ [GGUF_TYPE_UINT16] = sizeof(uint16_t),
19405
+ [GGUF_TYPE_INT16] = sizeof(int16_t),
19406
+ [GGUF_TYPE_UINT32] = sizeof(uint32_t),
19407
+ [GGUF_TYPE_INT32] = sizeof(int32_t),
19408
+ [GGUF_TYPE_FLOAT32] = sizeof(float),
19409
+ [GGUF_TYPE_BOOL] = sizeof(bool),
19410
+ [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
19411
+ [GGUF_TYPE_ARRAY] = 0, // undefined
19412
+ };
19413
+ static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19414
+
19415
+ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
19416
+ [GGUF_TYPE_UINT8] = "u8",
19417
+ [GGUF_TYPE_INT8] = "i8",
19418
+ [GGUF_TYPE_UINT16] = "u16",
19419
+ [GGUF_TYPE_INT16] = "i16",
19420
+ [GGUF_TYPE_UINT32] = "u32",
19421
+ [GGUF_TYPE_INT32] = "i32",
19422
+ [GGUF_TYPE_FLOAT32] = "f32",
19423
+ [GGUF_TYPE_BOOL] = "bool",
19424
+ [GGUF_TYPE_STRING] = "str",
19425
+ [GGUF_TYPE_ARRAY] = "arr",
19426
+ };
19427
+ static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
19428
+
19429
+ union gguf_value {
19430
+ uint8_t uint8;
19431
+ int8_t int8;
19432
+ uint16_t uint16;
19433
+ int16_t int16;
19434
+ uint32_t uint32;
19435
+ int32_t int32;
19436
+ float float32;
19437
+ bool bool_;
19438
+
19439
+ struct gguf_str str;
19440
+
19441
+ struct {
19442
+ enum gguf_type type;
19443
+
19444
+ uint32_t n;
19445
+ void * data;
19446
+ } arr;
19447
+ };
19448
+
19449
+ struct gguf_kv {
19450
+ struct gguf_str key;
19451
+
19452
+ uint32_t n_bytes; // TODO: is this actually needed?
19453
+
19454
+ enum gguf_type type;
19455
+ union gguf_value value;
19456
+ };
19457
+
19458
+ struct gguf_header {
19459
+ uint32_t magic;
19460
+ uint32_t version;
19461
+ uint32_t n_tensors;
19462
+ uint32_t n_kv;
19463
+ };
19464
+
19465
+ struct gguf_tensor_info {
19466
+ struct gguf_str name;
19467
+
19468
+ uint32_t n_dims;
19469
+ uint32_t ne[GGML_MAX_DIMS];
19470
+
19471
+ enum ggml_type type;
19472
+
19473
+ uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
19474
+
19475
+ // for writing API
19476
+ const void * data;
19477
+ size_t size;
19478
+ };
19479
+
19480
+ struct gguf_context {
19481
+ struct gguf_header header;
19482
+
19483
+ struct gguf_kv * kv;
19484
+ struct gguf_tensor_info * infos;
19485
+
19486
+ size_t alignment;
19487
+ size_t offset; // offset of `data` from beginning of file
19488
+ size_t size; // size of `data` in bytes
19489
+
19490
+ //uint8_t * padding;
19491
+ void * data;
19492
+ };
19493
+
19494
+ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
19495
+ const size_t n = fread(dst, 1, size, file);
19496
+ *offset += n;
19497
+ return n == size;
19498
+ }
19499
+
19500
+ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
19501
+ p->n = 0;
19502
+ p->data = NULL;
19503
+
19504
+ bool ok = true;
19505
+
19506
+ // TODO: how to avoid mallocs for strings?
19507
+ ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
19508
+ ok = ok && gguf_fread_el(file, p->data, p->n, offset);
19509
+
19510
+ return ok;
19511
+ }
19512
+
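Note: gguf_str above is a length-prefixed string (uint32_t n followed by n bytes; the reader NUL-terminates it in memory). Taken together with the loader below, the code implies the following GGUF (version as written here) on-disk layout, reconstructed from this code rather than from a separate spec:

    uint32      magic (GGUF_MAGIC)
    uint32      version
    uint32      n_tensors
    uint32      n_kv
    n_kv x      { gguf_str key; gguf_type type; value (scalar, gguf_str, or typed array) }
    n_tensors x { gguf_str name; uint32 n_dims; uint32 ne[n_dims]; ggml_type type; uint64 offset }
    padding so the data section starts at a multiple of the alignment
                (taken from the "general.alignment" KV if present, else GGUF_DEFAULT_ALIGNMENT)
    tensor data blob; each tensor is padded to the alignment and `offset` is relative to this blob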
19513
+ struct gguf_context * gguf_init_empty(void) {
19514
+ struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19515
+
19516
+ ctx->header.magic = GGUF_MAGIC;
19517
+ ctx->header.version = GGUF_VERSION;
19518
+ ctx->header.n_tensors = 0;
19519
+ ctx->header.n_kv = 0;
19520
+
19521
+ ctx->kv = NULL;
19522
+ ctx->infos = NULL;
19523
+
19524
+ ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
19525
+ ctx->offset = 0;
19526
+ ctx->size = 0;
19527
+
19528
+ ctx->data = NULL;
19529
+
19530
+ return ctx;
19531
+ }
19532
+
19533
+ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
19534
+ FILE * file = fopen(fname, "rb");
19535
+ if (!file) {
19536
+ return NULL;
19537
+ }
19538
+
19539
+ // offset from start of file
19540
+ size_t offset = 0;
19541
+
19542
+ uint32_t magic = 0;
19543
+
19544
+ // check the magic before making allocations
19545
+ {
19546
+ gguf_fread_el(file, &magic, sizeof(magic), &offset);
19547
+
19548
+ if (magic != GGUF_MAGIC) {
19549
+ fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
19550
+ fclose(file);
19551
+ return NULL;
19552
+ }
19553
+ }
19554
+
19555
+ bool ok = true;
19556
+
19557
+ struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
19558
+
19559
+ // read the header
19560
+ {
19561
+ ctx->header.magic = magic;
19562
+
19563
+ ctx->kv = NULL;
19564
+ ctx->infos = NULL;
19565
+ ctx->data = NULL;
19566
+
19567
+ ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
19568
+ ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
19569
+ ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
19570
+
19571
+ if (!ok) {
19572
+ fprintf(stderr, "%s: failed to read header\n", __func__);
19573
+ fclose(file);
19574
+ gguf_free(ctx);
19575
+ return NULL;
19576
+ }
19577
+ }
19578
+
19579
+ // read the kv pairs
19580
+ {
19581
+ ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
19582
+
19583
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19584
+ struct gguf_kv * kv = &ctx->kv[i];
19585
+
19586
+ //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
19587
+
19588
+ ok = ok && gguf_fread_str(file, &kv->key, &offset);
19589
+ //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
19590
+ ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
19591
+
19592
+ //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
19593
+
19594
+ switch (kv->type) {
19595
+ case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
19596
+ case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
19597
+ case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
19598
+ case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
19599
+ case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
19600
+ case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
19601
+ case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
19602
+ case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
19603
+ case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
19604
+ case GGUF_TYPE_ARRAY:
19605
+ {
19606
+ ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
19607
+ ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
19608
+
19609
+ switch (kv->value.arr.type) {
19610
+ case GGUF_TYPE_UINT8:
19611
+ case GGUF_TYPE_INT8:
19612
+ case GGUF_TYPE_UINT16:
19613
+ case GGUF_TYPE_INT16:
19614
+ case GGUF_TYPE_UINT32:
19615
+ case GGUF_TYPE_INT32:
19616
+ case GGUF_TYPE_FLOAT32:
19617
+ case GGUF_TYPE_BOOL:
19618
+ {
19619
+ kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
19620
+ ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
19621
+ } break;
19622
+ case GGUF_TYPE_STRING:
19623
+ {
19624
+ kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
19625
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
19626
+ ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
19627
+ }
19628
+ } break;
19629
+ case GGUF_TYPE_ARRAY:
19630
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
19631
+ };
19632
+ } break;
19633
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
19634
+ };
19635
+
19636
+ if (!ok) {
19637
+ break;
19638
+ }
19639
+ }
19640
+
19641
+ if (!ok) {
19642
+ fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
19643
+ fclose(file);
19644
+ gguf_free(ctx);
19645
+ return NULL;
19646
+ }
19647
+ }
19648
+
19649
+ // read the tensor infos
19650
+ {
19651
+ ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
19652
+
19653
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19654
+ struct gguf_tensor_info * info = &ctx->infos[i];
19655
+
19656
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
19657
+ info->ne[j] = 1;
19658
+ }
19659
+
19660
+ ok = ok && gguf_fread_str(file, &info->name, &offset);
19661
+ ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
19662
+ for (uint32_t j = 0; j < info->n_dims; ++j) {
19663
+ ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
19664
+ }
19665
+ ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
19666
+ ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
19667
+
19668
+ if (!ok) {
19669
+ fprintf(stderr, "%s: failed to read tensor info\n", __func__);
19670
+ fclose(file);
19671
+ gguf_free(ctx);
19672
+ return NULL;
19673
+ }
19674
+ }
19675
+ }
19676
+
19677
+ ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
19678
+
19679
+ int alignment_idx = gguf_find_key(ctx, "general.alignment");
19680
+ if (alignment_idx != -1) {
19681
+ ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
19682
+ }
19683
+
19684
+ // we require the data section to be aligned, so take into account any padding
19685
+ {
19686
+ const size_t offset_pad = offset % ctx->alignment;
19687
+
19688
+ if (offset_pad != 0) {
19689
+ offset += ctx->alignment - offset_pad;
19690
+ fseek(file, offset, SEEK_SET);
19691
+ }
19692
+ }
19693
+
19694
+ // store the current file offset - this is where the data section starts
19695
+ ctx->offset = offset;
19696
+
19697
+ // compute the total size of the data section, taking into account the alignment
19698
+ {
19699
+ ctx->size = 0;
19700
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19701
+ struct gguf_tensor_info * info = &ctx->infos[i];
19702
+
19703
+ const int64_t ne =
19704
+ (int64_t) info->ne[0] *
19705
+ (int64_t) info->ne[1] *
19706
+ (int64_t) info->ne[2] *
19707
+ (int64_t) info->ne[3];
19708
+
19709
+ if (ne % ggml_blck_size(info->type) != 0) {
19710
+ fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
19711
+ __func__, info->name.data, ne, ggml_blck_size(info->type));
19712
+ fclose(file);
19713
+ gguf_free(ctx);
19714
+ return NULL;
19715
+ }
19716
+
19717
+ const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
19718
+
19719
+ ctx->size += GGML_PAD(size_cur, ctx->alignment);
19720
+ }
19721
+ }
19722
+
19723
+ // load the tensor data only if requested
19724
+ if (params.ctx != NULL) {
19725
+ // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
19726
+ // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
19727
+ // the ggml_tensor structs to the appropriate locations in the binary blob
19728
+
19729
+ // compute the exact size needed for the new ggml_context
19730
+ const size_t mem_size =
19731
+ params.no_alloc ?
19732
+ (ctx->header.n_tensors )*ggml_tensor_overhead() :
19733
+ (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
19734
+
19735
+ struct ggml_init_params pdata = {
19736
+ .mem_size = mem_size,
19737
+ .mem_buffer = NULL,
19738
+ .no_alloc = params.no_alloc,
19739
+ };
19740
+
19741
+ *params.ctx = ggml_init(pdata);
19742
+
19743
+ struct ggml_context * ctx_data = *params.ctx;
19744
+
19745
+ struct ggml_tensor * data = NULL;
19746
+
19747
+ if (params.no_alloc == false) {
19748
+ data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
19749
+
19750
+ ok = ok && data != NULL;
19751
+
19752
+ // read the binary blob with the tensor data
19753
+ ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
19754
+
19755
+ if (!ok) {
19756
+ fprintf(stderr, "%s: failed to read tensor data\n", __func__);
19757
+ fclose(file);
19758
+ ggml_free(ctx_data);
19759
+ gguf_free(ctx);
19760
+ return NULL;
19761
+ }
19762
+
19763
+ ctx->data = data->data;
19764
+ }
19765
+
19766
+ ggml_set_no_alloc(ctx_data, true);
19767
+
19768
+ // create the tensors
19769
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19770
+ const int64_t ne[GGML_MAX_DIMS] = {
19771
+ ctx->infos[i].ne[0],
19772
+ ctx->infos[i].ne[1],
19773
+ ctx->infos[i].ne[2],
19774
+ ctx->infos[i].ne[3],
19775
+ };
19776
+
19777
+ struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
19778
+
19779
+ ok = ok && cur != NULL;
19780
+
19781
+ ggml_set_name(cur, ctx->infos[i].name.data);
19782
+
19783
+ if (!ok) {
19784
+ break;
19785
+ }
19786
+
19787
+ // point the data member to the appropriate location in the binary blob using the tensor infos
19788
+ if (params.no_alloc == false) {
19789
+ //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
19790
+ cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
19791
+ }
19792
+ }
19793
+
19794
+ if (!ok) {
19795
+ fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
19796
+ fclose(file);
19797
+ ggml_free(ctx_data);
19798
+ gguf_free(ctx);
19799
+ return NULL;
19800
+ }
19801
+
19802
+ ggml_set_no_alloc(ctx_data, params.no_alloc);
19803
+ }
19804
+
19805
+ fclose(file);
19806
+
19807
+ return ctx;
19808
+ }
19809
+
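Note: a short usage sketch for the loader above. The calls are the ones defined in this diff; the struct gguf_init_params declaration itself lives in the corresponding header (only its .no_alloc and .ctx members are visible here), the file path is a placeholder, and <stdio.h> plus the ggml/gguf header are assumed to be included:

    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        .no_alloc = false,      // also read the tensor data blob
        .ctx      = &ctx_data,  // receives a ggml_context holding the created tensors
    };

    struct gguf_context * gctx = gguf_init_from_file("model.gguf", params); // placeholder path
    if (gctx != NULL) {
        printf("version %d, %d kv pairs, %d tensors\n",
               gguf_get_version(gctx), gguf_get_n_kv(gctx), gguf_get_n_tensors(gctx));
        gguf_free(gctx);
        ggml_free(ctx_data);
    }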
19810
+ void gguf_free(struct gguf_context * ctx) {
19811
+ if (ctx == NULL) {
19812
+ return;
19813
+ }
19814
+
19815
+ if (ctx->kv) {
19816
+ // free string memory - not great..
19817
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
19818
+ struct gguf_kv * kv = &ctx->kv[i];
19819
+
19820
+ if (kv->key.data) {
19821
+ free(kv->key.data);
19822
+ }
19823
+
19824
+ if (kv->type == GGUF_TYPE_STRING) {
19825
+ if (kv->value.str.data) {
19826
+ free(kv->value.str.data);
19827
+ }
19828
+ }
19829
+
19830
+ if (kv->type == GGUF_TYPE_ARRAY) {
19831
+ if (kv->value.arr.data) {
19832
+ if (kv->value.arr.type == GGUF_TYPE_STRING) {
19833
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
19834
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
19835
+ if (str->data) {
19836
+ free(str->data);
19837
+ }
19838
+ }
19839
+ }
19840
+ free(kv->value.arr.data);
19841
+ }
19842
+ }
19843
+ }
19844
+
19845
+ GGML_ALIGNED_FREE(ctx->kv);
19846
+ }
19847
+
19848
+ if (ctx->infos) {
19849
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
19850
+ struct gguf_tensor_info * info = &ctx->infos[i];
19851
+
19852
+ if (info->name.data) {
19853
+ free(info->name.data);
19854
+ }
19855
+ }
19856
+
19857
+ GGML_ALIGNED_FREE(ctx->infos);
19858
+ }
19859
+
19860
+ GGML_ALIGNED_FREE(ctx);
19861
+ }
19862
+
19863
+ const char * gguf_type_name(enum gguf_type type) {
19864
+ return GGUF_TYPE_NAME[type];
19865
+ }
19866
+
19867
+ int gguf_get_version(struct gguf_context * ctx) {
19868
+ return ctx->header.version;
19869
+ }
19870
+
19871
+ size_t gguf_get_alignment(struct gguf_context * ctx) {
19872
+ return ctx->alignment;
19873
+ }
19874
+
19875
+ size_t gguf_get_data_offset(struct gguf_context * ctx) {
19876
+ return ctx->offset;
19877
+ }
19878
+
19879
+ void * gguf_get_data(struct gguf_context * ctx) {
19880
+ return ctx->data;
19881
+ }
19882
+
19883
+ int gguf_get_n_kv(struct gguf_context * ctx) {
19884
+ return ctx->header.n_kv;
19885
+ }
19886
+
19887
+ int gguf_find_key(struct gguf_context * ctx, const char * key) {
19888
+ // return -1 if key not found
19889
+ int keyfound = -1;
19890
+
19891
+ const int n_kv = gguf_get_n_kv(ctx);
19892
+
19893
+ for (int i = 0; i < n_kv; ++i) {
19894
+ if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
19895
+ keyfound = i;
19896
+ break;
19897
+ }
19898
+ }
19899
+
19900
+ return keyfound;
19901
+ }
19902
+
19903
+ const char * gguf_get_key(struct gguf_context * ctx, int i) {
19904
+ return ctx->kv[i].key.data;
19905
+ }
19906
+
19907
+ enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
19908
+ return ctx->kv[i].type;
19909
+ }
19910
+
19911
+ enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
19912
+ return ctx->kv[i].value.arr.type;
19913
+ }
19914
+
19915
+ const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
19916
+ return ctx->kv[i].value.arr.data;
19917
+ }
19918
+
19919
+ const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
19920
+ struct gguf_kv * kv = &ctx->kv[key_id];
19921
+ struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
19922
+ return str->data;
19923
+ }
19924
+
19925
+ int gguf_get_arr_n(struct gguf_context * ctx, int i) {
19926
+ return ctx->kv[i].value.arr.n;
19927
+ }
19928
+
19929
+ uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
19930
+ return ctx->kv[i].value.uint8;
19931
+ }
19932
+
19933
+ int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
19934
+ return ctx->kv[i].value.int8;
19935
+ }
19936
+
19937
+ uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
19938
+ return ctx->kv[i].value.uint16;
19939
+ }
19940
+
19941
+ int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
19942
+ return ctx->kv[i].value.int16;
19943
+ }
19944
+
19945
+ uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
19946
+ return ctx->kv[i].value.uint32;
19947
+ }
19948
+
19949
+ int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
19950
+ return ctx->kv[i].value.int32;
19951
+ }
19952
+
19953
+ float gguf_get_val_f32(struct gguf_context * ctx, int i) {
19954
+ return ctx->kv[i].value.float32;
19955
+ }
19956
+
19957
+ bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
19958
+ return ctx->kv[i].value.bool_;
19959
+ }
19960
+
19961
+ const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
19962
+ return ctx->kv[i].value.str.data;
19963
+ }
19964
+
19965
+ int gguf_get_n_tensors(struct gguf_context * ctx) {
19966
+ return ctx->header.n_tensors;
19967
+ }
19968
+
19969
+ int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
19970
+ // return -1 if tensor not found
19971
+ int tensorfound = -1;
19972
+
19973
+ const int n_tensors = gguf_get_n_tensors(ctx);
19974
+
19975
+ for (int i = 0; i < n_tensors; ++i) {
19976
+ if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
19977
+ tensorfound = i;
19978
+ break;
19979
+ }
19980
+ }
19981
+
19982
+ return tensorfound;
19983
+ }
19984
+
19985
+ size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
19986
+ return ctx->infos[i].offset;
19987
+ }
19988
+
19989
+ char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
19990
+ return ctx->infos[i].name.data;
19991
+ }
19992
+
19993
+ // returns the index
19994
+ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
19995
+ const int idx = gguf_find_key(ctx, key);
19996
+ if (idx >= 0) {
19997
+ return idx;
19998
+ }
19999
+
20000
+ const int n_kv = gguf_get_n_kv(ctx);
20001
+
20002
+ ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
20003
+ ctx->kv[n_kv].key.n = strlen(key) + 1;
20004
+ ctx->kv[n_kv].key.data = strdup(key);
20005
+ ctx->header.n_kv++;
20006
+
20007
+ return n_kv;
20008
+ }
20009
+
20010
+ void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
20011
+ const int idx = gguf_get_or_add_key(ctx, key);
20012
+
20013
+ ctx->kv[idx].type = GGUF_TYPE_UINT8;
20014
+ ctx->kv[idx].value.uint8 = val;
20015
+ }
20016
+
20017
+ void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
20018
+ const int idx = gguf_get_or_add_key(ctx, key);
20019
+
20020
+ ctx->kv[idx].type = GGUF_TYPE_INT8;
20021
+ ctx->kv[idx].value.int8 = val;
20022
+ }
20023
+
20024
+ void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
20025
+ const int idx = gguf_get_or_add_key(ctx, key);
20026
+
20027
+ ctx->kv[idx].type = GGUF_TYPE_UINT16;
20028
+ ctx->kv[idx].value.uint16 = val;
20029
+ }
20030
+
20031
+ void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
20032
+ const int idx = gguf_get_or_add_key(ctx, key);
20033
+
20034
+ ctx->kv[idx].type = GGUF_TYPE_INT16;
20035
+ ctx->kv[idx].value.int16 = val;
20036
+ }
20037
+
20038
+ void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
20039
+ const int idx = gguf_get_or_add_key(ctx, key);
20040
+
20041
+ ctx->kv[idx].type = GGUF_TYPE_UINT32;
20042
+ ctx->kv[idx].value.uint32 = val;
20043
+ }
20044
+
20045
+ void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
20046
+ const int idx = gguf_get_or_add_key(ctx, key);
20047
+
20048
+ ctx->kv[idx].type = GGUF_TYPE_INT32;
20049
+ ctx->kv[idx].value.int32 = val;
20050
+ }
20051
+
20052
+ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
20053
+ const int idx = gguf_get_or_add_key(ctx, key);
20054
+
20055
+ ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
20056
+ ctx->kv[idx].value.float32 = val;
20057
+ }
20058
+
20059
+ void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
20060
+ const int idx = gguf_get_or_add_key(ctx, key);
20061
+
20062
+ ctx->kv[idx].type = GGUF_TYPE_BOOL;
20063
+ ctx->kv[idx].value.bool_ = val;
20064
+ }
20065
+
20066
+ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
20067
+ const int idx = gguf_get_or_add_key(ctx, key);
20068
+
20069
+ ctx->kv[idx].type = GGUF_TYPE_STRING;
20070
+ ctx->kv[idx].value.str.n = strlen(val) + 1;
20071
+ ctx->kv[idx].value.str.data = strdup(val);
20072
+ }
20073
+
20074
+ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
20075
+ const int idx = gguf_get_or_add_key(ctx, key);
20076
+
20077
+ ctx->kv[idx].type = GGUF_TYPE_ARRAY;
20078
+ ctx->kv[idx].value.arr.type = type;
20079
+ ctx->kv[idx].value.arr.n = n;
20080
+ ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
20081
+ memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
20082
+ }
20083
+
20084
+ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
20085
+ const int idx = gguf_get_or_add_key(ctx, key);
20086
+
20087
+ ctx->kv[idx].type = GGUF_TYPE_ARRAY;
20088
+ ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
20089
+ ctx->kv[idx].value.arr.n = n;
20090
+ ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
20091
+ for (int i = 0; i < n; i++) {
20092
+ struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
20093
+ str->n = strlen(data[i]) + 1;
20094
+ str->data = strdup(data[i]);
20095
+ }
20096
+ }
20097
+
20098
+ // set or add KV pairs from another context
20099
+ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
20100
+ for (uint32_t i = 0; i < src->header.n_kv; i++) {
20101
+ switch (src->kv[i].type) {
20102
+ case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
20103
+ case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
20104
+ case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
20105
+ case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
20106
+ case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
20107
+ case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
20108
+ case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
20109
+ case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
20110
+ case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
20111
+ case GGUF_TYPE_ARRAY:
20112
+ {
20113
+ if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
20114
+ const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
20115
+ for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
20116
+ data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
20117
+ }
20118
+ gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
20119
+ free(data);
20120
+ } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
20121
+ GGML_ASSERT(false && "nested arrays not supported");
20122
+ } else {
20123
+ gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
20124
+ }
20125
+ } break;
20126
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20127
+ }
20128
+ }
20129
+ }
20130
+
20131
+ void gguf_add_tensor(
20132
+ struct gguf_context * ctx,
20133
+ const struct ggml_tensor * tensor) {
20134
+ const int idx = ctx->header.n_tensors;
20135
+ ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
20136
+
20137
+ ctx->infos[idx].name.n = strlen(tensor->name) + 1;
20138
+ ctx->infos[idx].name.data = strdup(tensor->name);
20139
+
20140
+ for (int i = 0; i < GGML_MAX_DIMS; ++i) {
20141
+ ctx->infos[idx].ne[i] = 1;
20142
+ }
20143
+
20144
+ ctx->infos[idx].n_dims = tensor->n_dims;
20145
+ for (int i = 0; i < tensor->n_dims; i++) {
20146
+ ctx->infos[idx].ne[i] = tensor->ne[i];
20147
+ }
20148
+
20149
+ ctx->infos[idx].type = tensor->type;
20150
+ ctx->infos[idx].offset = 0;
20151
+ ctx->infos[idx].data = tensor->data;
20152
+ ctx->infos[idx].size = ggml_nbytes(tensor);
20153
+
20154
+ if (ctx->header.n_tensors > 0) {
20155
+ ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
20156
+ }
20157
+
20158
+ ctx->header.n_tensors++;
20159
+ }
20160
+
20161
+ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
20162
+ const int idx = gguf_find_tensor(ctx, name);
20163
+ if (idx < 0) {
20164
+ GGML_ASSERT(false && "tensor not found");
20165
+ }
20166
+
20167
+ ctx->infos[idx].type = type;
20168
+ }
20169
+
20170
+ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
20171
+ const int idx = gguf_find_tensor(ctx, name);
20172
+ if (idx < 0) {
20173
+ GGML_ASSERT(false && "tensor not found");
20174
+ }
20175
+
20176
+ ctx->infos[idx].data = data;
20177
+ ctx->infos[idx].size = size;
20178
+
20179
+ // update offsets
20180
+ for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
20181
+ ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
20182
+ }
20183
+ }
20184
+
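Note: the setter half of the API above can be used to build a gguf_context in memory before serialization. A brief sketch (key names, values and the tensor `t` are placeholders; serialization itself goes through the write path built on gguf_write_to_buf, which continues below):

    struct gguf_context * ctx_out = gguf_init_empty();

    gguf_set_val_str (ctx_out, "general.name", "my-model");  // placeholder key/value
    gguf_set_val_u32 (ctx_out, "general.alignment", 32);     // optional, read back by the loader
    gguf_set_val_bool(ctx_out, "example.flag", true);        // placeholder key

    // `t` is an existing struct ggml_tensor *; its name, shape, type, data
    // pointer and size are recorded in the tensor info list, and its offset
    // is laid out after the previously added tensors
    gguf_add_tensor(ctx_out, t);

    // ... serialize, then release the metadata context:
    gguf_free(ctx_out);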
20185
+ //static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
20186
+ // fwrite(&val->n, sizeof(val->n), 1, file);
20187
+ // fwrite(val->data, sizeof(char), val->n, file);
20188
+ //}
20189
+ //
20190
+ //static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
20191
+ // fwrite(val, sizeof(char), size, file);
20192
+ //}
20193
+
20194
+ struct gguf_buf {
20195
+ void * data;
20196
+ size_t size;
20197
+ size_t offset;
20198
+ };
20199
+
20200
+ static struct gguf_buf gguf_buf_init(size_t size) {
20201
+ struct gguf_buf buf = {
20202
+ /*buf.data =*/ size == 0 ? NULL : malloc(size),
20203
+ /*buf.size =*/ size,
20204
+ /*buf.offset =*/ 0,
20205
+ };
20206
+
20207
+ return buf;
20208
+ }
20209
+
20210
+ static void gguf_buf_free(struct gguf_buf buf) {
20211
+ if (buf.data) {
20212
+ free(buf.data);
20213
+ }
20214
+ }
20215
+
20216
+ static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
20217
+ if (buf->offset + size > buf->size) {
20218
+ buf->size = 1.5*(buf->offset + size);
20219
+ if (buf->data) {
20220
+ buf->data = realloc(buf->data, buf->size);
20221
+ }
20222
+ }
20223
+ }
20224
+
20225
+ static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
20226
+ gguf_buf_grow(buf, sizeof(val->n) + val->n);
20227
+
20228
+ if (buf->data) {
20229
+ memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
20230
+ }
20231
+ buf->offset += sizeof(val->n);
20232
+
20233
+ if (buf->data) {
20234
+ memcpy((char *) buf->data + buf->offset, val->data, val->n);
20235
+ }
20236
+ buf->offset += val->n;
20237
+ }
20238
+
20239
+ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
20240
+ gguf_buf_grow(buf, el_size);
20241
+
20242
+ if (buf->data) {
20243
+ memcpy((char *) buf->data + buf->offset, val, el_size);
20244
+ }
20245
+ buf->offset += el_size;
20246
+ }
20247
+
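// Both writers advance buf->offset unconditionally but only memcpy when buf->data
// is non-NULL. That is what makes the NULL-data "dry run" work: the final offset
// equals the number of bytes a real write would have produced.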
20248
+ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
20249
+ // write header
20250
+ gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
20251
+ gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
20252
+ gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
20253
+ gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
20254
+
20255
+ // write key-value pairs
20256
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
20257
+ struct gguf_kv * kv = &ctx->kv[i];
20258
+
20259
+ gguf_bwrite_str(buf, &kv->key);
20260
+ gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
20261
+
20262
+ switch (kv->type) {
20263
+ case GGUF_TYPE_UINT8: gguf_bwrite_el (buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
20264
+ case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
20265
+ case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
20266
+ case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
20267
+ case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
20268
+ case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
20269
+ case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
20270
+ case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
20271
+ case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
20272
+ case GGUF_TYPE_ARRAY:
20273
+ {
20274
+ gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
20275
+ gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
20276
+
20277
+ switch (kv->value.arr.type) {
20278
+ case GGUF_TYPE_UINT8:
20279
+ case GGUF_TYPE_INT8:
20280
+ case GGUF_TYPE_UINT16:
20281
+ case GGUF_TYPE_INT16:
20282
+ case GGUF_TYPE_UINT32:
20283
+ case GGUF_TYPE_INT32:
20284
+ case GGUF_TYPE_FLOAT32:
20285
+ case GGUF_TYPE_BOOL:
20286
+ {
20287
+ gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
20288
+ } break;
20289
+ case GGUF_TYPE_STRING:
20290
+ {
20291
+ for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
20292
+ gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
20293
+ }
20294
+ } break;
20295
+ case GGUF_TYPE_ARRAY:
20296
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
20297
+ };
20298
+ } break;
20299
+ case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
20300
+ };
20301
+ }
20302
+
20303
+ // write tensor infos
20304
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20305
+ struct gguf_tensor_info * info = &ctx->infos[i];
20306
+
20307
+ gguf_bwrite_str(buf, &info->name);
20308
+ gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
20309
+ for (uint32_t j = 0; j < info->n_dims; ++j) {
20310
+ gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
20311
+ }
20312
+ gguf_bwrite_el(buf, &info->type, sizeof(info->type));
20313
+ gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
20314
+ }
20315
+
20316
+ // we require the data section to be aligned, so take into account any padding
20317
+ {
20318
+ const size_t offset = buf->offset;
20319
+ const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
20320
+
20321
+ if (offset_pad != offset) {
20322
+ uint8_t pad = 0;
20323
+ for (size_t i = 0; i < offset_pad - offset; ++i) {
20324
+ gguf_bwrite_el(buf, &pad, sizeof(pad));
20325
+ }
20326
+ }
20327
+ }
20328
+
20329
+ if (only_meta) {
20330
+ return;
20331
+ }
20332
+
20333
+ size_t offset = 0;
20334
+
20335
+ // write tensor data
20336
+ for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
20337
+ struct gguf_tensor_info * info = &ctx->infos[i];
20338
+
20339
+ const size_t size = info->size;
20340
+ const size_t size_pad = GGML_PAD(size, ctx->alignment);
20341
+
20342
+ gguf_bwrite_el(buf, info->data, size);
20343
+
20344
+ if (size_pad != size) {
20345
+ uint8_t pad = 0;
20346
+ for (size_t j = 0; j < size_pad - size; ++j) {
20347
+ gguf_bwrite_el(buf, &pad, sizeof(pad));
20348
+ }
20349
+ }
20350
+
20351
+ GGML_ASSERT(offset == info->offset);
20352
+
20353
+ offset += size_pad;
20354
+ }
20355
+ }
20356
+
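// Serialized layout produced above, in order: header (magic, version, n_tensors,
// n_kv), the key-value pairs, the tensor infos (name, n_dims, ne[], type, offset),
// zero padding up to ctx->alignment, and finally -- unless only_meta is set -- each
// tensor's data, individually padded to ctx->alignment so the recorded offsets match.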
20357
+ void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
20358
+ FILE * file = fopen(fname, "wb");
20359
+ if (!file) {
20360
+ GGML_ASSERT(false && "failed to open file for writing");
20361
+ }
20362
+
20363
+ struct gguf_buf buf = gguf_buf_init(16*1024);
20364
+
20365
+ gguf_write_to_buf(ctx, &buf, only_meta);
20366
+
20367
+ fwrite(buf.data, 1, buf.offset, file);
20368
+
20369
+ gguf_buf_free(buf);
20370
+
20371
+ fclose(file);
20372
+ }
20373
+
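// A minimal end-to-end sketch of the writer API above. The file name, key and
// tensor name are illustrative, error handling is omitted, and the ggml_*/gguf_*
// declarations are assumed to come from this version's ggml.h.

#include "ggml.h"

static void write_example_gguf(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ggml = ggml_init(ip);
    struct gguf_context * gguf = gguf_init_empty();

    // one key-value pair of metadata
    gguf_set_val_u32(gguf, "example.block_count", 4);

    // one small f32 tensor, filled with a constant
    struct ggml_tensor * t = ggml_new_tensor_1d(ggml, GGML_TYPE_F32, 16);
    ggml_set_name(t, "example.weight");
    ggml_set_f32(t, 1.0f);

    gguf_add_tensor(gguf, t);                                  // record metadata + offset
    gguf_write_to_file(gguf, "example.gguf", /*only_meta =*/ false);

    gguf_free(gguf);
    ggml_free(ggml);
}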
20374
+ size_t gguf_get_meta_size(struct gguf_context * ctx) {
20375
+ // no allocs - only compute size
20376
+ struct gguf_buf buf = gguf_buf_init(0);
20377
+
20378
+ gguf_write_to_buf(ctx, &buf, true);
20379
+
20380
+ return buf.offset;
20381
+ }
20382
+
20383
+ void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
20384
+ struct gguf_buf buf = gguf_buf_init(16*1024);
20385
+
20386
+ gguf_write_to_buf(ctx, &buf, true);
20387
+
20388
+ memcpy(data, buf.data, buf.offset);
20389
+
20390
+ gguf_buf_free(buf);
20391
+ }
20392
+
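// gguf_get_meta_size and gguf_get_meta_data are meant to be paired: the first does a
// size-only pass (zero-sized gguf_buf), the second serializes the same metadata into a
// caller-provided buffer of at least that size. A minimal sketch, assuming `gguf` is a
// populated gguf_context:
//
//     const size_t meta_size = gguf_get_meta_size(gguf);
//     void * meta = malloc(meta_size);
//     gguf_get_meta_data(gguf, meta);
//     // ... use meta/meta_size, e.g. to hand just the header + tensor infos elsewhere ...
//     free(meta);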
20393
+ ////////////////////////////////////////////////////////////////////////////////
20394
+
18564
20395
  int ggml_cpu_has_avx(void) {
18565
20396
  #if defined(__AVX__)
18566
20397
  return 1;