llama_cpp 0.3.7 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
```diff
@@ -213,8 +213,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 error_desc = "insufficient memory";
                 break;
         }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
-                __func__, error_desc, size/(1024.0*1024.0));
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
@@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_I8] = {
+        .type_name    = "i8",
+        .blck_size    = 1,
+        .type_size    = sizeof(int8_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_I16] = {
+        .type_name    = "i16",
+        .blck_size    = 1,
+        .type_size    = sizeof(int16_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_I32] = {
+        .type_name    = "i32",
+        .blck_size    = 1,
+        .type_size    = sizeof(int32_t),
+        .is_quantized = false,
+    },
     [GGML_TYPE_F32] = {
+        .type_name    = "f32",
+        .blck_size    = 1,
+        .type_size    = sizeof(float),
+        .is_quantized = false,
         .vec_dot      = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
     },
     [GGML_TYPE_F16] = {
+        .type_name    = "f16",
+        .blck_size    = 1,
+        .type_size    = sizeof(ggml_fp16_t),
+        .is_quantized = false,
         .to_float             = (ggml_to_float_t) ggml_fp16_to_fp32_row,
         .from_float           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_F16,
     },
     [GGML_TYPE_Q4_0] = {
+        .type_name    = "q4_0",
+        .blck_size    = QK4_0,
+        .type_size    = sizeof(block_q4_0),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q4_0,
         .from_float           = quantize_row_q4_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
+        .type_name    = "q4_1",
+        .blck_size    = QK4_1,
+        .type_size    = sizeof(block_q4_1),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float           = quantize_row_q4_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q5_0] = {
+        .type_name    = "q5_0",
+        .blck_size    = QK5_0,
+        .type_size    = sizeof(block_q5_0),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q5_0,
         .from_float           = quantize_row_q5_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q5_1] = {
+        .type_name    = "q5_1",
+        .blck_size    = QK5_1,
+        .type_size    = sizeof(block_q5_1),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q5_1,
         .from_float           = quantize_row_q5_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q8_0] = {
+        .type_name    = "q8_0",
+        .blck_size    = QK8_0,
+        .type_size    = sizeof(block_q8_0),
+        .is_quantized = true,
         .to_float             = dequantize_row_q8_0,
         .from_float           = quantize_row_q8_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q8_1] = {
+        .type_name    = "q8_1",
+        .blck_size    = QK8_1,
+        .type_size    = sizeof(block_q8_1),
+        .is_quantized = true,
         .from_float           = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type         = GGML_TYPE_Q8_1,
     },
 #ifdef GGML_USE_K_QUANTS
     [GGML_TYPE_Q2_K] = {
+        .type_name    = "q2_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q2_K),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q2_K,
         .from_float           = quantize_row_q2_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q3_K] = {
+        .type_name    = "q3_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q3_K),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q3_K,
         .from_float           = quantize_row_q3_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q4_K] = {
+        .type_name    = "q4_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q4_K),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q4_K,
         .from_float           = quantize_row_q4_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q5_K] = {
+        .type_name    = "q5_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q5_K),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q5_K,
         .from_float           = quantize_row_q5_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q6_K] = {
+        .type_name    = "q6_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q6_K),
+        .is_quantized = true,
         .to_float             = (ggml_to_float_t) dequantize_row_q6_K,
         .from_float           = quantize_row_q6_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q8_K] = {
+        .type_name    = "q8_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q8_K),
+        .is_quantized = true,
         .from_float = quantize_row_q8_K,
     }
 #endif
 };
 
 // For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
-    GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return type_traits[i];
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+    GGML_ASSERT(type < GGML_TYPE_COUNT);
+    return type_traits[type];
 }
```
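All per-type metadata (name, block size, byte size, quantized flag) now lives in this single `type_traits` table; the four parallel arrays it replaces are deleted further down in this diff. A minimal sketch of how callers can query the consolidated metadata through the public accessors this diff rewires (the helper itself is hypothetical, not part of ggml):

```c
#include <stdio.h>
#include "ggml.h"

// Hypothetical helper: prints the metadata that used to be spread across
// GGML_TYPE_NAME, GGML_BLCK_SIZE, GGML_TYPE_SIZE and GGML_IS_QUANTIZED.
static void print_type_info(enum ggml_type type) {
    printf("%s: %d elements per block, %zu bytes per block, quantized: %s\n",
           ggml_type_name(type),
           ggml_blck_size(type),
           ggml_type_size(type),
           ggml_is_quantized(type) ? "yes" : "no");
}

int main(void) {
    // e.g. "q4_0: 32 elements per block, 18 bytes per block, quantized: yes"
    print_type_info(GGML_TYPE_Q4_0);
    return 0;
}
```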
```diff
@@ -3481,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 
-static const float GELU_COEF_A    = 0.044715f;
-static const float GELU_QUICK_COEF    = -1.702f;
-static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
+static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
 
 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
```
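For reference, these constants implement the usual tanh approximation GELU(x) ≈ 0.5·x·(1 + tanh(√(2/π)·(x + 0.044715·x³))), with √(2/π) ≈ 0.79788456; the values are unchanged by this release, only the alignment of the definitions differs.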
```diff
@@ -3652,95 +3725,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
 // data types
 //
 
-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = QK4_0,
-    [GGML_TYPE_Q4_1] = QK4_1,
-    [GGML_TYPE_Q5_0] = QK5_0,
-    [GGML_TYPE_Q5_1] = QK5_1,
-    [GGML_TYPE_Q8_0] = QK8_0,
-    [GGML_TYPE_Q8_1] = QK8_1,
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = QK_K,
-    [GGML_TYPE_Q3_K] = QK_K,
-    [GGML_TYPE_Q4_K] = QK_K,
-    [GGML_TYPE_Q5_K] = QK_K,
-    [GGML_TYPE_Q6_K] = QK_K,
-    [GGML_TYPE_Q8_K] = QK_K,
-#endif
-    [GGML_TYPE_I8]   = 1,
-    [GGML_TYPE_I16]  = 1,
-    [GGML_TYPE_I32]  = 1,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
-    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
-    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
-    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
-    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
-    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
-    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
-#endif
-    [GGML_TYPE_I8]   = sizeof(int8_t),
-    [GGML_TYPE_I16]  = sizeof(int16_t),
-    [GGML_TYPE_I32]  = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_Q2_K] = "q2_K",
-    [GGML_TYPE_Q3_K] = "q3_K",
-    [GGML_TYPE_Q4_K] = "q4_K",
-    [GGML_TYPE_Q5_K] = "q5_K",
-    [GGML_TYPE_Q6_K] = "q6_K",
-    [GGML_TYPE_Q8_K] = "q8_K",
-    [GGML_TYPE_I8]   = "i8",
-    [GGML_TYPE_I16]  = "i16",
-    [GGML_TYPE_I32]  = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_Q2_K] = true,
-    [GGML_TYPE_Q3_K] = true,
-    [GGML_TYPE_Q4_K] = true,
-    [GGML_TYPE_Q5_K] = true,
-    [GGML_TYPE_Q6_K] = true,
-    [GGML_TYPE_Q8_K] = true,
-    [GGML_TYPE_I8]   = false,
-    [GGML_TYPE_I16]  = false,
-    [GGML_TYPE_I32]  = false,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
 static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
@@ -3760,10 +3744,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ARGMAX",
     "REPEAT",
     "REPEAT_BACK",
+    "CONCAT",
     "SILU_BACK",
     "NORM",
     "RMS_NORM",
     "RMS_NORM_BACK",
+    "GROUP_NORM",
 
     "MUL_MAT",
     "OUT_PROD",
@@ -3789,20 +3775,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CLAMP",
     "CONV_1D",
     "CONV_2D",
+    "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
+    "UPSCALE",
 
     "FLASH_ATTN",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
     "WIN_PART",
     "WIN_UNPART",
+    "GET_REL_POS",
+    "ADD_REL_POS",
 
     "UNARY",
 
     "MAP_UNARY",
     "MAP_BINARY",
 
+    "MAP_CUSTOM1_F32",
+    "MAP_CUSTOM2_F32",
+    "MAP_CUSTOM3_F32",
+
     "MAP_CUSTOM1",
     "MAP_CUSTOM2",
     "MAP_CUSTOM3",
@@ -3811,7 +3805,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3832,10 +3826,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "argmax(x)",
     "repeat(x)",
     "repeat_back(x)",
+    "concat(x, y)",
     "silu_back(x)",
     "norm(x)",
     "rms_norm(x)",
     "rms_norm_back(x)",
+    "group_norm(x)",
 
     "X*Y",
     "X*Y",
@@ -3861,20 +3857,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "clamp(x)",
     "conv_1d(x)",
     "conv_2d(x)",
+    "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
+    "upscale(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
     "win_part(x)",
     "win_unpart(x)",
+    "get_rel_pos(x)",
+    "add_rel_pos(x)",
 
     "unary(x)",
 
     "f(x)",
     "f(x,y)",
 
+    "custom_f32(x)",
+    "custom_f32(x,y)",
+    "custom_f32(x,y,z)",
+
     "custom(x)",
     "custom(x,y)",
     "custom(x,y,z)",
@@ -3883,7 +3887,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -3913,8 +3917,10 @@ static void ggml_setup_op_has_task_pass(void) {
         p[GGML_OP_DIAG_MASK_ZERO        ] = true;
         p[GGML_OP_CONV_1D               ] = true;
         p[GGML_OP_CONV_2D               ] = true;
+        p[GGML_OP_CONV_TRANSPOSE_2D     ] = true;
        p[GGML_OP_FLASH_ATTN_BACK       ] = true;
         p[GGML_OP_CROSS_ENTROPY_LOSS    ] = true;
+        p[GGML_OP_ADD_REL_POS           ] = true;
     }
 
     { // FINALIZE
```
```diff
@@ -4110,29 +4116,37 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+}
+
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
 }
 
 int ggml_blck_size(enum ggml_type type) {
-    return GGML_BLCK_SIZE[type];
+    return type_traits[type].blck_size;
 }
 
 size_t ggml_type_size(enum ggml_type type) {
-    return GGML_TYPE_SIZE[type];
+    return type_traits[type].type_size;
 }
 
 float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
 const char * ggml_type_name(enum ggml_type type) {
-    return GGML_TYPE_NAME[type];
+    return type_traits[type].type_name;
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+    return type_traits[type].is_quantized;
 }
 
 const char * ggml_op_name(enum ggml_op op) {
@@ -4144,7 +4158,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
 }
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return GGML_TYPE_SIZE[tensor->type];
+    return ggml_type_size(tensor->type);
 }
 
 static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4196,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
            (t0->ne[3] == t1->ne[3]);
 }
 
-bool ggml_is_quantized(enum ggml_type type) {
-    return GGML_IS_QUANTIZED[type];
-}
-
 enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     enum ggml_type wtype = GGML_TYPE_COUNT;
 
@@ -4223,8 +4233,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4233,7 +4243,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4248,7 +4258,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4567,7 +4577,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     size_t data_size = 0;
 
     if (data == NULL && !ctx->no_alloc) {
-        data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+        data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
         for (int i = 1; i < n_dims; i++) {
             data_size *= ne[i];
         }
@@ -4622,8 +4632,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         result->ne[i] = ne[i];
     }
 
-    result->nb[0] = GGML_TYPE_SIZE[type];
-    result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+    result->nb[0] = ggml_type_size(type);
+    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
```
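A worked example of the stride scheme above for a quantized type, using this version's q4_0 format (32 elements per 18-byte block):

```c
// For a 4096 x 64 matrix of type GGML_TYPE_Q4_0:
//   nb[0] = ggml_type_size(GGML_TYPE_Q4_0)                   = 18
//   nb[1] = nb[0] * (ne[0] / ggml_blck_size(GGML_TYPE_Q4_0)) = 18 * (4096 / 32) = 2304
// so each row occupies 2304 bytes, i.e. 4.5 bits per element on average.
```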
```diff
@@ -5545,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
         is_node = true;
     }
 
-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
 
     result->op = GGML_OP_REPEAT;
@@ -5587,6 +5593,30 @@ struct ggml_tensor * ggml_repeat_back(
     return result;
 }
 
+// ggml_concat
+
+struct ggml_tensor* ggml_concat(
+    struct ggml_context* ctx,
+    struct ggml_tensor* a,
+    struct ggml_tensor* b) {
+    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+
+    result->op = GGML_OP_CONCAT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_abs
 
 struct ggml_tensor * ggml_abs(
```
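The new `ggml_concat` joins two tensors along dimension 2 and requires the other three dimensions to match. A hedged usage sketch (shapes are illustrative):

```c
// Concatenate two f32 tensors along ne[2] (e.g. a channel axis).
struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 256, 1);
struct ggml_tensor * y = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64,  64, 1);
struct ggml_tensor * z = ggml_concat(ctx, x, y); // -> 64 x 64 x 320 x 1
```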
```diff
@@ -5755,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
 static struct ggml_tensor * ggml_norm_impl(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
+    float eps,
     bool inplace) {
     bool is_node = false;
 
@@ -5765,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    // TODO: maybe store epsilon here?
+    ggml_set_op_params(result, &eps, sizeof(eps));
 
     result->op = GGML_OP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5776,16 +5807,20 @@ static struct ggml_tensor * ggml_norm_impl(
 
 struct ggml_tensor * ggml_norm(
     struct ggml_context * ctx,
-    struct ggml_tensor * a) {
-    return ggml_norm_impl(ctx, a, false);
+    struct ggml_tensor * a,
+    float eps) {
+    return ggml_norm_impl(ctx, a, eps, false);
 }
 
 struct ggml_tensor * ggml_norm_inplace(
     struct ggml_context * ctx,
-    struct ggml_tensor * a) {
-    return ggml_norm_impl(ctx, a, true);
+    struct ggml_tensor * a,
+    float eps) {
+    return ggml_norm_impl(ctx, a, eps, true);
 }
 
+// ggml_rms_norm
+
 static struct ggml_tensor * ggml_rms_norm_impl(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
@@ -5822,6 +5857,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
     return ggml_rms_norm_impl(ctx, a, eps, true);
 }
 
+// ggml_rms_norm_back
+
 struct ggml_tensor * ggml_rms_norm_back(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
@@ -5843,6 +5880,44 @@ struct ggml_tensor * ggml_rms_norm_back(
     return result;
 }
 
+// ggml_group_norm
+
+static struct ggml_tensor * ggml_group_norm_impl(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups,
+    bool inplace) {
+
+    bool is_node = false;
+    if (!inplace && (a->grad)) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_GROUP_NORM;
+    result->op_params[0] = n_groups;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL; // TODO: maybe store epsilon here?
+
+    return result;
+}
+
+struct ggml_tensor * ggml_group_norm(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups) {
+    return ggml_group_norm_impl(ctx, a, n_groups, false);
+}
+
+struct ggml_tensor * ggml_group_norm_inplace(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups) {
+    return ggml_group_norm_impl(ctx, a, n_groups, true);
+}
 
 // ggml_mul_mat
 
```
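Note the signature break: `ggml_norm` and `ggml_norm_inplace` now take the epsilon explicitly instead of hard-coding it in the compute kernel, so existing call sites must be updated along these lines (the value shown is illustrative):

```c
// before (0.3.7):  cur = ggml_norm(ctx, cur);
// after  (0.4.0):
cur = ggml_norm(ctx, cur, 1e-5f); // eps travels to the kernel via op_params
```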
```diff
@@ -6711,6 +6786,8 @@ static struct ggml_tensor * ggml_rope_impl(
     int n_ctx,
     float freq_base,
     float freq_scale,
+    float xpos_base,
+    bool xpos_down,
     bool inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6721,9 +6798,11 @@ static struct ggml_tensor * ggml_rope_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[6] = { n_past, n_dims, mode, n_ctx };
+    int32_t params[8] = { n_past, n_dims, mode, n_ctx };
     memcpy(params + 4, &freq_base, sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base, sizeof(float));
+    memcpy(params + 7, &xpos_down, sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
@@ -6740,7 +6819,7 @@ struct ggml_tensor * ggml_rope(
     int n_dims,
     int mode,
     int n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6750,7 +6829,7 @@ struct ggml_tensor * ggml_rope_inplace(
     int n_dims,
     int mode,
     int n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
 }
 
 struct ggml_tensor * ggml_rope_custom(
@@ -6762,7 +6841,7 @@ struct ggml_tensor * ggml_rope_custom(
     int n_ctx,
     float freq_base,
     float freq_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
 }
 
 struct ggml_tensor * ggml_rope_custom_inplace(
@@ -6774,7 +6853,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
     int n_ctx,
     float freq_base,
     float freq_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
+}
+
+struct ggml_tensor * ggml_rope_xpos_inplace(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_past,
+    int n_dims,
+    float base,
+    bool down) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
 }
 
 // ggml_rope_back
@@ -6785,7 +6874,11 @@ struct ggml_tensor * ggml_rope_back(
     int n_past,
     int n_dims,
     int mode,
-    int n_ctx) {
+    int n_ctx,
+    float freq_base,
+    float freq_scale,
+    float xpos_base,
+    bool xpos_down) {
     GGML_ASSERT(n_past >= 0);
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
@@ -6797,7 +6890,11 @@ struct ggml_tensor * ggml_rope_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past, n_dims, mode, n_ctx };
+    int32_t params[8] = { n_past, n_dims, mode, n_ctx };
+    memcpy(params + 4, &freq_base, sizeof(float));
+    memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base, sizeof(float));
+    memcpy(params + 7, &xpos_down, sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE_BACK;
```
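`ggml_rope_xpos_inplace` is the new entry point for xPos-scaled RoPE; it routes through the shared implementation with mode 0 and records the decay base and direction in `op_params`. A hedged sketch of how a caller might apply it to attention inputs (tensor and parameter names are illustrative):

```c
// Queries are scaled up, keys down (down = true inverts the zeta factor),
// the usual xPos pairing; 512.0f is an illustrative decay base.
Qcur = ggml_rope_xpos_inplace(ctx, Qcur, n_past, n_rot, 512.0f, false);
Kcur = ggml_rope_xpos_inplace(ctx, Kcur, n_past, n_rot, 512.0f, true);
```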
```diff
@@ -6904,6 +7001,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
     return result;
 }
 
+// ggml_conv_1d_ph
+
+struct ggml_tensor* ggml_conv_1d_ph(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int s,
+    int d) {
+    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+}
+
 // ggml_conv_2d
 
 struct ggml_tensor * ggml_conv_2d(
@@ -6944,17 +7052,59 @@ struct ggml_tensor * ggml_conv_2d(
 
 }
 
-// ggml_conv_1d_ph
+// ggml_conv_2d_sk_p0
 
-struct ggml_tensor * ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_2d_sk_p0(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
-    struct ggml_tensor * b,
-    int s,
-    int d) {
-    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+    struct ggml_tensor * b) {
+    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+}
+
+// ggml_conv_2d_s1_ph
+
+struct ggml_tensor * ggml_conv_2d_s1_ph(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b) {
+    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
 }
 
+// ggml_conv_transpose_2d_p0
+
+static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+    return (ins - 1) * s - 2 * p + ks;
+}
+
+struct ggml_tensor * ggml_conv_transpose_2d_p0(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int stride) {
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = {
+        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
+        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
+        a->ne[2], b->ne[3],
+    };
+
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    result->op = GGML_OP_CONV_TRANSPOSE_2D;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = ggml_new_i32(ctx, stride);
+
+    return result;
+}
 
 // ggml_pool_*
 
```
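`ggml_calc_conv_transpose_output_size` is the standard transposed-convolution size formula, out = (in − 1)·stride − 2·padding + kernel. With the zero padding used by `ggml_conv_transpose_2d_p0`, an illustrative 32×32 input with a 4×4 kernel at stride 2 produces (32 − 1)·2 + 4 = 66 along each spatial dimension.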
```diff
@@ -7032,6 +7182,40 @@ struct ggml_tensor * ggml_pool_2d(
     return result;
 }
 
+// ggml_upscale
+
+static struct ggml_tensor * ggml_upscale_impl(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] * scale_factor,
+            a->ne[1] * scale_factor,
+            a->ne[2], a->ne[3]);
+
+    result->op = GGML_OP_UPSCALE;
+    result->op_params[0] = scale_factor;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_upscale(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    return ggml_upscale_impl(ctx, a, scale_factor);
+}
+
 // ggml_flash_attn
 
 struct ggml_tensor * ggml_flash_attn(
```
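`ggml_upscale` stretches the two leading (spatial) dimensions by an integer factor and leaves the rest untouched; a hedged one-line usage sketch (tensor names are illustrative):

```c
struct ggml_tensor * up = ggml_upscale(ctx, feat, 2); // W x H x C x N -> 2W x 2H x C x N
```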
```diff
@@ -7230,6 +7414,87 @@ struct ggml_tensor * ggml_win_unpart(
     return result;
 }
 
+// ggml_get_rel_pos
+
+struct ggml_tensor * ggml_get_rel_pos(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int qh,
+    int kh) {
+    GGML_ASSERT(qh == kh);
+    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
+
+    result->op = GGML_OP_GET_REL_POS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
+
+    return result;
+}
+
+// ggml_add_rel_pos
+
+static struct ggml_tensor * ggml_add_rel_pos_impl(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * pw,
+    struct ggml_tensor * ph,
+    bool inplace) {
+    GGML_ASSERT(ggml_are_same_shape(pw, ph));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(pw));
+    GGML_ASSERT(ggml_is_contiguous(ph));
+    GGML_ASSERT(ph->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->ne[3] == a->ne[2]);
+    GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
+    GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || pw->grad || ph->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
+
+    result->op = GGML_OP_ADD_REL_POS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = pw;
+    result->src[2] = ph;
+
+    return result;
+}
+
+
+struct ggml_tensor * ggml_add_rel_pos(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * pw,
+    struct ggml_tensor * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
+}
+
+struct ggml_tensor * ggml_add_rel_pos_inplace(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * pw,
+    struct ggml_tensor * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
+}
+
 // gmml_unary
 
 static struct ggml_tensor * ggml_unary_impl(
@@ -7745,7 +8010,7 @@ static void ggml_compute_forward_dup_same_cont(
         memcpy(
             ((char *) dst->data + ie0*nb0),
             ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+            (ie1 - ie0) * ggml_type_size(src0->type));
     }
 
 }
@@ -7779,7 +8044,7 @@ static void ggml_compute_forward_dup_f16(
 
     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +8102,7 @@ static void ggml_compute_forward_dup_f16(
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;
 
                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8315,7 @@ static void ggml_compute_forward_dup_f32(
 
     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8354,7 @@ static void ggml_compute_forward_dup_f32(
             ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
 
             size_t id = 0;
-            size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+            size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
            char * dst_ptr = (char *) dst->data;
 
             for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8766,7 @@ static void ggml_compute_forward_add_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
@@ -8775,7 +9040,7 @@ static void ggml_compute_forward_add1_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
     // we don't support permuted src0
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 <= nb1);
@@ -9137,6 +9402,8 @@ static void ggml_compute_forward_mul(
     const struct ggml_tensor * src0,
     const struct ggml_tensor * src1,
     struct ggml_tensor * dst) {
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
@@ -9731,6 +9998,72 @@ static void ggml_compute_forward_repeat_back(
     }
 }
 
+// ggml_compute_forward_concat
+
+static void ggml_compute_forward_concat_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    const struct ggml_tensor * src1,
+    struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2++) {
+            if (i2 < ne02) { // src0
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            } // src1
+            else {
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat(
+    const struct ggml_compute_params* params,
+    const struct ggml_tensor* src0,
+    const struct ggml_tensor* src1,
+    struct ggml_tensor* dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_concat_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_abs
 
 static void ggml_compute_forward_abs_f32(
@@ -10285,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS;
 
-    const float eps = 1e-5f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10334,6 +10668,8 @@ static void ggml_compute_forward_norm(
     }
 }
 
+// ggml_compute_forward_group_rms_norm
+
 static void ggml_compute_forward_rms_norm_f32(
     const struct ggml_compute_params * params,
     const struct ggml_tensor * src0,
@@ -10398,7 +10734,6 @@ static void ggml_compute_forward_rms_norm(
     }
 }
 
-
 static void ggml_compute_forward_rms_norm_back_f32(
     const struct ggml_compute_params * params,
     const struct ggml_tensor * src0,
@@ -10572,16 +10907,106 @@ static void ggml_compute_forward_rms_norm_back(
     }
 }
 
-// ggml_compute_forward_mul_mat
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    //const int64_t ne00 = src0->ne[0];
+// ggml_compute_forward_group_norm
+
+static void ggml_compute_forward_group_norm_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    const float eps = 1e-6f; // TODO: make this a parameter
+
+    // TODO: optimize
+
+    int n_channels = src0->ne[2];
+    int n_groups = dst->op_params[0];
+    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+    for (int i = ith; i < n_groups; i+=nth) {
+        int start = i * n_channels_per_group;
+        int end = start + n_channels_per_group;
+        if (end > n_channels) {
+            end = n_channels;
+        }
+        int step = end - start;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            ggml_float sum = 0.0;
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        sum += (ggml_float)x[i00];
+                    }
+                }
+            }
+            float mean = sum / (ne00 * ne01 * step);
+            ggml_float sum2 = 0.0;
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        float v = x[i00] - mean;
+                        y[i00] = v;
+                        sum2 += (ggml_float)(v * v);
+                    }
+                }
+            }
+            float variance = sum2 / (ne00 * ne01 * step);
+            const float scale = 1.0f / sqrtf(variance + eps);
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+                    ggml_vec_scale_f32(ne00, y, scale);
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_group_norm(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_group_norm_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_mul_mat
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+// helper function to determine if it is better to use BLAS or not
+// for large matrices, BLAS is faster
+static bool ggml_compute_forward_mul_mat_use_blas(
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
 
     const int64_t ne10 = src1->ne[0];
@@ -10629,7 +11054,7 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(ne3 == ne13);
 
     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
@@ -10638,6 +11063,10 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
@@ -10657,11 +11086,6 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        // ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
         if (params->ith != 0) {
             return;
         }
@@ -10674,12 +11098,16 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+        for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                // broadcast src0 into src1 across 2nd,3rd dimension
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
 
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -10687,7 +11115,7 @@ static void ggml_compute_forward_mul_mat(
 
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((char *) x + i01*nb01, wdata + id, ne00);
+                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }
 
@@ -10712,7 +11140,7 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10732,7 +11160,7 @@ static void ggml_compute_forward_mul_mat(
     }
 
     const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
     const int64_t nr0 = ne01; // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@@ -10767,10 +11195,6 @@ static void ggml_compute_forward_mul_mat(
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
 
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
     // block-tiling attempt
     const int64_t blck_0 = 16;
     const int64_t blck_1 = 16;
@@ -11205,7 +11629,7 @@ static void ggml_compute_forward_get_rows_q(
 
     assert( dst->ne[0] == nc);
     assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
+    assert(src0->nb[0] == ggml_type_size(type));
 
     for (int i = 0; i < nr; ++i) {
         const int r = ((int32_t *) src1->data)[i];
@@ -11926,7 +12350,6 @@ static void ggml_compute_forward_alibi(
     }
 }
 
-
 // ggml_compute_forward_clamp
 
 static void ggml_compute_forward_clamp_f32(
```
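Hoisting the broadcast factors to the top of `ggml_compute_forward_mul_mat` lets the BLAS path reuse them: each src1 slice (i12, i13) now maps back to src0 slice (i12/r2, i13/r3) instead of requiring matching shapes. Illustrative numbers:

```c
// src0 holds 1 slice along dim 2, src1 holds 8 (ne02 = 1, ne12 = 8):
//   r2 = ne12/ne02 = 8
//   src1 slice i12 = 5  ->  src0 slice i02 = 5/8 = 0   (weights are shared)
```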
@@ -12015,12 +12438,18 @@ static void ggml_compute_forward_rope_f32(
|
|
12015
12438
|
float freq_base;
|
12016
12439
|
float freq_scale;
|
12017
12440
|
|
12441
|
+
// these two only relevant for xPos RoPE:
|
12442
|
+
float xpos_base;
|
12443
|
+
bool xpos_down;
|
12444
|
+
|
12018
12445
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
12019
12446
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
12020
12447
|
const int mode = ((int32_t *) dst->op_params)[2];
|
12021
12448
|
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
12022
12449
|
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
12023
12450
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
12451
|
+
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
|
12452
|
+
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
|
12024
12453
|
|
12025
12454
|
assert(n_past >= 0);
|
12026
12455
|
|
@@ -12092,6 +12521,9 @@ static void ggml_compute_forward_rope_f32(
|
|
12092
12521
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
12093
12522
|
const float cos_theta = cosf(theta);
|
12094
12523
|
const float sin_theta = sinf(theta);
|
12524
|
+
// zeta scaling for xPos only:
|
12525
|
+
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
|
12526
|
+
if (xpos_down) zeta = 1.0f / zeta;
|
12095
12527
|
|
12096
12528
|
theta *= theta_scale;
|
12097
12529
|
|
@@ -12101,11 +12533,11 @@ static void ggml_compute_forward_rope_f32(
|
|
12101
12533
|
const float x0 = src[0];
|
12102
12534
|
const float x1 = src[1];
|
12103
12535
|
|
12104
|
-
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
12105
|
-
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
12536
|
+
dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
|
12537
|
+
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
|
12106
12538
|
}
|
12107
12539
|
} else {
|
12108
|
-
// TODO: this
|
12540
|
+
// TODO: this might be wrong for ne0 != n_dims - need double check
|
12109
12541
|
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
12110
12542
|
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
12111
12543
|
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
@@ -12234,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
|
|
12234
12666
|
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
12235
12667
|
}
|
12236
12668
|
} else {
|
12237
|
-
// TODO: this
|
12669
|
+
// TODO: this might be wrong for ne0 != n_dims - need double check
|
12238
12670
|
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
12239
12671
|
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
12240
12672
|
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
@@ -12296,9 +12728,21 @@ static void ggml_compute_forward_rope_back_f32(
|
|
12296
12728
|
// dx = rope_back(dy, src1)
|
12297
12729
|
// src0 is dy, src1 contains options
|
12298
12730
|
|
12731
|
+
float freq_base;
|
12732
|
+
float freq_scale;
|
12733
|
+
|
12734
|
+
// these two only relevant for xPos RoPE:
|
12735
|
+
float xpos_base;
|
12736
|
+
bool xpos_down;
|
12737
|
+
|
12299
12738
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
12300
12739
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
12301
12740
|
const int mode = ((int32_t *) dst->op_params)[2];
|
12741
|
+
const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
|
12742
|
+
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
12743
|
+
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
12744
|
+
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
|
12745
|
+
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
|
12302
12746
|
|
12303
12747
|
assert(n_past >= 0);
|
12304
12748
|
|
@@ -12324,7 +12768,7 @@ static void ggml_compute_forward_rope_back_f32(
|
|
12324
12768
|
// row index used to determine which thread to use
|
12325
12769
|
int ir = 0;
|
12326
12770
|
|
12327
|
-
const float theta_scale = powf(
|
12771
|
+
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
12328
12772
|
|
12329
12773
|
const bool is_neox = mode & 2;
|
12330
12774
|
|
@@ -12335,12 +12779,15 @@ static void ggml_compute_forward_rope_back_f32(
|
|
12335
12779
|
if (ir++ < ir0) continue;
|
12336
12780
|
if (ir > ir1) break;
|
12337
12781
|
|
12338
|
-
float theta = (float)p;
|
12782
|
+
float theta = freq_scale * (float)p;
|
12339
12783
|
|
12340
12784
|
if (!is_neox) {
|
12341
12785
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
12342
12786
|
const float cos_theta = cosf(theta);
|
12343
12787
|
const float sin_theta = sinf(theta);
|
12788
|
+
// zeta scaling for xPos only:
|
12789
|
+
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
|
12790
|
+
if (xpos_down) zeta = 1.0f / zeta;
|
12344
12791
|
|
12345
12792
|
theta *= theta_scale;
|
12346
12793
|
|
@@ -12350,8 +12797,8 @@ static void ggml_compute_forward_rope_back_f32(
|
|
12350
12797
|
const float dy0 = dy[0];
|
12351
12798
|
const float dy1 = dy[1];
|
12352
12799
|
|
12353
|
-
dx[0] = dy0*cos_theta + dy1*sin_theta;
|
12354
|
-
dx[1] = - dy0*sin_theta + dy1*cos_theta;
|
12800
|
+
dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
|
12801
|
+
dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
|
12355
12802
|
}
|
12356
12803
|
} else {
|
12357
12804
|
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
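The zeta factor threaded through both the forward and backward RoPE kernels above is the xPos decay named in the code comments. Transcribed out of the diff, with d = ne0 (the rotated width), i = i0 (the dimension-pair index) and m = n_past + i2 (the absolute position):

    zeta(i, m) = ((i + 0.4*d) / (1.4*d)) ^ (m / xpos_base)

With xpos_base == 0 the factor is fixed at 1 and plain RoPE behavior is unchanged; xpos_down swaps in 1/zeta, which is presumably how the query and key sides obtain their opposite decay directions.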
@@ -13044,6 +13491,108 @@ static void ggml_compute_forward_conv_2d(
     }
 }
 
+// ggml_compute_forward_conv_transpose_2d
+
+static void ggml_compute_forward_conv_transpose_2d(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02*ne03;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        memset(params->wdata, 0, params->wsize);
+
+        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+                    for (int64_t i01 = 0; i01 < ne01; i01++) {
+                        for (int64_t i00 = 0; i00 < ne00; i00++) {
+                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
+                        }
+                    }
+                }
+            }
+        }
+
+        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            for (int i12 = 0; i12 < ne12; i12++) {
+                for (int i11 = 0; i11 < ne11; i11++) {
+                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
+                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+                    for (int i10 = 0; i10 < ne10; i10++) {
+                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
+                    }
+                }
+            }
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int32_t stride = ((const int32_t*)(opt0->data))[0];
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+    ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
+
+    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+        for (int i11 = 0; i11 < ne11; i11++) {
+            for (int i10 = 0; i10 < ne10; i10++) {
+                const int i1n = i11*ne10*ne12 + i10*ne12;
+                for (int i01 = 0; i01 < ne01; i01++) {
+                    for (int i00 = 0; i00 < ne00; i00++) {
+                        float v = 0;
+                        ggml_vec_dot_f16(ne03, &v,
+                                (ggml_fp16_t *) wdata_src + i1n,
+                                (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
+
+                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
+                    }
+                }
+            }
+        }
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
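The new kernel above computes a strided 2D transposed convolution: during GGML_TASK_INIT it permutes the fp16 kernel and the fp32 source into two fp16 staging buffers, then each output channel accumulates ggml_vec_dot_f16 products over input channels and scatters them by stride. A minimal single-channel scalar sketch of the same scatter pattern (plain float arrays and illustrative names, not the ggml API):

// Sketch: naive transposed convolution with one input and one output channel.
// dst must be zero-initialized, with dst_w = (src_w - 1)*stride + k_w (and the
// analogous height); every input element scatters a scaled copy of the kernel,
// mirroring dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v above.
static void conv_transpose_2d_ref(
        const float * src, int src_w, int src_h,
        const float * kernel, int k_w, int k_h,
        float * dst, int dst_w,
        int stride) {
    for (int sy = 0; sy < src_h; sy++) {
        for (int sx = 0; sx < src_w; sx++) {
            const float v = src[sy*src_w + sx];
            for (int ky = 0; ky < k_h; ky++) {
                for (int kx = 0; kx < k_w; kx++) {
                    dst[(sy*stride + ky)*dst_w + sx*stride + kx] += v * kernel[ky*k_w + kx];
                }
            }
        }
    }
}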
@@ -13202,6 +13751,60 @@ static void ggml_compute_forward_pool_2d(
     ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
 }
 
+// ggml_compute_forward_upscale
+
+static void ggml_compute_forward_upscale_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    const int scale_factor = dst->op_params[0];
+
+    // TODO: optimize
+
+    for (int i03 = 0; i03 < ne03; i03++) {
+        for (int i02 = ith; i02 < ne02; i02++) {
+            for (int m = 0; m < dst->ne[1]; m++) {
+                int i01 = m / scale_factor;
+                for (int n = 0; n < dst->ne[0]; n++) {
+                    int i00 = n / scale_factor;
+
+                    const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_upscale(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_upscale_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
 
 // ggml_compute_forward_flash_attn
 
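ggml_compute_forward_upscale_f32 above is a nearest-neighbor integer upscale: output element (n, m) copies source element (n / scale_factor, m / scale_factor), with C integer division doing the rounding. Worked example, scale_factor = 2 on a 2x2 source:

    [a b]      [a a b b]
    [c d]  ->  [a a b b]
               [c c d d]
               [c c d d]

so every source element is replicated into a scale_factor x scale_factor block.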
@@ -14327,42 +14930,43 @@ static void ggml_compute_forward_unary(
     }
 }
 
-//
+// ggml_compute_forward_get_rel_pos
 
-static void
+static void ggml_compute_forward_get_rel_pos_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst
-        const ggml_unary_op_f32_t fun) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
+        struct ggml_tensor * dst) {
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-
-    const int nc = src0->ne[0];
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
 
-
-    assert(src0->nb[0] == sizeof(float));
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
-
-
-
-
+    const int64_t w = ne1;
+
+    ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
+    ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data;
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            const int64_t pos = (w - i1 - 1) + i2;
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
+            }
+        }
     }
 }
 
-
-static void ggml_compute_forward_map_unary(
+static void ggml_compute_forward_get_rel_pos(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst
-        const ggml_unary_op_f32_t fun) {
+        struct ggml_tensor * dst) {
     switch (src0->type) {
-        case
+        case GGML_TYPE_F16:
             {
-
+                ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
             } break;
         default:
             {
@@ -14371,34 +14975,164 @@ static void ggml_compute_forward_map_unary(
             } break;
     }
 }
 
-//
+// ggml_compute_forward_add_rel_pos
 
-static void
+static void ggml_compute_forward_add_rel_pos_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor *
-
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+        const struct ggml_tensor * src2,
+        struct ggml_tensor * dst) {
 
+    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
+    if (!inplace && params->type == GGML_TASK_INIT) {
+        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
+        return;
+    }
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-
-
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
 
-
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
 
-
-
-
-
-
-
+    float * src1_data = (float *) src1->data;
+    float * src2_data = (float *) src2->data;
+    float * dst_data = (float *) dst->data;
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // total patches in dst
+    const int np = ne13;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+
+    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
+        for (int64_t i12 = 0; i12 < ne12; ++i12) {
+            for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
+                for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                    const int64_t jp0 = jp1 + i10;
+                    const float src1_e = src1_data[jp0];
+                    const float src2_e = src2_data[jp0];
+
+                    const int64_t jdh = jp0 * ne10;
+                    const int64_t jdw = jdh - (ne10 - 1) * i10;
+
+                    for (int64_t j = 0; j < ne10; ++j) {
+                        dst_data[jdh + j ] += src2_e;
+                        dst_data[jdw + j*ne10] += src1_e;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_add_rel_pos(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * src2,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_unary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_binary
+
+static void ggml_compute_forward_map_binary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+    assert(src1->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
 }
 
 
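The two ops introduced above port the decomposed relative position embeddings of the Segment Anything image encoder (the refs are inline). get_rel_pos gathers rows of a relative-position table at index (w - i1 - 1) + i2, i.e. the query/key offset shifted to be non-negative; add_rel_pos then folds the gathered height and width terms into the attention scores. A loose sketch of the intended effect, following the linked SAM code rather than the exact flattened-buffer arithmetic above (toy sizes, illustrative layout):

// Sketch: decomposed relative position bias added to attention scores.
enum { H = 4, W = 4 }; // toy query/key grid

static void add_rel_pos_ref(float attn[H][W][H][W],
                            const float rel_h[H][W][H],
                            const float rel_w[H][W][W]) {
    for (int qh = 0; qh < H; qh++)
        for (int qw = 0; qw < W; qw++)
            for (int kh = 0; kh < H; kh++)
                for (int kw = 0; kw < W; kw++)
                    attn[qh][qw][kh][kw] += rel_h[qh][qw][kh] + rel_w[qh][qw][kw];
}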
@@ -14879,6 +15613,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_CONCAT:
+            {
+                ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
+            } break;
         case GGML_OP_SILU_BACK:
             {
                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14895,6 +15633,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_MUL_MAT:
             {
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
@@ -14987,6 +15729,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+            } break;
         case GGML_OP_POOL_1D:
             {
                 ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
@@ -14995,6 +15741,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                ggml_compute_forward_upscale(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15025,6 +15775,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_unary(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_GET_REL_POS:
+            {
+                ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
+            } break;
+        case GGML_OP_ADD_REL_POS:
+            {
+                ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+            } break;
         case GGML_OP_MAP_UNARY:
             {
                 ggml_unary_op_f32_t fun;
@@ -15288,6 +16046,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         inplace);
                 }
             } break;
+        case GGML_OP_CONCAT:
+            {
+                GGML_ASSERT(false); // TODO: implement
+            } break;
         case GGML_OP_SILU_BACK:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15310,6 +16072,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_MUL_MAT:
             {
                 // https://cs231n.github.io/optimization-2/#staged
@@ -15584,6 +16350,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     const int n_dims = ((int32_t *) tensor->op_params)[1];
                     const int mode = ((int32_t *) tensor->op_params)[2];
                     const int n_ctx = ((int32_t *) tensor->op_params)[3];
+                    float freq_base;
+                    float freq_scale;
+                    float xpos_base;
+                    bool xpos_down;
+                    memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
+                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                    memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
+                    memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
+
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
                             ggml_rope_back(ctx,
@@ -15591,7 +16366,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 n_past,
                                 n_dims,
                                 mode,
-                                n_ctx
+                                n_ctx,
+                                freq_base,
+                                freq_scale,
+                                xpos_base,
+                                xpos_down),
                             inplace);
                 }
             } break;
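The backward cases above read back the same extended ROPE parameter block that the forward kernels earlier in this diff consume: four int32 slots (n_past, n_dims, mode, n_ctx) followed by four more 4-byte slots carrying freq_base, freq_scale, xpos_base and xpos_down, always moved with memcpy rather than pointer casts. A sketch of the packing side (pack_rope_params is a hypothetical helper; ggml itself does this inline when the op is built):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

// Hypothetical helper mirroring the op_params layout read back above.
static void pack_rope_params(int32_t op_params[8],
        int n_past, int n_dims, int mode, int n_ctx,
        float freq_base, float freq_scale, float xpos_base, bool xpos_down) {
    op_params[0] = n_past;
    op_params[1] = n_dims;
    op_params[2] = mode;
    op_params[3] = n_ctx;
    memcpy(op_params + 4, &freq_base,  sizeof(float));
    memcpy(op_params + 5, &freq_scale, sizeof(float));
    memcpy(op_params + 6, &xpos_base,  sizeof(float));
    memcpy(op_params + 7, &xpos_down,  sizeof(bool));
}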
@@ -15602,14 +16381,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     const int n_dims = ((int32_t *) tensor->op_params)[1];
                     const int mode = ((int32_t *) tensor->op_params)[2];
                     const int n_ctx = ((int32_t *) tensor->op_params)[3];
+                    float freq_base;
+                    float freq_scale;
+                    float xpos_base;
+                    bool xpos_down;
+                    memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
+                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                    memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
+                    memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
+
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-
+                            ggml_rope_impl(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
                                 mode,
-                                n_ctx
+                                n_ctx,
+                                freq_base,
+                                freq_scale,
+                                xpos_base,
+                                xpos_down,
+                                false),
                             inplace);
                 }
             } break;
@@ -15629,6 +16422,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_POOL_1D:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15637,6 +16434,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15878,6 +16679,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     GGML_ASSERT(false);
                 }
             } break;
+        case GGML_OP_GET_REL_POS:
+        case GGML_OP_ADD_REL_POS:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1_F32:
@@ -16382,7 +17185,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
                     size_t cur = 0;
                     if (ggml_is_quantized(node->type)) {
-                        cur =
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
@@ -16395,7 +17198,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     size_t cur = 0;
 
                     if (ggml_is_quantized(node->src[0]->type)) {
-                        cur =
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
@@ -16407,7 +17210,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     size_t cur = 0;
 
                     if (ggml_is_quantized(node->src[0]->type)) {
-                        cur =
+                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
@@ -16454,9 +17257,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_NORM:
             case GGML_OP_RMS_NORM:
             case GGML_OP_RMS_NORM_BACK:
+            case GGML_OP_GROUP_NORM:
                 {
                     n_tasks = n_threads;
                 } break;
+            case GGML_OP_CONCAT:
             case GGML_OP_MUL_MAT:
             case GGML_OP_OUT_PROD:
                 {
@@ -16490,12 +17295,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         // the threads are still spinning
                         if (node->src[0]->type != GGML_TYPE_F32) {
                             // here we need memory just for single 2D matrix from src0
-                            cur =
+                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
                         }
                     } else
 #endif
                     if (node->src[1]->type != vec_dot_type) {
-                        cur =
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                     } else {
                         cur = 0;
                     }
@@ -16524,6 +17329,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_SOFT_MAX_BACK:
             case GGML_OP_ROPE:
             case GGML_OP_ROPE_BACK:
+            case GGML_OP_ADD_REL_POS:
                 {
                     n_tasks = n_threads;
                 } break;
@@ -16598,6 +17404,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         GGML_ASSERT(false);
                     }
 
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_CONV_TRANSPOSE_2D:
+                {
+                    n_tasks = n_threads;
+
+                    const int64_t ne00 = node->src[0]->ne[0]; // W
+                    const int64_t ne01 = node->src[0]->ne[1]; // H
+                    const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+                    const int64_t ne03 = node->src[0]->ne[3]; // Channels In
+
+                    const int64_t ne10 = node->src[1]->ne[0]; // W
+                    const int64_t ne11 = node->src[1]->ne[1]; // H
+                    const int64_t ne12 = node->src[1]->ne[2]; // Channels In
+
+                    size_t cur = 0;
+                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
+                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_POOL_1D:
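The planner entry above reserves exactly the two fp16 staging buffers that the CONV_TRANSPOSE_2D kernel fills during GGML_TASK_INIT: one permuted copy of the kernel (ne00*ne01*ne02*ne03 elements) and one of the source (ne10*ne11*ne12 elements), each at sizeof(ggml_fp16_t) = 2 bytes. For example, a 3x3 kernel with 64 output and 32 input channels plus a 16x16x32 source needs 2*(3*3*64*32) + 2*(16*16*32) = 36864 + 16384 = 53248 bytes of work memory.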
@@ -16605,6 +17430,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 {
                     n_tasks = 1;
                 } break;
+            case GGML_OP_UPSCALE:
+                {
+                    n_tasks = n_threads;
+                } break;
             case GGML_OP_FLASH_ATTN:
                 {
                     n_tasks = n_threads;
@@ -16666,6 +17495,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_WIN_PART:
             case GGML_OP_WIN_UNPART:
+            case GGML_OP_GET_REL_POS:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
             case GGML_OP_MAP_CUSTOM1_F32:
@@ -16783,8 +17613,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 
                 const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
                 GGML_ASSERT(rc == 0);
+                UNUSED(rc);
             }
         }
+
         workers[0].ith = 0;
         workers[0].shared = &state_shared;
 
@@ -16900,7 +17732,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
         // compute size of intermediate results
         // TODO: does not take into account scratch buffers !!!!
         for (int i = 0; i < cgraph->n_nodes; ++i) {
-            size_eval +=
+            size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
         }
 
         // print
@@ -18301,8 +19133,8 @@ enum ggml_opt_result ggml_opt_resume(
         struct ggml_tensor * f) {
 
     // build forward + backward compute graphs
-    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) /
-    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) /
+    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
+    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
 
     struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
     struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
@@ -18561,6 +19393,1005 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 
 ////////////////////////////////////////////////////////////////////////////////
 
+struct gguf_str {
+    uint32_t n;
+    char * data;
+};
+
+static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8] = sizeof(uint8_t),
+    [GGUF_TYPE_INT8] = sizeof(int8_t),
+    [GGUF_TYPE_UINT16] = sizeof(uint16_t),
+    [GGUF_TYPE_INT16] = sizeof(int16_t),
+    [GGUF_TYPE_UINT32] = sizeof(uint32_t),
+    [GGUF_TYPE_INT32] = sizeof(int32_t),
+    [GGUF_TYPE_FLOAT32] = sizeof(float),
+    [GGUF_TYPE_BOOL] = sizeof(bool),
+    [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
+    [GGUF_TYPE_ARRAY] = 0, // undefined
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+
+static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8] = "u8",
+    [GGUF_TYPE_INT8] = "i8",
+    [GGUF_TYPE_UINT16] = "u16",
+    [GGUF_TYPE_INT16] = "i16",
+    [GGUF_TYPE_UINT32] = "u32",
+    [GGUF_TYPE_INT32] = "i32",
+    [GGUF_TYPE_FLOAT32] = "f32",
+    [GGUF_TYPE_BOOL] = "bool",
+    [GGUF_TYPE_STRING] = "str",
+    [GGUF_TYPE_ARRAY] = "arr",
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+
+union gguf_value {
+    uint8_t uint8;
+    int8_t int8;
+    uint16_t uint16;
+    int16_t int16;
+    uint32_t uint32;
+    int32_t int32;
+    float float32;
+    bool bool_;
+
+    struct gguf_str str;
+
+    struct {
+        enum gguf_type type;
+
+        uint32_t n;
+        void * data;
+    } arr;
+};
+
+struct gguf_kv {
+    struct gguf_str key;
+
+    uint32_t n_bytes; // TODO: is this actually needed?
+
+    enum gguf_type type;
+    union gguf_value value;
+};
+
+struct gguf_header {
+    uint32_t magic;
+    uint32_t version;
+    uint32_t n_tensors;
+    uint32_t n_kv;
+};
+
+struct gguf_tensor_info {
+    struct gguf_str name;
+
+    uint32_t n_dims;
+    uint32_t ne[GGML_MAX_DIMS];
+
+    enum ggml_type type;
+
+    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
+
+    // for writing API
+    const void * data;
+    size_t size;
+};
+
+struct gguf_context {
+    struct gguf_header header;
+
+    struct gguf_kv * kv;
+    struct gguf_tensor_info * infos;
+
+    size_t alignment;
+    size_t offset; // offset of `data` from beginning of file
+    size_t size; // size of `data` in bytes
+
+    //uint8_t * padding;
+    void * data;
+};
+
+static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
+    const size_t n = fread(dst, 1, size, file);
+    *offset += n;
+    return n == size;
+}
+
+static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
+    p->n = 0;
+    p->data = NULL;
+
+    bool ok = true;
+
+    // TODO: how to avoid mallocs for strings?
+    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
+    ok = ok && gguf_fread_el(file, p->data, p->n, offset);
+
+    return ok;
+}
+
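Taken together, these declarations fix the on-disk GGUF layout that the reader below walks: a fixed header, the typed key-value records, the tensor descriptors, padding up to the alignment, then the raw tensor data. Roughly:

    gguf_header   { magic "GGUF", version, n_tensors, n_kv }
    n_kv      x   { key: gguf_str, type: u32, value (scalar, string, or typed array) }
    n_tensors x   { name: gguf_str, n_dims, ne[], type, offset }
    padding to `general.alignment` (default GGUF_DEFAULT_ALIGNMENT)
    tensor data blob

Strings are length-prefixed (a uint32 count, then the bytes), which is what gguf_fread_str decodes; tensor offsets are relative to the start of the data blob, not to the file.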
+struct gguf_context * gguf_init_empty(void) {
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+
+    ctx->header.magic = GGUF_MAGIC;
+    ctx->header.version = GGUF_VERSION;
+    ctx->header.n_tensors = 0;
+    ctx->header.n_kv = 0;
+
+    ctx->kv = NULL;
+    ctx->infos = NULL;
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+    ctx->offset = 0;
+    ctx->size = 0;
+
+    ctx->data = NULL;
+
+    return ctx;
+}
+
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = fopen(fname, "rb");
+    if (!file) {
+        return NULL;
+    }
+
+    // offset from start of file
+    size_t offset = 0;
+
+    uint32_t magic = 0;
+
+    // check the magic before making allocations
+    {
+        gguf_fread_el(file, &magic, sizeof(magic), &offset);
+
+        if (magic != GGUF_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            fclose(file);
+            return NULL;
+        }
+    }
+
+    bool ok = true;
+
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+
+    // read the header
+    {
+        ctx->header.magic = magic;
+
+        ctx->kv = NULL;
+        ctx->infos = NULL;
+        ctx->data = NULL;
+
+        ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read header\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the kv pairs
+    {
+        ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+
+        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+            struct gguf_kv * kv = &ctx->kv[i];
+
+            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
+
+            ok = ok && gguf_fread_str(file, &kv->key, &offset);
+            //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
+            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
+
+            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
+
+            switch (kv->type) {
+                case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
+                case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
+                case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
+                case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
+                case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
+                case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
+                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
+                case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
+                case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
+                case GGUF_TYPE_ARRAY:
+                    {
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
+
+                        switch (kv->value.arr.type) {
+                            case GGUF_TYPE_UINT8:
+                            case GGUF_TYPE_INT8:
+                            case GGUF_TYPE_UINT16:
+                            case GGUF_TYPE_INT16:
+                            case GGUF_TYPE_UINT32:
+                            case GGUF_TYPE_INT32:
+                            case GGUF_TYPE_FLOAT32:
+                            case GGUF_TYPE_BOOL:
+                                {
+                                    kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
+                                } break;
+                            case GGUF_TYPE_STRING:
+                                {
+                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
+                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
+                                    }
+                                } break;
+                            case GGUF_TYPE_ARRAY:
+                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                        };
+                    } break;
+                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+            };
+
+            if (!ok) {
+                break;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the tensor infos
+    {
+        ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                info->ne[j] = 1;
+            }
+
+            ok = ok && gguf_fread_str(file, &info->name, &offset);
+            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
+            for (uint32_t j = 0; j < info->n_dims; ++j) {
+                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+            }
+            ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
+            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+        }
+    }
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+
+    int alignment_idx = gguf_find_key(ctx, "general.alignment");
+    if (alignment_idx != -1) {
+        ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset_pad = offset % ctx->alignment;
+
+        if (offset_pad != 0) {
+            offset += ctx->alignment - offset_pad;
+            fseek(file, offset, SEEK_SET);
+        }
+    }
+
+    // store the current file offset - this is where the data section starts
+    ctx->offset = offset;
+
+    // compute the total size of the data section, taking into account the alignment
+    {
+        ctx->size = 0;
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            const int64_t ne =
+                (int64_t) info->ne[0] *
+                (int64_t) info->ne[1] *
+                (int64_t) info->ne[2] *
+                (int64_t) info->ne[3];
+
+            if (ne % ggml_blck_size(info->type) != 0) {
+                fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                        __func__, info->name.data, ne, ggml_blck_size(info->type));
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+
+            ctx->size += GGML_PAD(size_cur, ctx->alignment);
+        }
+    }
+
+    // load the tensor data only if requested
+    if (params.ctx != NULL) {
+        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
+        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
+        // the ggml_tensor structs to the appropriate locations in the binary blob
+
+        // compute the exact size needed for the new ggml_context
+        const size_t mem_size =
+            params.no_alloc ?
+            (ctx->header.n_tensors )*ggml_tensor_overhead() :
+            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+
+        struct ggml_init_params pdata = {
+            .mem_size = mem_size,
+            .mem_buffer = NULL,
+            .no_alloc = params.no_alloc,
+        };
+
+        *params.ctx = ggml_init(pdata);
+
+        struct ggml_context * ctx_data = *params.ctx;
+
+        struct ggml_tensor * data = NULL;
+
+        if (params.no_alloc == false) {
+            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
+
+            ok = ok && data != NULL;
+
+            // read the binary blob with the tensor data
+            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
+                fclose(file);
+                ggml_free(ctx_data);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            ctx->data = data->data;
+        }
+
+        ggml_set_no_alloc(ctx_data, true);
+
+        // create the tensors
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            const int64_t ne[GGML_MAX_DIMS] = {
+                ctx->infos[i].ne[0],
+                ctx->infos[i].ne[1],
+                ctx->infos[i].ne[2],
+                ctx->infos[i].ne[3],
+            };
+
+            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
+
+            ok = ok && cur != NULL;
+
+            ggml_set_name(cur, ctx->infos[i].name.data);
+
+            if (!ok) {
+                break;
+            }
+
+            // point the data member to the appropriate location in the binary blob using the tensor infos
+            if (params.no_alloc == false) {
+                //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
+                cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
+            fclose(file);
+            ggml_free(ctx_data);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        ggml_set_no_alloc(ctx_data, params.no_alloc);
+    }
+
+    fclose(file);
+
+    return ctx;
+}
+
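A minimal consumer of the reader above, as a sketch: it assumes the gguf_* declarations live in ggml.h in this release and uses a placeholder file name, but it only calls functions defined in this diff.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        .no_alloc = false,     // also read the tensor data blob
        .ctx      = &ctx_data, // create a ggml context holding the tensors
    };

    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load gguf file\n");
        return 1;
    }

    printf("version: %d, %d tensors, %d kv pairs\n",
            gguf_get_version(ctx), gguf_get_n_tensors(ctx), gguf_get_n_kv(ctx));

    for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
        printf("  %s @ offset %zu\n", gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
    }

    gguf_free(ctx);
    ggml_free(ctx_data);
    return 0;
}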
19810
|
+
void gguf_free(struct gguf_context * ctx) {
|
19811
|
+
if (ctx == NULL) {
|
19812
|
+
return;
|
19813
|
+
}
|
19814
|
+
|
19815
|
+
if (ctx->kv) {
|
19816
|
+
// free string memory - not great..
|
19817
|
+
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
19818
|
+
struct gguf_kv * kv = &ctx->kv[i];
|
19819
|
+
|
19820
|
+
if (kv->key.data) {
|
19821
|
+
free(kv->key.data);
|
19822
|
+
}
|
19823
|
+
|
19824
|
+
if (kv->type == GGUF_TYPE_STRING) {
|
19825
|
+
if (kv->value.str.data) {
|
19826
|
+
free(kv->value.str.data);
|
19827
|
+
}
|
19828
|
+
}
|
19829
|
+
|
19830
|
+
if (kv->type == GGUF_TYPE_ARRAY) {
|
19831
|
+
if (kv->value.arr.data) {
|
19832
|
+
if (kv->value.arr.type == GGUF_TYPE_STRING) {
|
19833
|
+
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
19834
|
+
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
19835
|
+
if (str->data) {
|
19836
|
+
free(str->data);
|
19837
|
+
}
|
19838
|
+
}
|
19839
|
+
}
|
19840
|
+
free(kv->value.arr.data);
|
19841
|
+
}
|
19842
|
+
}
|
19843
|
+
}
|
19844
|
+
|
19845
|
+
GGML_ALIGNED_FREE(ctx->kv);
|
19846
|
+
}
|
19847
|
+
|
19848
|
+
if (ctx->infos) {
|
19849
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19850
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
19851
|
+
|
19852
|
+
if (info->name.data) {
|
19853
|
+
free(info->name.data);
|
19854
|
+
}
|
19855
|
+
}
|
19856
|
+
|
19857
|
+
GGML_ALIGNED_FREE(ctx->infos);
|
19858
|
+
}
|
19859
|
+
|
19860
|
+
GGML_ALIGNED_FREE(ctx);
|
19861
|
+
}
|
19862
|
+
|
19863
|
+
const char * gguf_type_name(enum gguf_type type) {
|
19864
|
+
return GGUF_TYPE_NAME[type];
|
19865
|
+
}
|
19866
|
+
|
19867
|
+
int gguf_get_version(struct gguf_context * ctx) {
|
19868
|
+
return ctx->header.version;
|
19869
|
+
}
|
19870
|
+
|
19871
|
+
size_t gguf_get_alignment(struct gguf_context * ctx) {
|
19872
|
+
return ctx->alignment;
|
19873
|
+
}
|
19874
|
+
|
19875
|
+
size_t gguf_get_data_offset(struct gguf_context * ctx) {
|
19876
|
+
return ctx->offset;
|
19877
|
+
}
|
19878
|
+
|
19879
|
+
void * gguf_get_data(struct gguf_context * ctx) {
|
19880
|
+
return ctx->data;
|
19881
|
+
}
|
19882
|
+
|
19883
|
+
int gguf_get_n_kv(struct gguf_context * ctx) {
|
19884
|
+
return ctx->header.n_kv;
|
19885
|
+
}
|
19886
|
+
|
19887
|
+
int gguf_find_key(struct gguf_context * ctx, const char * key) {
|
19888
|
+
// return -1 if key not found
|
19889
|
+
int keyfound = -1;
|
19890
|
+
|
19891
|
+
const int n_kv = gguf_get_n_kv(ctx);
|
19892
|
+
|
19893
|
+
for (int i = 0; i < n_kv; ++i) {
|
19894
|
+
if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
|
19895
|
+
keyfound = i;
|
19896
|
+
break;
|
19897
|
+
}
|
19898
|
+
}
|
19899
|
+
|
19900
|
+
return keyfound;
|
19901
|
+
}
|
19902
|
+
|
19903
|
+
const char * gguf_get_key(struct gguf_context * ctx, int i) {
|
19904
|
+
return ctx->kv[i].key.data;
|
19905
|
+
}
|
19906
|
+
|
19907
|
+
enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
|
19908
|
+
return ctx->kv[i].type;
|
19909
|
+
}
|
19910
|
+
|
19911
|
+
enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
|
19912
|
+
return ctx->kv[i].value.arr.type;
|
19913
|
+
}
|
19914
|
+
|
19915
|
+
const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
|
19916
|
+
return ctx->kv[i].value.arr.data;
|
19917
|
+
}
|
19918
|
+
|
19919
|
+
const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
|
19920
|
+
struct gguf_kv * kv = &ctx->kv[key_id];
|
19921
|
+
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
19922
|
+
return str->data;
|
19923
|
+
}
|
19924
|
+
|
19925
|
+
int gguf_get_arr_n(struct gguf_context * ctx, int i) {
|
19926
|
+
return ctx->kv[i].value.arr.n;
|
19927
|
+
}
|
19928
|
+
|
19929
|
+
uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
|
19930
|
+
return ctx->kv[i].value.uint8;
|
19931
|
+
}
|
19932
|
+
|
19933
|
+
int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
|
19934
|
+
return ctx->kv[i].value.int8;
|
19935
|
+
}
|
19936
|
+
|
19937
|
+
uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
|
19938
|
+
return ctx->kv[i].value.uint16;
|
19939
|
+
}
|
19940
|
+
|
19941
|
+
int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
|
19942
|
+
return ctx->kv[i].value.int16;
|
19943
|
+
}
|
19944
|
+
|
19945
|
+
uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
|
19946
|
+
return ctx->kv[i].value.uint32;
|
19947
|
+
}
|
19948
|
+
|
19949
|
+
int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
|
19950
|
+
return ctx->kv[i].value.int32;
|
19951
|
+
}
|
19952
|
+
|
19953
|
+
float gguf_get_val_f32(struct gguf_context * ctx, int i) {
|
19954
|
+
return ctx->kv[i].value.float32;
|
19955
|
+
}
|
19956
|
+
|
19957
|
+
bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
|
19958
|
+
return ctx->kv[i].value.bool_;
|
19959
|
+
}
|
19960
|
+
|
19961
|
+
const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
|
19962
|
+
return ctx->kv[i].value.str.data;
|
19963
|
+
}
|
19964
|
+
|
19965
|
+
int gguf_get_n_tensors(struct gguf_context * ctx) {
|
19966
|
+
return ctx->header.n_tensors;
|
19967
|
+
}
|
19968
|
+
|
19969
|
+
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
|
19970
|
+
// return -1 if tensor not found
|
19971
|
+
int tensorfound = -1;
|
19972
|
+
|
19973
|
+
const int n_tensors = gguf_get_n_tensors(ctx);
|
19974
|
+
|
19975
|
+
for (int i = 0; i < n_tensors; ++i) {
|
19976
|
+
if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
|
19977
|
+
tensorfound = i;
|
19978
|
+
break;
|
19979
|
+
}
|
19980
|
+
}
|
19981
|
+
|
19982
|
+
return tensorfound;
|
19983
|
+
}
|
19984
|
+
|
19985
|
+
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
|
19986
|
+
return ctx->infos[i].offset;
|
19987
|
+
}
|
19988
|
+
|
19989
|
+
char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
|
19990
|
+
return ctx->infos[i].name.data;
|
19991
|
+
}
|
19992
|
+
|
19993
|
+
// returns the index
|
19994
|
+
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
19995
|
+
const int idx = gguf_find_key(ctx, key);
|
19996
|
+
if (idx >= 0) {
|
19997
|
+
return idx;
|
19998
|
+
}
|
19999
|
+
|
20000
|
+
const int n_kv = gguf_get_n_kv(ctx);
|
20001
|
+
|
20002
|
+
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
|
20003
|
+
ctx->kv[n_kv].key.n = strlen(key) + 1;
|
20004
|
+
ctx->kv[n_kv].key.data = strdup(key);
|
20005
|
+
ctx->header.n_kv++;
|
20006
|
+
|
20007
|
+
return n_kv;
|
20008
|
+
}
+
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_UINT8;
+    ctx->kv[idx].value.uint8 = val;
+}
+
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type       = GGUF_TYPE_INT8;
+    ctx->kv[idx].value.int8 = val;
+}
+
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT16;
+    ctx->kv[idx].value.uint16 = val;
+}
+
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT16;
+    ctx->kv[idx].value.int16 = val;
+}
+
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT32;
+    ctx->kv[idx].value.uint32 = val;
+}
+
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT32;
+    ctx->kv[idx].value.int32 = val;
+}
+
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type          = GGUF_TYPE_FLOAT32;
+    ctx->kv[idx].value.float32 = val;
+}
+
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_BOOL;
+    ctx->kv[idx].value.bool_ = val;
+}
+
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.str.n    = strlen(val) + 1;
+    ctx->kv[idx].value.str.data = strdup(val);
+}
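
Every typed setter above resolves its key through gguf_get_or_add_key, so setting the same key twice overwrites the value in place rather than appending a duplicate. A minimal sketch of building metadata with these setters, assuming the gguf_init_empty()/gguf_free() constructors that ggml.h pairs with this API; the key names and values are illustrative only:

    #include "ggml.h"

    int main(void) {
        struct gguf_context * ctx = gguf_init_empty();

        gguf_set_val_str (ctx, "general.architecture", "llama");
        gguf_set_val_u32 (ctx, "llama.context_length", 4096);
        gguf_set_val_f32 (ctx, "llama.rope.freq_base", 10000.0f);
        gguf_set_val_bool(ctx, "general.some_flag",    true);   // hypothetical key

        // re-setting an existing key updates it in place
        gguf_set_val_u32(ctx, "llama.context_length", 2048);

        gguf_free(ctx);
        return 0;
    }
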
+
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = type;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
+    memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
+}
+
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
+    for (int i = 0; i < n; i++) {
+        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
+        str->n    = strlen(data[i]) + 1;
+        str->data = strdup(data[i]);
+    }
+}
+
+// set or add KV pairs from another context
+void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
+    for (uint32_t i = 0; i < src->header.n_kv; i++) {
+        switch (src->kv[i].type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);    break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);     break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16);   break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16);    break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
+                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
+                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
+                        }
+                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
+                        free(data);
+                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
+                        GGML_ASSERT(false && "nested arrays not supported");
+                    } else {
+                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
+                    }
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+        }
+    }
+}
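
Because gguf_set_kv funnels everything through the typed setters, the destination context ends up owning fresh strdup'd strings and malloc'd arrays rather than aliasing src. One plausible use is carrying metadata from a loaded file into a fresh writer context, sketched here on the assumption that gguf_init_from_file() and gguf_init_empty() are available with the signatures ggml.h declares for this API, and that model.gguf is a stand-in filename:

    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * src = gguf_init_from_file("model.gguf", params);
    struct gguf_context * dst = gguf_init_empty();

    gguf_set_kv(dst, src); // deep-copies every pair; a nested array would assert

    gguf_free(src);
    gguf_free(dst);
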
+
+void gguf_add_tensor(
+             struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    const int idx = ctx->header.n_tensors;
+    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+
+    ctx->infos[idx].name.n    = strlen(tensor->name) + 1;
+    ctx->infos[idx].name.data = strdup(tensor->name);
+
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        ctx->infos[idx].ne[i] = 1;
+    }
+
+    ctx->infos[idx].n_dims = tensor->n_dims;
+    for (int i = 0; i < tensor->n_dims; i++) {
+        ctx->infos[idx].ne[i] = tensor->ne[i];
+    }
+
+    ctx->infos[idx].type   = tensor->type;
+    ctx->infos[idx].offset = 0;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
+
+    if (ctx->header.n_tensors > 0) {
+        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
+    }
+
+    ctx->header.n_tensors++;
+}
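
Offsets are assigned cumulatively: each tensor starts at the previous tensor's offset plus its byte size rounded up to ctx->alignment. The standalone sketch below replays that arithmetic, assuming GGML_PAD is the usual round-up-to-multiple macro and that 32 is the default GGUF alignment; the tensor sizes are made up:

    #include <stdio.h>
    #include <stddef.h>

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main(void) {
        const size_t alignment = 32;             // assumed default alignment
        const size_t sizes[]   = { 6, 100, 64 }; // hypothetical tensor byte sizes

        size_t offset = 0;
        for (size_t i = 0; i < 3; ++i) {
            printf("tensor %zu: offset %4zu, size %4zu\n", i, offset, sizes[i]);
            offset += GGML_PAD(sizes[i], alignment);
        }
        // prints offsets 0, 32, 160: 6 rounds up to 32 and 100 rounds up to 128
        return 0;
    }
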
+
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n,   sizeof(val->n), 1,      file);
+//    fwrite(val->data, sizeof(char),   val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    }
+    buf->offset += sizeof(val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    }
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val, el_size);
+    }
+    buf->offset += el_size;
+}
20247
|
+
|
20248
|
+
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
|
20249
|
+
// write header
|
20250
|
+
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
|
20251
|
+
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
|
20252
|
+
gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
|
20253
|
+
gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
|
20254
|
+
|
20255
|
+
// write key-value pairs
|
20256
|
+
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
20257
|
+
struct gguf_kv * kv = &ctx->kv[i];
|
20258
|
+
|
20259
|
+
gguf_bwrite_str(buf, &kv->key);
|
20260
|
+
gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
|
20261
|
+
|
20262
|
+
switch (kv->type) {
|
20263
|
+
case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
|
20264
|
+
case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
|
20265
|
+
case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
|
20266
|
+
case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
|
20267
|
+
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
|
20268
|
+
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
|
20269
|
+
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
|
20270
|
+
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
|
20271
|
+
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
|
20272
|
+
case GGUF_TYPE_ARRAY:
|
20273
|
+
{
|
20274
|
+
gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
|
20275
|
+
gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
|
20276
|
+
|
20277
|
+
switch (kv->value.arr.type) {
|
20278
|
+
case GGUF_TYPE_UINT8:
|
20279
|
+
case GGUF_TYPE_INT8:
|
20280
|
+
case GGUF_TYPE_UINT16:
|
20281
|
+
case GGUF_TYPE_INT16:
|
20282
|
+
case GGUF_TYPE_UINT32:
|
20283
|
+
case GGUF_TYPE_INT32:
|
20284
|
+
case GGUF_TYPE_FLOAT32:
|
20285
|
+
case GGUF_TYPE_BOOL:
|
20286
|
+
{
|
20287
|
+
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
20288
|
+
} break;
|
20289
|
+
case GGUF_TYPE_STRING:
|
20290
|
+
{
|
20291
|
+
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
20292
|
+
gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
|
20293
|
+
}
|
20294
|
+
} break;
|
20295
|
+
case GGUF_TYPE_ARRAY:
|
20296
|
+
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
20297
|
+
};
|
20298
|
+
} break;
|
20299
|
+
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
20300
|
+
};
|
20301
|
+
}
|
20302
|
+
|
20303
|
+
// write tensor infos
|
20304
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
20305
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
20306
|
+
|
20307
|
+
gguf_bwrite_str(buf, &info->name);
|
20308
|
+
gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
|
20309
|
+
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
20310
|
+
gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
|
20311
|
+
}
|
20312
|
+
gguf_bwrite_el(buf, &info->type, sizeof(info->type));
|
20313
|
+
gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
|
20314
|
+
}
|
20315
|
+
|
20316
|
+
// we require the data section to be aligned, so take into account any padding
|
20317
|
+
{
|
20318
|
+
const size_t offset = buf->offset;
|
20319
|
+
const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
|
20320
|
+
|
20321
|
+
if (offset_pad != offset) {
|
20322
|
+
uint8_t pad = 0;
|
20323
|
+
for (size_t i = 0; i < offset_pad - offset; ++i) {
|
20324
|
+
gguf_bwrite_el(buf, &pad, sizeof(pad));
|
20325
|
+
}
|
20326
|
+
}
|
20327
|
+
}
|
20328
|
+
|
20329
|
+
if (only_meta) {
|
20330
|
+
return;
|
20331
|
+
}
|
20332
|
+
|
20333
|
+
size_t offset = 0;
|
20334
|
+
|
20335
|
+
// write tensor data
|
20336
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
20337
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
20338
|
+
|
20339
|
+
const size_t size = info->size;
|
20340
|
+
const size_t size_pad = GGML_PAD(size, ctx->alignment);
|
20341
|
+
|
20342
|
+
gguf_bwrite_el(buf, info->data, size);
|
20343
|
+
|
20344
|
+
if (size_pad != size) {
|
20345
|
+
uint8_t pad = 0;
|
20346
|
+
for (size_t j = 0; j < size_pad - size; ++j) {
|
20347
|
+
gguf_bwrite_el(buf, &pad, sizeof(pad));
|
20348
|
+
}
|
20349
|
+
}
|
20350
|
+
|
20351
|
+
GGML_ASSERT(offset == info->offset);
|
20352
|
+
|
20353
|
+
offset += size_pad;
|
20354
|
+
}
|
20355
|
+
}
+
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
+
+    fclose(file);
+}
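
Putting the writer together: the whole file is rendered into the growable gguf_buf first and emitted with a single fwrite, so a partially written file can only result from that final write failing. An end-to-end sketch, assuming the usual ggml_init/ggml_new_tensor_1d/ggml_set_name/ggml_set_f32 helpers and gguf_init_empty()/gguf_free() from ggml.h; all sizes, names, and keys are illustrative:

    #include "ggml.h"

    int main(void) {
        // a tiny ggml context just to materialize one tensor
        struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * gctx = ggml_init(ip);

        struct ggml_tensor * t = ggml_new_tensor_1d(gctx, GGML_TYPE_F32, 8);
        ggml_set_name(t, "example.weight");
        ggml_set_f32(t, 1.0f); // fill with ones

        struct gguf_context * ctx = gguf_init_empty();
        gguf_set_val_str(ctx, "general.architecture", "llama"); // illustrative key
        gguf_add_tensor (ctx, t);

        // only_meta = false: header + KV pairs + tensor infos + padding + tensor data
        gguf_write_to_file(ctx, "out.gguf", false);

        gguf_free(ctx);
        ggml_free(gctx);
        return 0;
    }
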
+
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
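
These two functions form a measure-then-copy pair: the sizing pass avoids large allocations because gguf_buf_init(0) leaves data == NULL, and the gguf_bwrite_* helpers above then only advance offset without copying any bytes. A typical caller, sketched for an already populated struct gguf_context * ctx with <stdlib.h> included:

    const size_t n_meta = gguf_get_meta_size(ctx);
    void * meta = malloc(n_meta);

    // copies exactly n_meta bytes: header, KV pairs, tensor infos and padding
    gguf_get_meta_data(ctx, meta);

    // ... use `meta`, e.g. to prepend metadata to a separately written data blob ...
    free(meta);
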
+
+////////////////////////////////////////////////////////////////////////////////
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
     return 1;