llama_cpp 0.3.7 → 0.4.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -213,8 +213,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 error_desc = "insufficient memory";
                 break;
         }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
-                __func__, error_desc, size/(1024.0*1024.0));
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
@@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);

 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_I8] = {
+        .type_name                = "i8",
+        .blck_size                = 1,
+        .type_size                = sizeof(int8_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I16] = {
+        .type_name                = "i16",
+        .blck_size                = 1,
+        .type_size                = sizeof(int16_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_I32] = {
+        .type_name                = "i32",
+        .blck_size                = 1,
+        .type_size                = sizeof(int32_t),
+        .is_quantized             = false,
+    },
     [GGML_TYPE_F32] = {
+        .type_name                = "f32",
+        .blck_size                = 1,
+        .type_size                = sizeof(float),
+        .is_quantized             = false,
         .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type             = GGML_TYPE_F32,
     },
     [GGML_TYPE_F16] = {
+        .type_name                = "f16",
+        .blck_size                = 1,
+        .type_size                = sizeof(ggml_fp16_t),
+        .is_quantized             = false,
         .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
         .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_F16,
     },
     [GGML_TYPE_Q4_0] = {
+        .type_name                = "q4_0",
+        .blck_size                = QK4_0,
+        .type_size                = sizeof(block_q4_0),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
         .from_float               = quantize_row_q4_0,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
+        .type_name                = "q4_1",
+        .blck_size                = QK4_1,
+        .type_size                = sizeof(block_q4_1),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float               = quantize_row_q4_1,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q5_0] = {
+        .type_name                = "q5_0",
+        .blck_size                = QK5_0,
+        .type_size                = sizeof(block_q5_0),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
         .from_float               = quantize_row_q5_0,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q5_1] = {
+        .type_name                = "q5_1",
+        .blck_size                = QK5_1,
+        .type_size                = sizeof(block_q5_1),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
         .from_float               = quantize_row_q5_1,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q8_0] = {
+        .type_name                = "q8_0",
+        .blck_size                = QK8_0,
+        .type_size                = sizeof(block_q8_0),
+        .is_quantized             = true,
         .to_float                 = dequantize_row_q8_0,
         .from_float               = quantize_row_q8_0,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q8_1] = {
+        .type_name                = "q8_1",
+        .blck_size                = QK8_1,
+        .type_size                = sizeof(block_q8_1),
+        .is_quantized             = true,
         .from_float               = quantize_row_q8_1,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
 #ifdef GGML_USE_K_QUANTS
     [GGML_TYPE_Q2_K] = {
+        .type_name                = "q2_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q2_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
         .from_float               = quantize_row_q2_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q3_K] = {
+        .type_name                = "q3_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q3_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
         .from_float               = quantize_row_q3_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q4_K] = {
+        .type_name                = "q4_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q4_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
         .from_float               = quantize_row_q4_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q5_K] = {
+        .type_name                = "q5_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q5_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
         .from_float               = quantize_row_q5_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q6_K] = {
+        .type_name                = "q6_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q6_K),
+        .is_quantized             = true,
         .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
         .from_float               = quantize_row_q6_K,
         .from_float_reference     = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q8_K] = {
+        .type_name                = "q8_K",
+        .blck_size                = QK_K,
+        .type_size                = sizeof(block_q8_K),
+        .is_quantized             = true,
         .from_float               = quantize_row_q8_K,
     }
 #endif
 };

 // For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
-    GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return type_traits[i];
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+    GGML_ASSERT(type < GGML_TYPE_COUNT);
+    return type_traits[type];
 }
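Taken together, these hunks fold what used to be four parallel per-type lookup tables (block size, type size, name, quantized flag; their removal appears further down) into the single `type_traits` array, so every property of a type is defined in one initializer. A minimal sketch of reading the consolidated traits through the public test hook the last hunk fixes up (the `main` harness is illustrative, not part of the gem):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // one lookup now yields every per-type property
        ggml_type_traits_t tr = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
        printf("%s: blck_size=%d type_size=%zu quantized=%d\n",
               tr.type_name, tr.blck_size, tr.type_size, tr.is_quantized);
        return 0;
    }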
@@ -3481,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

-static const float GELU_COEF_A    = 0.044715f;
-static const float GELU_QUICK_COEF    = -1.702f;
-static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
+static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;

 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -3652,95 +3725,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
 // data types
 //

-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = QK4_0,
-    [GGML_TYPE_Q4_1] = QK4_1,
-    [GGML_TYPE_Q5_0] = QK5_0,
-    [GGML_TYPE_Q5_1] = QK5_1,
-    [GGML_TYPE_Q8_0] = QK8_0,
-    [GGML_TYPE_Q8_1] = QK8_1,
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = QK_K,
-    [GGML_TYPE_Q3_K] = QK_K,
-    [GGML_TYPE_Q4_K] = QK_K,
-    [GGML_TYPE_Q5_K] = QK_K,
-    [GGML_TYPE_Q6_K] = QK_K,
-    [GGML_TYPE_Q8_K] = QK_K,
-#endif
-    [GGML_TYPE_I8]   = 1,
-    [GGML_TYPE_I16]  = 1,
-    [GGML_TYPE_I32]  = 1,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
-    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
-    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
-    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
-    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
-    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
-    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
-#endif
-    [GGML_TYPE_I8]   = sizeof(int8_t),
-    [GGML_TYPE_I16]  = sizeof(int16_t),
-    [GGML_TYPE_I32]  = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_Q2_K] = "q2_K",
-    [GGML_TYPE_Q3_K] = "q3_K",
-    [GGML_TYPE_Q4_K] = "q4_K",
-    [GGML_TYPE_Q5_K] = "q5_K",
-    [GGML_TYPE_Q6_K] = "q6_K",
-    [GGML_TYPE_Q8_K] = "q8_K",
-    [GGML_TYPE_I8]   = "i8",
-    [GGML_TYPE_I16]  = "i16",
-    [GGML_TYPE_I32]  = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_Q2_K] = true,
-    [GGML_TYPE_Q3_K] = true,
-    [GGML_TYPE_Q4_K] = true,
-    [GGML_TYPE_Q5_K] = true,
-    [GGML_TYPE_Q6_K] = true,
-    [GGML_TYPE_Q8_K] = true,
-    [GGML_TYPE_I8]   = false,
-    [GGML_TYPE_I16]  = false,
-    [GGML_TYPE_I32]  = false,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
 static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
@@ -3760,10 +3744,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ARGMAX",
     "REPEAT",
     "REPEAT_BACK",
+    "CONCAT",
     "SILU_BACK",
     "NORM",
     "RMS_NORM",
     "RMS_NORM_BACK",
+    "GROUP_NORM",

     "MUL_MAT",
     "OUT_PROD",
@@ -3789,20 +3775,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CLAMP",
     "CONV_1D",
     "CONV_2D",
+    "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
+    "UPSCALE",

     "FLASH_ATTN",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
     "WIN_PART",
     "WIN_UNPART",
+    "GET_REL_POS",
+    "ADD_REL_POS",

     "UNARY",

     "MAP_UNARY",
     "MAP_BINARY",

+    "MAP_CUSTOM1_F32",
+    "MAP_CUSTOM2_F32",
+    "MAP_CUSTOM3_F32",
+
     "MAP_CUSTOM1",
     "MAP_CUSTOM2",
     "MAP_CUSTOM3",
@@ -3811,7 +3805,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3832,10 +3826,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "argmax(x)",
     "repeat(x)",
     "repeat_back(x)",
+    "concat(x, y)",
     "silu_back(x)",
     "norm(x)",
     "rms_norm(x)",
     "rms_norm_back(x)",
+    "group_norm(x)",

     "X*Y",
     "X*Y",
@@ -3861,20 +3857,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "clamp(x)",
     "conv_1d(x)",
     "conv_2d(x)",
+    "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
+    "upscale(x)",

     "flash_attn(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
     "win_part(x)",
     "win_unpart(x)",
+    "get_rel_pos(x)",
+    "add_rel_pos(x)",

     "unary(x)",

     "f(x)",
     "f(x,y)",

+    "custom_f32(x)",
+    "custom_f32(x,y)",
+    "custom_f32(x,y,z)",
+
     "custom(x)",
     "custom(x,y)",
     "custom(x,y,z)",
@@ -3883,7 +3887,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -3913,8 +3917,10 @@ static void ggml_setup_op_has_task_pass(void) {
         p[GGML_OP_DIAG_MASK_ZERO        ] = true;
         p[GGML_OP_CONV_1D               ] = true;
         p[GGML_OP_CONV_2D               ] = true;
+        p[GGML_OP_CONV_TRANSPOSE_2D     ] = true;
         p[GGML_OP_FLASH_ATTN_BACK       ] = true;
         p[GGML_OP_CROSS_ENTROPY_LOSS    ] = true;
+        p[GGML_OP_ADD_REL_POS           ] = true;
    }

    { // FINALIZE
@@ -4110,29 +4116,37 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part

-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+}
+
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }

 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
 }

 int ggml_blck_size(enum ggml_type type) {
-    return GGML_BLCK_SIZE[type];
+    return type_traits[type].blck_size;
 }

 size_t ggml_type_size(enum ggml_type type) {
-    return GGML_TYPE_SIZE[type];
+    return type_traits[type].type_size;
 }

 float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
 }

 const char * ggml_type_name(enum ggml_type type) {
-    return GGML_TYPE_NAME[type];
+    return type_traits[type].type_name;
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+    return type_traits[type].is_quantized;
 }

 const char * ggml_op_name(enum ggml_op op) {
@@ -4144,7 +4158,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
 }

 size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return GGML_TYPE_SIZE[tensor->type];
+    return ggml_type_size(tensor->type);
 }

 static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4196,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
         (t0->ne[3] == t1->ne[3]);
 }

-bool ggml_is_quantized(enum ggml_type type) {
-    return GGML_IS_QUANTIZED[type];
-}
-
 enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     enum ggml_type wtype = GGML_TYPE_COUNT;

@@ -4223,8 +4233,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4233,7 +4243,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4248,7 +4258,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
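With the table in place, `ggml_blck_size`, `ggml_type_size`, `ggml_type_name` and `ggml_is_quantized` become one-line lookups, and the byte-size helpers are rewritten on top of them. A hedged sketch of the arithmetic these accessors encode (the helper function and the numbers are illustrative, not part of the diff):

    #include "ggml.h"

    // bytes for one row of ne0 elements: ne0 * type_size / blck_size
    // e.g. q4_0: blck_size = 32, type_size = sizeof(block_q4_0) = 18,
    // so a 4096-wide row takes 4096/32 * 18 = 2304 bytes
    static size_t row_size(enum ggml_type type, int64_t ne0) {
        return ne0*ggml_type_size(type)/ggml_blck_size(type);
    }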
@@ -4567,7 +4577,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     size_t data_size = 0;

     if (data == NULL && !ctx->no_alloc) {
-        data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+        data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
         for (int i = 1; i < n_dims; i++) {
             data_size *= ne[i];
         }
@@ -4622,8 +4632,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         result->ne[i] = ne[i];
     }

-    result->nb[0] = GGML_TYPE_SIZE[type];
-    result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+    result->nb[0] = ggml_type_size(type);
+    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
@@ -5545,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
         is_node = true;
     }

-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);

     result->op   = GGML_OP_REPEAT;
@@ -5587,6 +5593,30 @@ struct ggml_tensor * ggml_repeat_back(
     return result;
 }

+// ggml_concat
+
+struct ggml_tensor* ggml_concat(
+    struct ggml_context* ctx,
+    struct ggml_tensor* a,
+    struct ggml_tensor* b) {
+    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+
+    result->op = GGML_OP_CONCAT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_abs

 struct ggml_tensor * ggml_abs(
@@ -5755,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
 static struct ggml_tensor * ggml_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        float eps,
         bool inplace) {
     bool is_node = false;

@@ -5765,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-    // TODO: maybe store epsilon here?
+    ggml_set_op_params(result, &eps, sizeof(eps));

     result->op   = GGML_OP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5776,16 +5807,20 @@ static struct ggml_tensor * ggml_norm_impl(

 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_norm_impl(ctx, a, false);
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, false);
 }

 struct ggml_tensor * ggml_norm_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_norm_impl(ctx, a, true);
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, true);
 }

+// ggml_rms_norm
+
 static struct ggml_tensor * ggml_rms_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -5822,6 +5857,8 @@ struct ggml_tensor * ggml_rms_norm_inplace(
     return ggml_rms_norm_impl(ctx, a, eps, true);
 }

+// ggml_rms_norm_back
+
 struct ggml_tensor * ggml_rms_norm_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -5843,6 +5880,44 @@ struct ggml_tensor * ggml_rms_norm_back(
     return result;
 }

+// ggml_group_norm
+
+static struct ggml_tensor * ggml_group_norm_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_groups,
+        bool inplace) {
+
+    bool is_node = false;
+    if (!inplace && (a->grad)) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_GROUP_NORM;
+    result->op_params[0] = n_groups;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL; // TODO: maybe store epsilon here?
+
+    return result;
+}
+
+struct ggml_tensor * ggml_group_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_groups) {
+    return ggml_group_norm_impl(ctx, a, n_groups, false);
+}
+
+struct ggml_tensor * ggml_group_norm_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_groups) {
+    return ggml_group_norm_impl(ctx, a, n_groups, true);
+}

 // ggml_mul_mat

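With these hunks, layer normalization takes its epsilon from the caller instead of a hard-coded constant, and the new group-normalization operator normalizes over groups of channels along dimension 2. A hedged usage sketch (`ctx` and `x` are assumed to exist; the values are illustrative):

    // epsilon is now an explicit argument (it used to be fixed inside ggml)
    struct ggml_tensor * ln = ggml_norm(ctx, x, 1e-5f);

    // normalize x over 32 channel groups; note the forward pass further
    // down still uses a fixed eps = 1e-6f internally for GROUP_NORM
    struct ggml_tensor * gn = ggml_group_norm(ctx, x, 32);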
@@ -6711,6 +6786,8 @@ static struct ggml_tensor * ggml_rope_impl(
         int n_ctx,
         float freq_base,
         float freq_scale,
+        float xpos_base,
+        bool xpos_down,
         bool inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6721,9 +6798,11 @@ static struct ggml_tensor * ggml_rope_impl(

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-    int32_t params[6] = { n_past, n_dims, mode, n_ctx };
+    int32_t params[8] = { n_past, n_dims, mode, n_ctx };
     memcpy(params + 4, &freq_base,  sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params + 7, &xpos_down,  sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_ROPE;
@@ -6740,7 +6819,7 @@ struct ggml_tensor * ggml_rope(
         int n_dims,
         int mode,
         int n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
 }

 struct ggml_tensor * ggml_rope_inplace(
@@ -6750,7 +6829,7 @@ struct ggml_tensor * ggml_rope_inplace(
         int n_dims,
         int mode,
         int n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
 }

 struct ggml_tensor * ggml_rope_custom(
@@ -6762,7 +6841,7 @@ struct ggml_tensor * ggml_rope_custom(
         int n_ctx,
         float freq_base,
         float freq_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
 }

 struct ggml_tensor * ggml_rope_custom_inplace(
@@ -6774,7 +6853,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         int n_ctx,
         float freq_base,
         float freq_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
+}
+
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int n_past,
+        int n_dims,
+        float base,
+        bool down) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
 }

 // ggml_rope_back
@@ -6785,7 +6874,11 @@ struct ggml_tensor * ggml_rope_back(
         int n_past,
         int n_dims,
         int mode,
-        int n_ctx) {
+        int n_ctx,
+        float freq_base,
+        float freq_scale,
+        float xpos_base,
+        bool xpos_down) {
     GGML_ASSERT(n_past >= 0);
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");

@@ -6797,7 +6890,11 @@ struct ggml_tensor * ggml_rope_back(

     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

-    int32_t params[] = { n_past, n_dims, mode, n_ctx };
+    int32_t params[8] = { n_past, n_dims, mode, n_ctx };
+    memcpy(params + 4, &freq_base,  sizeof(float));
+    memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params + 7, &xpos_down,  sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));

     result->op   = GGML_OP_ROPE_BACK;
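The op-params array grows from 6 to 8 int32 slots so `xpos_base` and `xpos_down` can travel with the existing RoPE parameters; every pre-existing wrapper passes `0.0f, false`, which keeps standard RoPE unchanged. A hedged sketch of the new entry point (the 512.0f length scale and the query/key down-flag convention are assumptions for illustration, not mandated by the diff):

    // plain RoPE, unchanged by this release (xpos_base = 0 disables zeta scaling)
    cur = ggml_rope_inplace(ctx, cur, n_past, n_rot, 0, 0);

    // xPos RoPE: down = true inverts the zeta factor, e.g. for keys vs queries
    q = ggml_rope_xpos_inplace(ctx, q, n_past, n_rot, 512.0f, false);
    k = ggml_rope_xpos_inplace(ctx, k, n_past, n_rot, 512.0f, true);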
@@ -6904,6 +7001,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
     return result;
 }

+// ggml_conv_1d_ph
+
+struct ggml_tensor* ggml_conv_1d_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int s,
+        int d) {
+    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+}
+
 // ggml_conv_2d

 struct ggml_tensor * ggml_conv_2d(
@@ -6944,17 +7052,59 @@ struct ggml_tensor * ggml_conv_2d(

 }

-// ggml_conv_1d_ph
+// ggml_conv_2d_sk_p0

-struct ggml_tensor * ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_2d_sk_p0(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        int s,
-        int d) {
-    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+        struct ggml_tensor * b) {
+    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+}
+
+// ggml_conv_2d_s1_ph
+
+struct ggml_tensor * ggml_conv_2d_s1_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
 }

+// ggml_conv_transpose_2d_p0
+
+static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+    return (ins - 1) * s - 2 * p + ks;
+}
+
+struct ggml_tensor * ggml_conv_transpose_2d_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int stride) {
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = {
+        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
+        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
+        a->ne[2], b->ne[3],
+    };
+
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    result->op = GGML_OP_CONV_TRANSPOSE_2D;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = ggml_new_i32(ctx, stride);
+
+    return result;
+}

 // ggml_pool_*

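`ggml_calc_conv_transpose_output_size` is the standard transposed-convolution size formula, out = (in - 1)*s - 2*p + k. A hedged usage sketch of the new op (the shapes are made-up values; the F16 kernel type follows the assert in the forward pass further down):

    // kernel a: KW x KH x Cout x Cin, input b: W x H x Cin x N
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,  4,  4,  64, 128);
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 128,   1);

    // (16 - 1)*2 - 2*0 + 4 = 34, so y is 34 x 34 x 64 x 1
    struct ggml_tensor * y = ggml_conv_transpose_2d_p0(ctx, a, b, 2);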
@@ -7032,6 +7182,40 @@ struct ggml_tensor * ggml_pool_2d(
     return result;
 }

+// ggml_upscale
+
+static struct ggml_tensor * ggml_upscale_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int scale_factor) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] * scale_factor,
+            a->ne[1] * scale_factor,
+            a->ne[2], a->ne[3]);
+
+    result->op = GGML_OP_UPSCALE;
+    result->op_params[0] = scale_factor;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_upscale(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int scale_factor) {
+    return ggml_upscale_impl(ctx, a, scale_factor);
+}
+
 // ggml_flash_attn

 struct ggml_tensor * ggml_flash_attn(
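`ggml_upscale` multiplies the first two dimensions by an integer factor and leaves channels and batch untouched. A one-line usage sketch (the shape is illustrative):

    // feat: 24 x 24 x 256 x 1  ->  up: 48 x 48 x 256 x 1
    struct ggml_tensor * up = ggml_upscale(ctx, feat, 2);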
@@ -7230,6 +7414,87 @@ struct ggml_tensor * ggml_win_unpart(
     return result;
 }

+// ggml_get_rel_pos
+
+struct ggml_tensor * ggml_get_rel_pos(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int qh,
+        int kh) {
+    GGML_ASSERT(qh == kh);
+    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
+
+    result->op = GGML_OP_GET_REL_POS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
+
+    return result;
+}
+
+// ggml_add_rel_pos
+
+static struct ggml_tensor * ggml_add_rel_pos_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * pw,
+        struct ggml_tensor * ph,
+        bool inplace) {
+    GGML_ASSERT(ggml_are_same_shape(pw, ph));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(pw));
+    GGML_ASSERT(ggml_is_contiguous(ph));
+    GGML_ASSERT(ph->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->ne[3] == a->ne[2]);
+    GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
+    GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || pw->grad || ph->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
+
+    result->op = GGML_OP_ADD_REL_POS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = pw;
+    result->src[2] = ph;
+
+    return result;
+}
+
+
+struct ggml_tensor * ggml_add_rel_pos(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * pw,
+        struct ggml_tensor * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
+}
+
+struct ggml_tensor * ggml_add_rel_pos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * pw,
+        struct ggml_tensor * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
+}
+
 // gmml_unary

 static struct ggml_tensor * ggml_unary_impl(
@@ -7745,7 +8010,7 @@ static void ggml_compute_forward_dup_same_cont(
         memcpy(
             ((char *)  dst->data + ie0*nb0),
             ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+            (ie1 - ie0) * ggml_type_size(src0->type));
     }

 }
@@ -7779,7 +8044,7 @@ static void ggml_compute_forward_dup_f16(

     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +8102,7 @@ static void ggml_compute_forward_dup_f16(
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;

                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;

                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8315,7 @@ static void ggml_compute_forward_dup_f32(

     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8354,7 @@ static void ggml_compute_forward_dup_f32(
                 ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;

                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;

                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8766,7 @@ static void ggml_compute_forward_add_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;

     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));

     // dst cannot be transposed or permuted
@@ -8775,7 +9040,7 @@ static void ggml_compute_forward_add1_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;

     // we don't support permuted src0
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));

     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 <= nb1);
@@ -9137,6 +9402,8 @@ static void ggml_compute_forward_mul(
     const struct ggml_tensor * src0,
     const struct ggml_tensor * src1,
     struct ggml_tensor * dst) {
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
@@ -9731,6 +9998,72 @@ static void ggml_compute_forward_repeat_back(
     }
 }

+// ggml_compute_forward_concat
+
+static void ggml_compute_forward_concat_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    const struct ggml_tensor * src1,
+    struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2++) {
+            if (i2 < ne02) { // src0
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            } // src1
+            else {
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat(
+    const struct ggml_compute_params* params,
+    const struct ggml_tensor* src0,
+    const struct ggml_tensor* src1,
+    struct ggml_tensor* dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_concat_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_abs

 static void ggml_compute_forward_abs_f32(
@@ -10285,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(

     GGML_TENSOR_UNARY_OP_LOCALS;

-    const float eps = 1e-5f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));

     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10334,6 +10668,8 @@ static void ggml_compute_forward_norm(
     }
 }

+// ggml_compute_forward_group_rms_norm
+
 static void ggml_compute_forward_rms_norm_f32(
     const struct ggml_compute_params * params,
     const struct ggml_tensor * src0,
@@ -10398,7 +10734,6 @@ static void ggml_compute_forward_rms_norm(
     }
 }

-
 static void ggml_compute_forward_rms_norm_back_f32(
     const struct ggml_compute_params * params,
     const struct ggml_tensor * src0,
@@ -10572,16 +10907,106 @@ static void ggml_compute_forward_rms_norm_back(
     }
 }

-// ggml_compute_forward_mul_mat
+// ggml_compute_forward_group_norm

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    //const int64_t ne00 = src0->ne[0];
+static void ggml_compute_forward_group_norm_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    const float eps = 1e-6f; // TODO: make this a parameter
+
+    // TODO: optimize
+
+    int n_channels = src0->ne[2];
+    int n_groups = dst->op_params[0];
+    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+    for (int i = ith; i < n_groups; i+=nth) {
+        int start = i * n_channels_per_group;
+        int end = start + n_channels_per_group;
+        if (end > n_channels) {
+            end = n_channels;
+        }
+        int step = end - start;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            ggml_float sum = 0.0;
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        sum += (ggml_float)x[i00];
+                    }
+                }
+            }
+            float mean = sum / (ne00 * ne01 * step);
+            ggml_float sum2 = 0.0;
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        float v = x[i00] - mean;
+                        y[i00] = v;
+                        sum2 += (ggml_float)(v * v);
+                    }
+                }
+            }
+            float variance = sum2 / (ne00 * ne01 * step);
+            const float scale = 1.0f / sqrtf(variance + eps);
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+                    ggml_vec_scale_f32(ne00, y, scale);
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_group_norm(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_group_norm_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_mul_mat
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+// helper function to determine if it is better to use BLAS or not
+// for large matrices, BLAS is faster
+static bool ggml_compute_forward_mul_mat_use_blas(
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];

     const int64_t ne10 = src1->ne[0];
@@ -10629,7 +11054,7 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(ne3 == ne13);

     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));

     // dst cannot be transposed or permuted
@@ -10638,6 +11063,10 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);

+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

@@ -10657,11 +11086,6 @@ static void ggml_compute_forward_mul_mat(

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        // ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
         if (params->ith != 0) {
             return;
         }
@@ -10674,12 +11098,16 @@ static void ggml_compute_forward_mul_mat(
             return;
         }

-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+        for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                // broadcast src0 into src1 across 2nd,3rd dimension
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const void  * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);

-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -10687,7 +11115,7 @@ static void ggml_compute_forward_mul_mat(

                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((char *) x + i01*nb01, wdata + id, ne00);
+                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }

@@ -10712,7 +11140,7 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10732,7 +11160,7 @@ static void ggml_compute_forward_mul_mat(
     }

     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

     const int64_t nr0 = ne01;           // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows
@@ -10767,10 +11195,6 @@ static void ggml_compute_forward_mul_mat(
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);

-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
     // block-tiling attempt
     const int64_t blck_0 = 16;
     const int64_t blck_1 = 16;
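Hoisting the broadcast factors to the top of the function lets both the BLAS path and the plain path share them: each src1 slice (i12, i13) is multiplied against src0 slice (i12/r2, i13/r3), so one src0 matrix serves several src1 matrices without being copied. A tiny self-contained demo of the slice mapping (the shapes are illustrative):

    #include <stdio.h>

    // slice mapping for ne02 = 2 src0 slices and ne12 = 6 src1 slices
    int main(void) {
        const long ne02 = 2, ne12 = 6;
        const long r2 = ne12/ne02; // 3: each src0 slice is reused 3 times
        for (long i12 = 0; i12 < ne12; i12++) {
            printf("src1 slice %ld -> src0 slice %ld\n", i12, i12/r2);
        }
        return 0; // right-hand side prints 0,0,0,1,1,1
    }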
@@ -11205,7 +11629,7 @@ static void ggml_compute_forward_get_rows_q(

     assert( dst->ne[0] == nc);
     assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
+    assert(src0->nb[0] == ggml_type_size(type));

     for (int i = 0; i < nr; ++i) {
         const int r = ((int32_t *) src1->data)[i];
@@ -11926,7 +12350,6 @@ static void ggml_compute_forward_alibi(
     }
 }

-
 // ggml_compute_forward_clamp

 static void ggml_compute_forward_clamp_f32(
@@ -12015,12 +12438,18 @@ static void ggml_compute_forward_rope_f32(
|
|
12015
12438
|
float freq_base;
|
12016
12439
|
float freq_scale;
|
12017
12440
|
|
12441
|
+
// these two only relevant for xPos RoPE:
|
12442
|
+
float xpos_base;
|
12443
|
+
bool xpos_down;
|
12444
|
+
|
12018
12445
|
const int n_past = ((int32_t *) dst->op_params)[0];
|
12019
12446
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
12020
12447
|
const int mode = ((int32_t *) dst->op_params)[2];
|
12021
12448
|
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
12022
12449
|
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
12023
12450
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
12451
|
+
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
|
12452
|
+
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
|
12024
12453
|
|
12025
12454
|
assert(n_past >= 0);
|
12026
12455
|
|
@@ -12092,6 +12521,9 @@ static void ggml_compute_forward_rope_f32(
|
|
12092
12521
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
12093
12522
|
const float cos_theta = cosf(theta);
|
12094
12523
|
const float sin_theta = sinf(theta);
|
12524
|
+
// zeta scaling for xPos only:
|
12525
|
+
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
|
12526
|
+
if (xpos_down) zeta = 1.0f / zeta;
|
12095
12527
|
|
12096
12528
|
theta *= theta_scale;
|
12097
12529
|
|
@@ -12101,11 +12533,11 @@ static void ggml_compute_forward_rope_f32(
|
|
12101
12533
|
const float x0 = src[0];
|
12102
12534
|
const float x1 = src[1];
|
12103
12535
|
|
12104
|
-
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
12105
|
-
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
12536
|
+
dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
|
12537
|
+
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
|
12106
12538
|
}
|
12107
12539
|
} else {
|
12108
|
-
// TODO: this
|
12540
|
+
// TODO: this might be wrong for ne0 != n_dims - need double check
|
12109
12541
|
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
12110
12542
|
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
12111
12543
|
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
@@ -12234,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                 }
             } else {
-                // TODO: this
+                // TODO: this might be wrong for ne0 != n_dims - need double check
                 // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
                 for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                     for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12296,9 +12728,21 @@ static void ggml_compute_forward_rope_back_f32(
     // dx = rope_back(dy, src1)
     // src0 is dy, src1 contains options
 
+    float freq_base;
+    float freq_scale;
+
+    // these two only relevant for xPos RoPE:
+    float xpos_base;
+    bool  xpos_down;
+
     const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode   = ((int32_t *) dst->op_params)[2];
+    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
+    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
 
     assert(n_past >= 0);
 
@@ -12324,7 +12768,7 @@ static void ggml_compute_forward_rope_back_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
 
@@ -12335,12 +12779,15 @@ static void ggml_compute_forward_rope_back_f32(
             if (ir++ < ir0) continue;
             if (ir   > ir1) break;
 
-            float theta = (float)p;
+            float theta = freq_scale * (float)p;
 
             if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
+                    // zeta scaling for xPos only:
+                    float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+                    if (xpos_down) zeta = 1.0f / zeta;
 
                     theta *= theta_scale;
 
@@ -12350,8 +12797,8 @@ static void ggml_compute_forward_rope_back_f32(
                     const float dy0 = dy[0];
                     const float dy1 = dy[1];
 
-                    dx[0] =   dy0*cos_theta + dy1*sin_theta;
-                    dx[1] = - dy0*sin_theta + dy1*cos_theta;
+                    dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
+                    dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
                 }
             } else {
                 for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
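
The forward op rotates each (x0, x1) pair by theta and scales by zeta; the backward lines here apply the transposed rotation, reusing the same zeta factor. In matrix form (a sketch using the code's own symbols):

\[
\begin{pmatrix} y_0 \\ y_1 \end{pmatrix}
= \zeta \begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix}
\begin{pmatrix} x_0 \\ x_1 \end{pmatrix},
\qquad
\begin{pmatrix} dx_0 \\ dx_1 \end{pmatrix}
= \zeta \begin{pmatrix} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{pmatrix}
\begin{pmatrix} dy_0 \\ dy_1 \end{pmatrix}.
\]
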
@@ -13044,6 +13491,108 @@ static void ggml_compute_forward_conv_2d(
     }
 }
 
+// ggml_compute_forward_conv_transpose_2d
+
+static void ggml_compute_forward_conv_transpose_2d(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * opt0,
+              struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02*ne03;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        memset(params->wdata, 0, params->wsize);
+
+        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+                    for (int64_t i01 = 0; i01 < ne01; i01++) {
+                        for (int64_t i00 = 0; i00 < ne00; i00++) {
+                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
+                        }
+                    }
+                }
+            }
+        }
+
+        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            for (int i12 = 0; i12 < ne12; i12++) {
+                for (int i11 = 0; i11 < ne11; i11++) {
+                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
+                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+                    for (int i10 = 0; i10 < ne10; i10++) {
+                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
+                    }
+                }
+            }
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int32_t stride = ((const int32_t*)(opt0->data))[0];
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata     = (ggml_fp16_t *) params->wdata + 0;
+    ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk;
+
+    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+        for (int i11 = 0; i11 < ne11; i11++) {
+            for (int i10 = 0; i10 < ne10; i10++) {
+                const int i1n = i11*ne10*ne12 + i10*ne12;
+                for (int i01 = 0; i01 < ne01; i01++) {
+                    for (int i00 = 0; i00 < ne00; i00++) {
+                        float v = 0;
+                        ggml_vec_dot_f16(ne03, &v,
+                                (ggml_fp16_t *) wdata_src + i1n,
+                                (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03);
+
+                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
+                    }
+                }
+            }
+        }
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
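
The new op splits work across threads by output channel using the partitioning idiom that appears throughout ggml (and again in add_rel_pos below). The idiom in isolation (a sketch; function name is illustrative):

// Sketch of ggml's thread partitioning: np items split into ceil(np/nth)
// sized chunks; thread ith processes the half-open range [ip0, ip1).
#define MIN(a, b) ((a) < (b) ? (a) : (b))

static void thread_range(int np, int ith, int nth, int * ip0, int * ip1) {
    const int dp = (np + nth - 1)/nth; // items per thread, rounded up
    *ip0 = dp*ith;
    *ip1 = MIN(*ip0 + dp, np);         // the last thread may get fewer items
}
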
@@ -13202,6 +13751,60 @@ static void ggml_compute_forward_pool_2d(
     ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
 }
 
+// ggml_compute_forward_upscale
+
+static void ggml_compute_forward_upscale_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    const int scale_factor = dst->op_params[0];
+
+    // TODO: optimize
+
+    for (int i03 = 0; i03 < ne03; i03++) {
+        for (int i02 = ith; i02 < ne02; i02++) {
+            for (int m = 0; m < dst->ne[1]; m++) {
+                int i01 = m / scale_factor;
+                for (int n = 0; n < dst->ne[0]; n++) {
+                    int i00 = n / scale_factor;
+
+                    const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_upscale(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_upscale_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
 
 // ggml_compute_forward_flash_attn
 
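
The upscale kernel is nearest-neighbour: every output coordinate maps back to a source coordinate by integer division with the scale factor, so each source pixel is replicated scale x scale times. The index math in isolation (a sketch):

// Sketch of the nearest-neighbour mapping used by upscale: output (n, m)
// reads input (n/scale, m/scale).
static void upscale_src_index(int n, int m, int scale_factor,
                              int * i00, int * i01) {
    *i00 = n / scale_factor; // source column
    *i01 = m / scale_factor; // source row
}
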
@@ -14327,42 +14930,43 @@ static void ggml_compute_forward_unary(
     }
 }
 
-// ggml_compute_forward_map_unary
+// ggml_compute_forward_get_rel_pos
 
-static void ggml_compute_forward_map_unary_f32(
+static void ggml_compute_forward_get_rel_pos_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
+        struct ggml_tensor * dst) {
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    const int64_t w = ne1;
+
+    ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
+    ggml_fp16_t * dst_data  = (ggml_fp16_t *) dst->data;
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            const int64_t pos = (w - i1 - 1) + i2;
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
+            }
+        }
     }
 }
 
-
-static void ggml_compute_forward_map_unary(
+static void ggml_compute_forward_get_rel_pos(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
+        struct ggml_tensor * dst) {
     switch (src0->type) {
-        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+                ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
            } break;
        default:
            {
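
get_rel_pos gathers rows of a precomputed relative-position embedding: for query index i2 and key index i1 it reads row (w - i1 - 1) + i2, i.e. the relative offset shifted to be non-negative, following the Segment Anything image encoder referenced in the code. A sketch of the lookup for one (query, key) pair (names are illustrative):

#include <stdint.h>

// Sketch: rel points to a (2*w - 1) x C table of relative-position
// embeddings; q and k are query/key indices in [0, w). The row index is the
// shifted relative distance, as in ggml_compute_forward_get_rel_pos_f16.
static const float * rel_pos_row(const float * rel, int64_t w, int64_t C,
                                 int64_t q, int64_t k) {
    const int64_t pos = (w - k - 1) + q; // in [0, 2*w - 2]
    return rel + pos*C;
}
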
@@ -14371,34 +14975,164 @@ static void ggml_compute_forward_map_unary(
     }
 }
 
-// ggml_compute_forward_map_binary
+// ggml_compute_forward_add_rel_pos
 
-static void ggml_compute_forward_map_binary_f32(
+static void ggml_compute_forward_add_rel_pos_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+        const struct ggml_tensor * src2,
+        struct ggml_tensor * dst) {
 
+    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
+    if (!inplace && params->type == GGML_TASK_INIT) {
+        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
+        return;
+    }
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
 
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])),
-                (float *) ((char *) src1->data + i*(src1->nb[1])));
-    }
+    float * src1_data = (float *) src1->data;
+    float * src2_data = (float *) src2->data;
+    float * dst_data  = (float *) dst->data;
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // total patches in dst
+    const int np = ne13;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+
+    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
+        for (int64_t i12 = 0; i12 < ne12; ++i12) {
+            for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
+                for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                    const int64_t jp0  = jp1 + i10;
+                    const float src1_e = src1_data[jp0];
+                    const float src2_e = src2_data[jp0];
+
+                    const int64_t jdh = jp0 * ne10;
+                    const int64_t jdw = jdh - (ne10 - 1) * i10;
+
+                    for (int64_t j = 0; j < ne10; ++j) {
+                        dst_data[jdh + j     ] += src2_e;
+                        dst_data[jdw + j*ne10] += src1_e;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_add_rel_pos(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * src2,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_unary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_binary
+
+static void ggml_compute_forward_map_binary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+    assert(src1->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
 }
 
 
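
add_rel_pos folds the decomposed relative-position terms into an attention matrix in place: for each source element the height term fans out over a contiguous run of the flattened buffer and the width term over a strided run, which is exactly what the jdh/jdw arithmetic does. A small sketch of that scatter for one source element (helper name is illustrative):

#include <stdint.h>

// Sketch of the add_rel_pos scatter, as in ggml_compute_forward_add_rel_pos_f32:
// src2_e is added along a contiguous run, src1_e along a strided run.
static void add_rel_pos_one(float * dst, int64_t ne10, int64_t jp0, int64_t i10,
                            float src1_e, float src2_e) {
    const int64_t jdh = jp0 * ne10;              // start of the contiguous run
    const int64_t jdw = jdh - (ne10 - 1) * i10;  // start of the strided run
    for (int64_t j = 0; j < ne10; ++j) {
        dst[jdh + j]        += src2_e;
        dst[jdw + j * ne10] += src1_e;
    }
}
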
@@ -14879,6 +15613,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_CONCAT:
+            {
+                ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
+            } break;
         case GGML_OP_SILU_BACK:
             {
                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14895,6 +15633,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_MUL_MAT:
             {
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
@@ -14987,6 +15729,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+            } break;
         case GGML_OP_POOL_1D:
            {
                 ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
@@ -14995,6 +15741,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                ggml_compute_forward_upscale(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15025,6 +15775,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_unary(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_GET_REL_POS:
+            {
+                ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
+            } break;
+        case GGML_OP_ADD_REL_POS:
+            {
+                ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+            } break;
         case GGML_OP_MAP_UNARY:
             {
                 ggml_unary_op_f32_t fun;
@@ -15288,6 +16046,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             inplace);
                 }
             } break;
+        case GGML_OP_CONCAT:
+            {
+                GGML_ASSERT(false); // TODO: implement
+            } break;
         case GGML_OP_SILU_BACK:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15310,6 +16072,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_MUL_MAT:
             {
                 // https://cs231n.github.io/optimization-2/#staged
@@ -15584,6 +16350,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     const int n_dims = ((int32_t *) tensor->op_params)[1];
                     const int mode   = ((int32_t *) tensor->op_params)[2];
                     const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                    float freq_base;
+                    float freq_scale;
+                    float xpos_base;
+                    bool  xpos_down;
+                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
                             ggml_rope_back(ctx,
@@ -15591,7 +16366,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 n_past,
                                 n_dims,
                                 mode,
-                                n_ctx),
+                                n_ctx,
+                                freq_base,
+                                freq_scale,
+                                xpos_base,
+                                xpos_down),
                             inplace);
                 }
             } break;
@@ -15602,14 +16381,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     const int n_dims = ((int32_t *) tensor->op_params)[1];
                     const int mode   = ((int32_t *) tensor->op_params)[2];
                     const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                    float freq_base;
+                    float freq_scale;
+                    float xpos_base;
+                    bool  xpos_down;
+                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-                            ggml_rope(ctx,
+                            ggml_rope_impl(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
                                 mode,
-                                n_ctx),
+                                n_ctx,
+                                freq_base,
+                                freq_scale,
+                                xpos_base,
+                                xpos_down,
+                                false),
                             inplace);
                 }
             } break;
@@ -15629,6 +16422,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_POOL_1D:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15637,6 +16434,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15878,6 +16679,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     GGML_ASSERT(false);
                 }
             } break;
+        case GGML_OP_GET_REL_POS:
+        case GGML_OP_ADD_REL_POS:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1_F32:
@@ -16382,7 +17185,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
                 size_t cur = 0;
                 if (ggml_is_quantized(node->type)) {
-                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
@@ -16395,7 +17198,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 size_t cur = 0;
 
                 if (ggml_is_quantized(node->src[0]->type)) {
-                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
@@ -16407,7 +17210,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 size_t cur = 0;
 
                 if (ggml_is_quantized(node->src[0]->type)) {
-                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
@@ -16454,9 +17257,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_RMS_NORM_BACK:
+        case GGML_OP_GROUP_NORM:
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_CONCAT:
         case GGML_OP_MUL_MAT:
         case GGML_OP_OUT_PROD:
             {
@@ -16490,12 +17295,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                                  //       the threads are still spinning
                     if (node->src[0]->type != GGML_TYPE_F32) {
                         // here we need memory just for single 2D matrix from src0
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                        cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
                     }
                 } else
 #endif
                 if (node->src[1]->type != vec_dot_type) {
-                    cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
+                    cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                 } else {
                     cur = 0;
                 }
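
The MUL_MAT scratch sizing reserves room to convert src1 into the dot-product type of src0 (e.g. quantizing f32 activations into q8 blocks before the dot product). The computation with the quantities named (a sketch; helper name is illustrative):

#include <stddef.h>
#include <stdint.h>

// Sketch of the MUL_MAT work-buffer sizing above: nelements of src1
// converted to vec_dot_type, which packs blck_size elements per
// type_size bytes.
static size_t mul_mat_wsize(int64_t nelements_src1,
                            size_t type_size_vec_dot, int blck_size_vec_dot) {
    return type_size_vec_dot * (size_t)(nelements_src1 / blck_size_vec_dot);
}
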
@@ -16524,6 +17329,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
+        case GGML_OP_ADD_REL_POS:
             {
                 n_tasks = n_threads;
             } break;
@@ -16598,6 +17404,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     GGML_ASSERT(false);
                 }
 
+                work_size = MAX(work_size, cur);
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                n_tasks = n_threads;
+
+                const int64_t ne00 = node->src[0]->ne[0]; // W
+                const int64_t ne01 = node->src[0]->ne[1]; // H
+                const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+                const int64_t ne03 = node->src[0]->ne[3]; // Channels In
+
+                const int64_t ne10 = node->src[1]->ne[0]; // W
+                const int64_t ne11 = node->src[1]->ne[1]; // H
+                const int64_t ne12 = node->src[1]->ne[2]; // Channels In
+
+                size_t cur = 0;
+                cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
+                cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+
                 work_size = MAX(work_size, cur);
             } break;
         case GGML_OP_POOL_1D:
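
The two `cur +=` terms match the two staging buffers that CONV_TRANSPOSE_2D fills during GGML_TASK_INIT: a permuted fp16 copy of the kernel and an fp16 copy of the source, laid end to end in `wdata`. The same formula in isolation (a sketch; ggml_fp16_t is a 2-byte type, so sizeof(uint16_t) stands in for it):

#include <stddef.h>
#include <stdint.h>

// Sketch of the CONV_TRANSPOSE_2D work-size formula above.
static size_t conv_transpose_2d_wsize(int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
                                      int64_t ne10, int64_t ne11, int64_t ne12) {
    size_t cur = 0;
    cur += sizeof(uint16_t)*ne00*ne01*ne02*ne03; // permuted fp16 kernel staging
    cur += sizeof(uint16_t)*ne10*ne11*ne12;      // fp16 copy of the source
    return cur;
}
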
@@ -16605,6 +17430,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             {
                 n_tasks = 1;
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 n_tasks = n_threads;
@@ -16666,6 +17495,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             } break;
         case GGML_OP_WIN_PART:
         case GGML_OP_WIN_UNPART:
+        case GGML_OP_GET_REL_POS:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1_F32:
@@ -16783,8 +17613,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 
             const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             GGML_ASSERT(rc == 0);
+            UNUSED(rc);
         }
     }
+
     workers[0].ith = 0;
     workers[0].shared = &state_shared;
 
@@ -16900,7 +17732,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     // compute size of intermediate results
     // TODO: does not take into account scratch buffers !!!!
     for (int i = 0; i < cgraph->n_nodes; ++i) {
-        size_eval += ggml_nbytes(cgraph->nodes[i]);
+        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
    }
 
     // print
@@ -18301,8 +19133,8 @@ enum ggml_opt_result ggml_opt_resume(
         struct ggml_tensor * f) {
 
     // build forward + backward compute graphs
-    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
-    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0));
+    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
+    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
 
     struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
     struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
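
The gfbuf/gbbuf expression is a ceiling division: the smallest number of I32 elements whose total size covers `sizeof(struct ggml_cgraph)`. The idiom in isolation (a sketch; helper name is illustrative):

#include <stddef.h>

// Sketch of the sizing idiom above: ceil(nbytes/elem_size) without
// floating point.
static size_t ceil_div_elems(size_t nbytes, size_t elem_size) {
    return nbytes / elem_size + (nbytes % elem_size ? 1 : 0);
}
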
@@ -18561,6 +19393,1005 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 
 ////////////////////////////////////////////////////////////////////////////////
 
+struct gguf_str {
+    uint32_t n;
+    char * data;
+};
+
+static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
+    [GGUF_TYPE_INT8]    = sizeof(int8_t),
+    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
+    [GGUF_TYPE_INT16]   = sizeof(int16_t),
+    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
+    [GGUF_TYPE_INT32]   = sizeof(int32_t),
+    [GGUF_TYPE_FLOAT32] = sizeof(float),
+    [GGUF_TYPE_BOOL]    = sizeof(bool),
+    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
+    [GGUF_TYPE_ARRAY]   = 0, // undefined
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+
+static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8]   = "u8",
+    [GGUF_TYPE_INT8]    = "i8",
+    [GGUF_TYPE_UINT16]  = "u16",
+    [GGUF_TYPE_INT16]   = "i16",
+    [GGUF_TYPE_UINT32]  = "u32",
+    [GGUF_TYPE_INT32]   = "i32",
+    [GGUF_TYPE_FLOAT32] = "f32",
+    [GGUF_TYPE_BOOL]    = "bool",
+    [GGUF_TYPE_STRING]  = "str",
+    [GGUF_TYPE_ARRAY]   = "arr",
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+
+union gguf_value {
+    uint8_t  uint8;
+    int8_t   int8;
+    uint16_t uint16;
+    int16_t  int16;
+    uint32_t uint32;
+    int32_t  int32;
+    float    float32;
+    bool     bool_;
+
+    struct gguf_str str;
+
+    struct {
+        enum gguf_type type;
+
+        uint32_t n;
+        void * data;
+    } arr;
+};
+
+struct gguf_kv {
+    struct gguf_str key;
+
+    uint32_t n_bytes; // TODO: is this actually needed?
+
+    enum  gguf_type  type;
+    union gguf_value value;
+};
+
+struct gguf_header {
+    uint32_t magic;
+    uint32_t version;
+    uint32_t n_tensors;
+    uint32_t n_kv;
+};
+
+struct gguf_tensor_info {
+    struct gguf_str name;
+
+    uint32_t n_dims;
+    uint32_t ne[GGML_MAX_DIMS];
+
+    enum ggml_type type;
+
+    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
+
+    // for writing API
+    const void * data;
+    size_t size;
+};
+
+struct gguf_context {
+    struct gguf_header header;
+
+    struct gguf_kv          * kv;
+    struct gguf_tensor_info * infos;
+
+    size_t alignment;
+    size_t offset;    // offset of `data` from beginning of file
+    size_t size;      // size of `data` in bytes
+
+    //uint8_t * padding;
+    void * data;
+};
+
+static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
+    const size_t n = fread(dst, 1, size, file);
+    *offset += n;
+    return n == size;
+}
+
+static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
+    p->n = 0;
+    p->data = NULL;
+
+    bool ok = true;
+
+    // TODO: how to avoid mallocs for strings?
+    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
+    ok = ok && gguf_fread_el(file, p->data, p->n, offset);
+
+    return ok;
+}
+
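
Together with `gguf_fread_el`, the string reader fixes the GGUF v1 wire format: a header of four uint32 fields (magic, version, tensor count, kv count), then length-prefixed keys and typed values, then tensor infos, then the aligned data blob. A hedged sketch reading just the fixed-size header with plain stdio, assuming a little-endian host (helper name is illustrative):

#include <stdint.h>
#include <stdio.h>

// Minimal sketch: read the four GGUF header fields in file order,
// mirroring what gguf_fread_el does over struct gguf_header.
static int read_gguf_header(FILE * f, uint32_t out[4]) {
    // out[0]=magic, out[1]=version, out[2]=n_tensors, out[3]=n_kv
    return fread(out, sizeof(uint32_t), 4, f) == 4;
}
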
+struct gguf_context * gguf_init_empty(void) {
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+
+    ctx->header.magic     = GGUF_MAGIC;
+    ctx->header.version   = GGUF_VERSION;
+    ctx->header.n_tensors = 0;
+    ctx->header.n_kv      = 0;
+
+    ctx->kv    = NULL;
+    ctx->infos = NULL;
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+    ctx->offset    = 0;
+    ctx->size      = 0;
+
+    ctx->data = NULL;
+
+    return ctx;
+}
+
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = fopen(fname, "rb");
+    if (!file) {
+        return NULL;
+    }
+
+    // offset from start of file
+    size_t offset = 0;
+
+    uint32_t magic = 0;
+
+    // check the magic before making allocations
+    {
+        gguf_fread_el(file, &magic, sizeof(magic), &offset);
+
+        if (magic != GGUF_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            fclose(file);
+            return NULL;
+        }
+    }
+
+    bool ok = true;
+
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+
+    // read the header
+    {
+        ctx->header.magic = magic;
+
+        ctx->kv    = NULL;
+        ctx->infos = NULL;
+        ctx->data  = NULL;
+
+        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read header\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the kv pairs
+    {
+        ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+
+        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+            struct gguf_kv * kv = &ctx->kv[i];
+
+            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
+
+            ok = ok && gguf_fread_str(file, &kv->key, &offset);
+            //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
+            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
+
+            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
+
+            switch (kv->type) {
+                case GGUF_TYPE_UINT8:   ok = ok && gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
+                case GGUF_TYPE_INT8:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
+                case GGUF_TYPE_UINT16:  ok = ok && gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
+                case GGUF_TYPE_INT16:   ok = ok && gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
+                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
+                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
+                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
+                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
+                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,     &offset); break;
+                case GGUF_TYPE_ARRAY:
+                    {
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);
+
+                        switch (kv->value.arr.type) {
+                            case GGUF_TYPE_UINT8:
+                            case GGUF_TYPE_INT8:
+                            case GGUF_TYPE_UINT16:
+                            case GGUF_TYPE_INT16:
+                            case GGUF_TYPE_UINT32:
+                            case GGUF_TYPE_INT32:
+                            case GGUF_TYPE_FLOAT32:
+                            case GGUF_TYPE_BOOL:
+                                {
+                                    kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
+                                } break;
+                            case GGUF_TYPE_STRING:
+                                {
+                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
+                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
+                                    }
+                                } break;
+                            case GGUF_TYPE_ARRAY:
+                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                        };
+                    } break;
+                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+            };
+
+            if (!ok) {
+                break;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the tensor infos
+    {
+        ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                info->ne[j] = 1;
+            }
+
+            ok = ok && gguf_fread_str(file, &info->name, &offset);
+            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
+            for (uint32_t j = 0; j < info->n_dims; ++j) {
+                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+            }
+            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),   &offset);
+            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+        }
+    }
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+
+    int alignment_idx = gguf_find_key(ctx, "general.alignment");
+    if (alignment_idx != -1) {
+        ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset_pad = offset % ctx->alignment;
+
+        if (offset_pad != 0) {
+            offset += ctx->alignment - offset_pad;
+            fseek(file, offset, SEEK_SET);
+        }
+    }
+
+    // store the current file offset - this is where the data section starts
+    ctx->offset = offset;
+
+    // compute the total size of the data section, taking into account the alignment
+    {
+        ctx->size = 0;
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            const int64_t ne =
+                (int64_t) info->ne[0] *
+                (int64_t) info->ne[1] *
+                (int64_t) info->ne[2] *
+                (int64_t) info->ne[3];
+
+            if (ne % ggml_blck_size(info->type) != 0) {
+                fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                        __func__, info->name.data, ne, ggml_blck_size(info->type));
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+
+            ctx->size += GGML_PAD(size_cur, ctx->alignment);
+        }
+    }
+
+    // load the tensor data only if requested
+    if (params.ctx != NULL) {
+        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
+        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
+        // the ggml_tensor structs to the appropriate locations in the binary blob
+
+        // compute the exact size needed for the new ggml_context
+        const size_t mem_size =
+            params.no_alloc ?
+            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
+            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+
+        struct ggml_init_params pdata = {
+            .mem_size   = mem_size,
+            .mem_buffer = NULL,
+            .no_alloc   = params.no_alloc,
+        };
+
+        *params.ctx = ggml_init(pdata);
+
+        struct ggml_context * ctx_data = *params.ctx;
+
+        struct ggml_tensor * data = NULL;
+
+        if (params.no_alloc == false) {
+            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
+
+            ok = ok && data != NULL;
+
+            // read the binary blob with the tensor data
+            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
+                fclose(file);
+                ggml_free(ctx_data);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            ctx->data = data->data;
+        }
+
+        ggml_set_no_alloc(ctx_data, true);
+
+        // create the tensors
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            const int64_t ne[GGML_MAX_DIMS] = {
+                ctx->infos[i].ne[0],
+                ctx->infos[i].ne[1],
+                ctx->infos[i].ne[2],
+                ctx->infos[i].ne[3],
+            };
+
+            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
+
+            ok = ok && cur != NULL;
+
+            ggml_set_name(cur, ctx->infos[i].name.data);
+
+            if (!ok) {
+                break;
+            }
+
+            // point the data member to the appropriate location in the binary blob using the tensor infos
+            if (params.no_alloc == false) {
+                //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
+                cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
+            fclose(file);
+            ggml_free(ctx_data);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        ggml_set_no_alloc(ctx_data, params.no_alloc);
+    }
+
+    fclose(file);
+
+    return ctx;
+}
+
+void gguf_free(struct gguf_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    if (ctx->kv) {
+        // free string memory - not great..
+        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+            struct gguf_kv * kv = &ctx->kv[i];
+
+            if (kv->key.data) {
+                free(kv->key.data);
+            }
+
+            if (kv->type == GGUF_TYPE_STRING) {
+                if (kv->value.str.data) {
+                    free(kv->value.str.data);
+                }
+            }
+
+            if (kv->type == GGUF_TYPE_ARRAY) {
+                if (kv->value.arr.data) {
+                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                        for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                            if (str->data) {
+                                free(str->data);
+                            }
+                        }
+                    }
+                    free(kv->value.arr.data);
+                }
+            }
+        }
+
+        GGML_ALIGNED_FREE(ctx->kv);
+    }
+
+    if (ctx->infos) {
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            if (info->name.data) {
+                free(info->name.data);
+            }
+        }
+
+        GGML_ALIGNED_FREE(ctx->infos);
+    }
+
+    GGML_ALIGNED_FREE(ctx);
+}
+
+const char * gguf_type_name(enum gguf_type type) {
+    return GGUF_TYPE_NAME[type];
+}
+
+int gguf_get_version(struct gguf_context * ctx) {
+    return ctx->header.version;
+}
+
+size_t gguf_get_alignment(struct gguf_context * ctx) {
+    return ctx->alignment;
+}
+
+size_t gguf_get_data_offset(struct gguf_context * ctx) {
+    return ctx->offset;
+}
+
+void * gguf_get_data(struct gguf_context * ctx) {
+    return ctx->data;
+}
+
+int gguf_get_n_kv(struct gguf_context * ctx) {
+    return ctx->header.n_kv;
+}
+
+int gguf_find_key(struct gguf_context * ctx, const char * key) {
+    // return -1 if key not found
+    int keyfound = -1;
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    for (int i = 0; i < n_kv; ++i) {
+        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
+            keyfound = i;
+            break;
+        }
+    }
+
+    return keyfound;
+}
+
+const char * gguf_get_key(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].key.data;
+}
+
+enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].type;
+}
+
+enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.type;
+}
+
+const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.data;
+}
+
+const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+    struct gguf_kv * kv = &ctx->kv[key_id];
+    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
+    return str->data;
+}
+
+int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.n;
+}
+
+uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint8;
+}
+
+int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int8;
+}
+
+uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint16;
+}
+
+int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int16;
+}
+
+uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint32;
+}
+
+int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int32;
+}
+
+float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.float32;
+}
+
+bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.bool_;
+}
+
+const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.str.data;
+}
+
+int gguf_get_n_tensors(struct gguf_context * ctx) {
+    return ctx->header.n_tensors;
+}
+
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int tensorfound = -1;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensorfound = i;
+            break;
+        }
+    }
+
+    return tensorfound;
+}
+
+size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+    return ctx->infos[i].offset;
+}
+
+char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+    return ctx->infos[i].name.data;
+}
+
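
The read-side API above is enough to inspect a file end to end. A hedged usage sketch (error handling trimmed; "model.gguf" is a placeholder path, and the gguf declarations are assumed to come from ggml.h as in this version):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_context * meta = NULL;
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ &meta };

    // no_alloc: create empty tensors in `meta`, skip reading the data blob
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (!ctx) return 1;

    for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
        printf("kv %d: %s\n", i, gguf_get_key(ctx, i));
    }
    for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
        printf("tensor %d: %s @ %zu\n", i, gguf_get_tensor_name(ctx, i),
               gguf_get_tensor_offset(ctx, i));
    }

    gguf_free(ctx);
    ggml_free(meta);
    return 0;
}
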
19993
|
+
// returns the index
|
19994
|
+
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
19995
|
+
const int idx = gguf_find_key(ctx, key);
|
19996
|
+
if (idx >= 0) {
|
19997
|
+
return idx;
|
19998
|
+
}
|
19999
|
+
|
20000
|
+
const int n_kv = gguf_get_n_kv(ctx);
|
20001
|
+
|
20002
|
+
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
|
20003
|
+
ctx->kv[n_kv].key.n = strlen(key) + 1;
|
20004
|
+
ctx->kv[n_kv].key.data = strdup(key);
|
20005
|
+
ctx->header.n_kv++;
|
20006
|
+
|
20007
|
+
return n_kv;
|
20008
|
+
}
|
20009
|
+
|
20010
|
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_UINT8;
+    ctx->kv[idx].value.uint8 = val;
+}
+
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type       = GGUF_TYPE_INT8;
+    ctx->kv[idx].value.int8 = val;
+}
+
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT16;
+    ctx->kv[idx].value.uint16 = val;
+}
+
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT16;
+    ctx->kv[idx].value.int16 = val;
+}
+
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT32;
+    ctx->kv[idx].value.uint32 = val;
+}
+
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT32;
+    ctx->kv[idx].value.int32 = val;
+}
+
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type          = GGUF_TYPE_FLOAT32;
+    ctx->kv[idx].value.float32 = val;
+}
+
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_BOOL;
+    ctx->kv[idx].value.bool_ = val;
+}
+
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.str.n    = strlen(val) + 1;
+    ctx->kv[idx].value.str.data = strdup(val);
+}
+
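Every scalar setter funnels through gguf_get_or_add_key, so writing the same key twice updates the existing entry instead of appending a duplicate. A minimal sketch of building a metadata block — gguf_init_empty is assumed from earlier in this file, and the key names are illustrative only:

    struct gguf_context * gctx = gguf_init_empty();

    gguf_set_val_str (gctx, "general.architecture",   "llama");
    gguf_set_val_u32 (gctx, "example.context_length", 2048);
    gguf_set_val_u32 (gctx, "example.context_length", 4096);    // overwrites 2048
    gguf_set_val_f32 (gctx, "example.rope_freq_base", 10000.0f);
    gguf_set_val_bool(gctx, "example.add_bos_token",  true);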
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = type;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
+    memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
+}
+
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
+    for (int i = 0; i < n; i++) {
+        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
+        str->n    = strlen(data[i]) + 1;
+        str->data = strdup(data[i]);
+    }
+}
+
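gguf_set_arr_str deep-copies each element with strdup and gguf_set_arr_data memcpy's the raw values, so in both cases the caller keeps ownership of its input. Continuing the sketch above (key names again illustrative):

    const char * tokens[3] = { "<unk>", "<s>", "</s>" };
    gguf_set_arr_str(gctx, "example.tokenizer.tokens", tokens, 3);

    const int32_t ids[3] = { 0, 1, 2 };
    gguf_set_arr_data(gctx, "example.tokenizer.ids", GGUF_TYPE_INT32, ids, 3);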
+// set or add KV pairs from another context
+void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
+    for (uint32_t i = 0; i < src->header.n_kv; i++) {
+        switch (src->kv[i].type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);    break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);     break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16);   break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16);    break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
+                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
+                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
+                        }
+                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
+                        free(data);
+                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
+                        GGML_ASSERT(false && "nested arrays not supported");
+                    } else {
+                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
+                    }
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+        }
+    }
+}
+
+void gguf_add_tensor(
+             struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    const int idx = ctx->header.n_tensors;
+    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+
+    ctx->infos[idx].name.n    = strlen(tensor->name) + 1;
+    ctx->infos[idx].name.data = strdup(tensor->name);
+
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        ctx->infos[idx].ne[i] = 1;
+    }
+
+    ctx->infos[idx].n_dims = tensor->n_dims;
+    for (int i = 0; i < tensor->n_dims; i++) {
+        ctx->infos[idx].ne[i] = tensor->ne[i];
+    }
+
+    ctx->infos[idx].type   = tensor->type;
+    ctx->infos[idx].offset = 0;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
+
+    if (ctx->header.n_tensors > 0) {
+        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
+    }
+
+    ctx->header.n_tensors++;
+}
+
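Note how gguf_add_tensor assigns offsets cumulatively: each tensor starts where the previous one ended, rounded up to ctx->alignment. With GGML_PAD as defined in ggml.h and the default 32-byte alignment, the arithmetic for a hypothetical 6000-byte tensor works out as follows:

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))   // as in ggml.h

    size_t offset0 = 0;                             // first tensor starts at 0
    size_t offset1 = offset0 + GGML_PAD(6000, 32);  // 6000 rounds up to 6016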
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n, sizeof(val->n), 1, file);
+//    fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    }
+    buf->offset += sizeof(val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    }
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val, el_size);
+    }
+    buf->offset += el_size;
+}
+
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+    // write header
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+
+    // write key-value pairs
+    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        struct gguf_kv * kv = &ctx->kv[i];
+
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
+
+        switch (kv->type) {
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                              ); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+
+                    switch (kv->value.arr.type) {
+                        case GGUF_TYPE_UINT8:
+                        case GGUF_TYPE_INT8:
+                        case GGUF_TYPE_UINT16:
+                        case GGUF_TYPE_INT16:
+                        case GGUF_TYPE_UINT32:
+                        case GGUF_TYPE_INT32:
+                        case GGUF_TYPE_FLOAT32:
+                        case GGUF_TYPE_BOOL:
+                            {
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                            } break;
+                        case GGUF_TYPE_STRING:
+                            {
+                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                }
+                            } break;
+                        case GGUF_TYPE_ARRAY:
+                        case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                    };
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+        };
+    }
+
+    // write tensor infos
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
+
+        gguf_bwrite_str(buf, &info->name);
+        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
+        for (uint32_t j = 0; j < info->n_dims; ++j) {
+            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
+        }
+        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset     = buf->offset;
+        const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
+
+        if (offset_pad != offset) {
+            uint8_t pad = 0;
+            for (size_t i = 0; i < offset_pad - offset; ++i) {
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
+            }
+        }
+    }
+
+    if (only_meta) {
+        return;
+    }
+
+    size_t offset = 0;
+
+    // write tensor data
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
+
+        const size_t size     = info->size;
+        const size_t size_pad = GGML_PAD(size, ctx->alignment);
+
+        gguf_bwrite_el(buf, info->data, size);
+
+        if (size_pad != size) {
+            uint8_t pad = 0;
+            for (size_t j = 0; j < size_pad - size; ++j) {
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
+            }
+        }
+
+        GGML_ASSERT(offset == info->offset);
+
+        offset += size_pad;
+    }
+}
+
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
+
+    fclose(file);
+}
+
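Putting the pieces together: create an empty context, set the KV pairs, register the tensors, then serialize. A minimal end-to-end sketch, assuming gguf_init_empty, gguf_free, and the usual ggml tensor API from this file (error handling omitted, names illustrative):

    struct ggml_init_params iparams = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx  = ggml_init(iparams);
    struct gguf_context * gctx = gguf_init_empty();

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
    ggml_set_name(t, "example.weight");

    gguf_set_val_str(gctx, "general.architecture", "llama");
    gguf_add_tensor (gctx, t);

    gguf_write_to_file(gctx, "out.gguf", /*only_meta =*/ false);

    gguf_free(gctx);
    ggml_free(ctx);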
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
+
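gguf_get_meta_size exploits gguf_buf_init(0): with buf.data left NULL, gguf_bwrite_str/gguf_bwrite_el only advance the offset, so gguf_write_to_buf doubles as a pure sizing pass. The intended calling pattern is therefore size-then-fill:

    const size_t meta_size = gguf_get_meta_size(gctx);
    void * meta = malloc(meta_size);   // caller owns the buffer
    gguf_get_meta_data(gctx, meta);    // header + KV pairs + tensor infos
    // ... use the serialized metadata ...
    free(meta);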
+////////////////////////////////////////////////////////////////////////////////
+
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
     return 1;