llama_cpp 0.3.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/examples/chat.rb +4 -6
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +129 -124
- data/ext/llama_cpp/src/ggml-alloc.c +90 -113
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +350 -77
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +226 -121
- data/ext/llama_cpp/src/ggml-metal.metal +157 -35
- data/ext/llama_cpp/src/ggml.c +2724 -584
- data/ext/llama_cpp/src/ggml.h +282 -31
- data/ext/llama_cpp/src/k_quants.c +112 -56
- data/ext/llama_cpp/src/llama.cpp +4857 -2986
- data/ext/llama_cpp/src/llama.h +180 -126
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +12 -11
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -123,6 +123,8 @@ typedef void * thread_ret_t;
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
 #define GGML_SILU_FP16
+// #define GGML_CROSS_ENTROPY_EXP_FP16
+// #define GGML_FLASH_ATTN_EXP_FP16
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +159,6 @@ typedef void * thread_ret_t;
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
-#if UINTPTR_MAX == 0xFFFFFFFF
-    #define GGML_MEM_ALIGN 4
-#else
-    #define GGML_MEM_ALIGN 16
-#endif
-
 //
 // logging
 //
@@ -192,8 +188,8 @@ typedef void * thread_ret_t;
 //
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
+#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
+#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
     void * aligned_memory = NULL;
@@ -213,14 +209,13 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 error_desc = "insufficient memory";
                 break;
         }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
-                __func__, error_desc, size/(1024.0*1024.0));
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
 }
-#define GGML_ALIGNED_MALLOC(size)  ggml_aligned_malloc(size)
-#define GGML_ALIGNED_FREE(ptr)     free(ptr)
+#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#define GGML_ALIGNED_FREE(ptr)    free(ptr)
 #endif
 
 #define UNUSED GGML_UNUSED
@@ -306,6 +301,10 @@ typedef double ggml_float;
 #endif
 #endif
 
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
 #ifdef __F16C__
 
 #ifdef _MSC_VER
@@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_I8] = {
+        .type_name = "i8",
+        .blck_size = 1,
+        .type_size = sizeof(int8_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_I16] = {
+        .type_name = "i16",
+        .blck_size = 1,
+        .type_size = sizeof(int16_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_I32] = {
+        .type_name = "i32",
+        .blck_size = 1,
+        .type_size = sizeof(int32_t),
+        .is_quantized = false,
+    },
     [GGML_TYPE_F32] = {
+        .type_name = "f32",
+        .blck_size = 1,
+        .type_size = sizeof(float),
+        .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
     },
     [GGML_TYPE_F16] = {
+        .type_name = "f16",
+        .blck_size = 1,
+        .type_size = sizeof(ggml_fp16_t),
+        .is_quantized = false,
         .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
         .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_F16,
     },
     [GGML_TYPE_Q4_0] = {
+        .type_name = "q4_0",
+        .blck_size = QK4_0,
+        .type_size = sizeof(block_q4_0),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_0,
         .from_float = quantize_row_q4_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
+        .type_name = "q4_1",
+        .blck_size = QK4_1,
+        .type_size = sizeof(block_q4_1),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float = quantize_row_q4_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q5_0] = {
+        .type_name = "q5_0",
+        .blck_size = QK5_0,
+        .type_size = sizeof(block_q5_0),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_0,
         .from_float = quantize_row_q5_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q5_1] = {
+        .type_name = "q5_1",
+        .blck_size = QK5_1,
+        .type_size = sizeof(block_q5_1),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_1,
         .from_float = quantize_row_q5_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q8_0] = {
+        .type_name = "q8_0",
+        .blck_size = QK8_0,
+        .type_size = sizeof(block_q8_0),
+        .is_quantized = true,
         .to_float = dequantize_row_q8_0,
         .from_float = quantize_row_q8_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q8_1] = {
+        .type_name = "q8_1",
+        .blck_size = QK8_1,
+        .type_size = sizeof(block_q8_1),
+        .is_quantized = true,
         .from_float = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
 #ifdef GGML_USE_K_QUANTS
     [GGML_TYPE_Q2_K] = {
+        .type_name = "q2_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q2_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q2_K,
         .from_float = quantize_row_q2_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q3_K] = {
+        .type_name = "q3_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q3_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q3_K,
         .from_float = quantize_row_q3_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q4_K] = {
+        .type_name = "q4_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q4_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_K,
         .from_float = quantize_row_q4_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q5_K] = {
+        .type_name = "q5_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q5_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_K,
         .from_float = quantize_row_q5_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q6_K] = {
+        .type_name = "q6_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q6_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q6_K,
         .from_float = quantize_row_q6_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q8_K] = {
+        .type_name = "q8_K",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_q8_K),
+        .is_quantized = true,
         .from_float = quantize_row_q8_K,
     }
 #endif
 };
 
 // For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) {
-    GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return type_traits[i];
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+    GGML_ASSERT(type < GGML_TYPE_COUNT);
+    return type_traits[type];
 }
 
 
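The four parallel metadata arrays the old code kept (removed further down in this diff) are folded into this single type_traits table. As a minimal sketch of the consolidated lookup, assuming only the public accessors declared in ggml.h (hypothetical caller code, not part of the diff):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        const enum ggml_type t = GGML_TYPE_Q4_0;
        // Each accessor now reads one field of type_traits[t].
        printf("%s: blck_size=%d type_size=%zu quantized=%d\n",
               ggml_type_name(t),            // "q4_0"
               ggml_blck_size(t),            // QK4_0, i.e. 32 values per block
               ggml_type_size(t),            // sizeof(block_q4_0)
               (int) ggml_is_quantized(t));  // 1
        return 0;
    }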
@@ -2363,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q4_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -2372,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q4_0 * restrict x0 = &x[i + 0];
         const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2550,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     }
 
     // Main loop
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 2; i < nb; i+=2) {
         _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
         _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2607,6 +2681,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
+        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
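The new RVV branch processes one q4_0 block per iteration with vector length qk/2 = 16. For orientation, here is a plain-C sketch of the quantity it computes, mirroring the scalar #else branch of this same function (block layouts and GGML_FP16_TO_FP32 as defined earlier in ggml.c; the helper name is hypothetical):

    // Low and high nibbles are unpacked, re-centered by -8, multiplied
    // against the int8 values, and the per-block integer sum is scaled
    // by both fp16 block scales -- exactly what the intrinsics vectorize.
    static float dot_q4_0_q8_0_ref(int nb, const block_q4_0 * x, const block_q8_0 * y) {
        float sumf = 0.0f;
        for (int i = 0; i < nb; i++) {
            int sumi = 0;
            for (int j = 0; j < QK4_0/2; j++) {
                const int v0 = (x[i].qs[j] & 0x0F) - 8;
                const int v1 = (x[i].qs[j] >>   4) - 8;
                sumi += v0*y[i].qs[j] + v1*y[i].qs[j + QK4_0/2];
            }
            sumf += sumi * GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d);
        }
        return sumf;
    }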
@@ -2633,7 +2742,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q4_1 * restrict x = vx;
     const block_q8_1 * restrict y = vy;
@@ -2645,6 +2753,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 
     float summs = 0;
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q4_1 * restrict x0 = &x[i + 0];
         const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2733,6 +2842,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -2759,7 +2900,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
     assert(qk == QK5_0);
 
     const block_q5_0 * restrict x = vx;
@@ -2775,6 +2915,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     uint64_t tmp0[4];
     uint64_t tmp1[4];
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q5_0 * restrict x0 = &x[i];
         const block_q5_0 * restrict x1 = &x[i + 1];
@@ -2967,6 +3108,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    uint32_t qh;
+
+    // These temp values are for masking and shift operations
+    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                           0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+        // temporary registers
+        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
+        vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
+        vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
+        vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
+
+        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
+        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
+        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+
+        // ((qh & (1u << (j + 16))) >> (j + 12));
+        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
+        vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+
+        // narrowing
+        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
+        vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
+        vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+        // load
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
+        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -2999,7 +3210,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
     assert(qk == QK5_1);
 
     const block_q5_1 * restrict x = vx;
@@ -3018,6 +3228,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     uint64_t tmp0[4];
     uint64_t tmp1[4];
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q5_1 * restrict x0 = &x[i];
         const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3223,6 +3434,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    uint32_t qh;
+
+    // These temp values are for shift operations
+    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+        // temporary registers
+        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
+        vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
+
+        // load qh
+        vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+
+        // ((qh >> (j + 0)) << 4) & 0x10;
+        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
+        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+
+        // ((qh >> (j + 12)) ) & 0x10;
+        vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
+        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+
+        // narrowing
+        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
+        vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
+        vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+        // load
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -3255,7 +3532,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;
 
     assert(n % qk == 0);
-    assert(nb % 2 == 0);
 
     const block_q8_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -3264,6 +3540,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q8_0 * restrict x0 = &x[i + 0];
         const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3334,6 +3611,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+    size_t vl = __riscv_vsetvl_e8m1(qk);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
+        vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+
+        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
+        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
+
+        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -3481,9 +3778,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 
-static const float GELU_COEF_A    = 0.044715f;
-static const float GELU_QUICK_COEF    = -1.702f;
-static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
+static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
 
 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
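For reference, the constants spelled out above implement the standard tanh approximation of GELU used by ggml_gelu_f32 (a restatement of the code, not part of the diff):

    GELU(x) ≈ 0.5 x (1 + tanh(sqrt(2/π) · x (1 + 0.044715 x²))),  with sqrt(2/π) ≈ 0.79788456

and GELU_QUICK_COEF = -1.702 drives the sigmoid shortcut GELU_quick(x) = x / (1 + e^(-1.702 x)).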
@@ -3652,95 +3949,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
 // data types
 //
 
-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32] = 1,
-    [GGML_TYPE_F16] = 1,
-    [GGML_TYPE_Q4_0] = QK4_0,
-    [GGML_TYPE_Q4_1] = QK4_1,
-    [GGML_TYPE_Q5_0] = QK5_0,
-    [GGML_TYPE_Q5_1] = QK5_1,
-    [GGML_TYPE_Q8_0] = QK8_0,
-    [GGML_TYPE_Q8_1] = QK8_1,
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = QK_K,
-    [GGML_TYPE_Q3_K] = QK_K,
-    [GGML_TYPE_Q4_K] = QK_K,
-    [GGML_TYPE_Q5_K] = QK_K,
-    [GGML_TYPE_Q6_K] = QK_K,
-    [GGML_TYPE_Q8_K] = QK_K,
-#endif
-    [GGML_TYPE_I8] = 1,
-    [GGML_TYPE_I16] = 1,
-    [GGML_TYPE_I32] = 1,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32] = sizeof(float),
-    [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
-    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
-    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
-    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
-    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
-    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
-    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
-#endif
-    [GGML_TYPE_I8] = sizeof(int8_t),
-    [GGML_TYPE_I16] = sizeof(int16_t),
-    [GGML_TYPE_I32] = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32] = "f32",
-    [GGML_TYPE_F16] = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_Q2_K] = "q2_K",
-    [GGML_TYPE_Q3_K] = "q3_K",
-    [GGML_TYPE_Q4_K] = "q4_K",
-    [GGML_TYPE_Q5_K] = "q5_K",
-    [GGML_TYPE_Q6_K] = "q6_K",
-    [GGML_TYPE_Q8_K] = "q8_K",
-    [GGML_TYPE_I8] = "i8",
-    [GGML_TYPE_I16] = "i16",
-    [GGML_TYPE_I32] = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32] = false,
-    [GGML_TYPE_F16] = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_Q2_K] = true,
-    [GGML_TYPE_Q3_K] = true,
-    [GGML_TYPE_Q4_K] = true,
-    [GGML_TYPE_Q5_K] = true,
-    [GGML_TYPE_Q6_K] = true,
-    [GGML_TYPE_Q8_K] = true,
-    [GGML_TYPE_I8] = false,
-    [GGML_TYPE_I16] = false,
-    [GGML_TYPE_I32] = false,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
 static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
@@ -3760,10 +3968,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ARGMAX",
     "REPEAT",
     "REPEAT_BACK",
+    "CONCAT",
     "SILU_BACK",
     "NORM",
     "RMS_NORM",
     "RMS_NORM_BACK",
+    "GROUP_NORM",
 
     "MUL_MAT",
     "OUT_PROD",
@@ -3789,20 +3999,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CLAMP",
     "CONV_1D",
     "CONV_2D",
+    "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
+    "UPSCALE",
 
     "FLASH_ATTN",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
     "WIN_PART",
     "WIN_UNPART",
+    "GET_REL_POS",
+    "ADD_REL_POS",
 
     "UNARY",
 
     "MAP_UNARY",
     "MAP_BINARY",
 
+    "MAP_CUSTOM1_F32",
+    "MAP_CUSTOM2_F32",
+    "MAP_CUSTOM3_F32",
+
     "MAP_CUSTOM1",
     "MAP_CUSTOM2",
     "MAP_CUSTOM3",
@@ -3811,7 +4029,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3832,10 +4050,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "argmax(x)",
     "repeat(x)",
    "repeat_back(x)",
+    "concat(x, y)",
     "silu_back(x)",
     "norm(x)",
     "rms_norm(x)",
     "rms_norm_back(x)",
+    "group_norm(x)",
 
     "X*Y",
     "X*Y",
@@ -3861,20 +4081,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "clamp(x)",
     "conv_1d(x)",
     "conv_2d(x)",
+    "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
+    "upscale(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
     "win_part(x)",
     "win_unpart(x)",
+    "get_rel_pos(x)",
+    "add_rel_pos(x)",
 
     "unary(x)",
 
     "f(x)",
     "f(x,y)",
 
+    "custom_f32(x)",
+    "custom_f32(x,y)",
+    "custom_f32(x,y,z)",
+
     "custom(x)",
     "custom(x,y)",
     "custom(x,y,z)",
@@ -3883,7 +4111,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -3913,8 +4141,10 @@ static void ggml_setup_op_has_task_pass(void) {
         p[GGML_OP_DIAG_MASK_ZERO ] = true;
         p[GGML_OP_CONV_1D ] = true;
         p[GGML_OP_CONV_2D ] = true;
+        p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
         p[GGML_OP_FLASH_ATTN_BACK ] = true;
         p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+        p[GGML_OP_ADD_REL_POS ] = true;
     }
 
     { // FINALIZE
@@ -4101,38 +4331,41 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 }
 
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    // this should handle cases where the tensor is not contiguous in memory
-    // probaby just:
-    //
-    //     return tensor->ne[3]*tensor->nb[3]
-    //
-    // is enough, but just in case, adding the second part
+    size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+        nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+    }
+    return nbytes;
+}
 
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
 }
 
 int ggml_blck_size(enum ggml_type type) {
-    return GGML_BLCK_SIZE[type];
+    return type_traits[type].blck_size;
 }
 
 size_t ggml_type_size(enum ggml_type type) {
-    return GGML_TYPE_SIZE[type];
+    return type_traits[type].type_size;
 }
 
 float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
+    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
 const char * ggml_type_name(enum ggml_type type) {
-    return GGML_TYPE_NAME[type];
+    return type_traits[type].type_name;
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+    return type_traits[type].is_quantized;
 }
 
 const char * ggml_op_name(enum ggml_op op) {
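The rewritten ggml_nbytes walks the actual strides instead of the old MAX() heuristic, so non-contiguous views are no longer over-counted. A worked sketch with an assumed contiguous 4×3 f32 tensor:

    // ne = {4, 3, 1, 1}, nb = {4, 16, 48, 48} bytes, blck_size(f32) = 1:
    //   nbytes = ne[0]*nb[0]/1      = 16
    //          + (ne[1] - 1)*nb[1]  = 32
    //          + (ne[2] - 1)*nb[2]  =  0
    //          + (ne[3] - 1)*nb[3]  =  0
    //          = 48 bytes           == 4*3*sizeof(float)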
@@ -4144,7 +4377,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
 }
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return GGML_TYPE_SIZE[tensor->type];
+    return ggml_type_size(tensor->type);
 }
 
 static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4415,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
            (t0->ne[3] == t1->ne[3]);
 }
 
-bool ggml_is_quantized(enum ggml_type type) {
-    return GGML_IS_QUANTIZED[type];
-}
-
 enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     enum ggml_type wtype = GGML_TYPE_COUNT;
 
@@ -4223,8 +4452,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4233,7 +4462,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4248,7 +4477,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4560,36 +4789,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         enum ggml_type type,
         int n_dims,
         const int64_t * ne,
-        void * data) {
+        struct ggml_tensor * view_src,
+        size_t view_offs) {
 
     assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
 
-    size_t data_size = 0;
+    // find the base tensor and absolute offset
+    if (view_src != NULL && view_src->view_src != NULL) {
+        view_offs += view_src->view_offs;
+        view_src = view_src->view_src;
+    }
+
+    size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+    for (int i = 1; i < n_dims; i++) {
+        data_size *= ne[i];
+    }
+
+    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
 
-    if (data == NULL && !ctx->no_alloc) {
-        data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
-        for (int i = 1; i < n_dims; i++) {
-            data_size *= ne[i];
-        }
+    void * data = view_src != NULL ? view_src->data : NULL;
+    if (data != NULL) {
+        data = (char *) data + view_offs;
     }
 
-    if (ctx->scratch.data != NULL && data == NULL) {
-        // allocate tensor data in the scratch buffer
-        if (ctx->scratch.offs + data_size > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
-                    __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
-            assert(false);
-            return NULL;
-        }
+    size_t obj_alloc_size = 0;
 
-        data = (char * const) ctx->scratch.data + ctx->scratch.offs;
+    if (view_src == NULL && ctx->no_alloc == false) {
+        if (ctx->scratch.data != NULL) {
+            // allocate tensor data in the scratch buffer
+            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+                GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                        __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
+                assert(false);
+                return NULL;
+            }
 
-        ctx->scratch.offs += data_size;
+            data = (char * const) ctx->scratch.data + ctx->scratch.offs;
 
-        data_size = 0;
+            ctx->scratch.offs += data_size;
+        } else {
+            // allocate tensor data in the context's memory pool
+            obj_alloc_size = data_size;
+        }
     }
 
-    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
+    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
 
     // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
 
@@ -4609,7 +4853,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
+        /*.view_src     =*/ view_src,
+        /*.view_offs    =*/ view_offs,
+        /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
         /*.padding      =*/ { 0 },
@@ -4622,8 +4868,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         result->ne[i] = ne[i];
     }
 
-    result->nb[0] = GGML_TYPE_SIZE[type];
-    result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
+    result->nb[0] = ggml_type_size(type);
+    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
@@ -4633,28 +4879,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     return result;
 }
 
-static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
-    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
-    assert(params_size <= GGML_MAX_OP_PARAMS);
-    memcpy(tensor->op_params, params, params_size);
-}
-
-static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    return ((const int32_t *)(tensor->op_params))[i];
-}
-
-static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    ((int32_t *)(tensor->op_params))[i] = value;
-}
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
         const int64_t * ne) {
-    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
+    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
 }
 
 struct ggml_tensor * ggml_new_tensor_1d(
@@ -4719,7 +4949,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
+    return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+}
+
+static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
+    assert(params_size <= GGML_MAX_OP_PARAMS);
+    memcpy(tensor->op_params, params, params_size);
+}
+
+static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    return ((const int32_t *)(tensor->op_params))[i];
+}
+
+static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    ((int32_t *)(tensor->op_params))[i] = value;
 }
 
 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5005,14 +5251,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
 
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
-        const struct ggml_tensor * src) {
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+        struct ggml_tensor * src) {
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
     ggml_format_name(result, "%s (view)", src->name);
 
-    result->nb[0] = src->nb[0];
-    result->nb[1] = src->nb[1];
-    result->nb[2] = src->nb[2];
-    result->nb[3] = src->nb[3];
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = src->nb[i];
+    }
 
     return result;
 }
@@ -5545,10 +5790,6 @@ struct ggml_tensor * ggml_repeat(
         is_node = true;
     }
 
-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
 
     result->op = GGML_OP_REPEAT;
@@ -5587,6 +5828,30 @@ struct ggml_tensor * ggml_repeat_back(
     return result;
 }
 
+// ggml_concat
+
+struct ggml_tensor * ggml_concat(
+    struct ggml_context* ctx,
+    struct ggml_tensor* a,
+    struct ggml_tensor* b) {
+    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+
+    result->op = GGML_OP_CONCAT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_abs
 
 struct ggml_tensor * ggml_abs(
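ggml_concat is new in this release: it concatenates along dimension 2 and requires the other three dimensions to match (per the GGML_ASSERT above). A hypothetical usage sketch, assuming ctx is an initialized ggml_context:

    // {8, 4, 2, 1} concatenated with {8, 4, 3, 1} yields {8, 4, 5, 1}.
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 2);
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 3);
    struct ggml_tensor * c = ggml_concat(ctx, a, b);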
@@ -5755,6 +6020,7 @@ struct ggml_tensor * ggml_silu_back(
 static struct ggml_tensor * ggml_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        float eps,
         bool inplace) {
     bool is_node = false;
 
@@ -5765,7 +6031,7 @@ static struct ggml_tensor * ggml_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    // TODO: maybe store epsilon here?
+    ggml_set_op_params(result, &eps, sizeof(eps));
 
     result->op = GGML_OP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5776,16 +6042,20 @@ static struct ggml_tensor * ggml_norm_impl(
 
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_norm_impl(ctx, a, false);
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, false);
 }
 
 struct ggml_tensor * ggml_norm_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_norm_impl(ctx, a, true);
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, true);
 }
 
+// ggml_rms_norm
+
 static struct ggml_tensor * ggml_rms_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
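This is a source-incompatible change: ggml_norm and ggml_norm_inplace now take epsilon explicitly instead of relying on a value baked into the op. A migration sketch for caller code (the 1e-5f value is illustrative; pick it per model):

    // before: struct ggml_tensor * y = ggml_norm(ctx, x);
    struct ggml_tensor * y = ggml_norm(ctx, x, 1e-5f);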
@@ -5822,10 +6092,13 @@ struct ggml_tensor * ggml_rms_norm_inplace(
     return ggml_rms_norm_impl(ctx, a, eps, true);
 }
 
+// ggml_rms_norm_back
+
 struct ggml_tensor * ggml_rms_norm_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b) {
+        struct ggml_tensor * b,
+        float eps) {
     bool is_node = false;
 
     if (a->grad) {
@@ -5835,6 +6108,8 @@ struct ggml_tensor * ggml_rms_norm_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
+    ggml_set_op_params(result, &eps, sizeof(eps));
+
     result->op = GGML_OP_RMS_NORM_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
@@ -5843,6 +6118,44 @@ struct ggml_tensor * ggml_rms_norm_back(
     return result;
 }
 
+// ggml_group_norm
+
+static struct ggml_tensor * ggml_group_norm_impl(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups,
+    bool inplace) {
+
+    bool is_node = false;
+    if (!inplace && (a->grad)) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_GROUP_NORM;
+    result->op_params[0] = n_groups;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL; // TODO: maybe store epsilon here?
+
+    return result;
+}
+
+struct ggml_tensor * ggml_group_norm(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups) {
+    return ggml_group_norm_impl(ctx, a, n_groups, false);
+}
+
+struct ggml_tensor * ggml_group_norm_inplace(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups) {
+    return ggml_group_norm_impl(ctx, a, n_groups, true);
+}
 
 // ggml_mul_mat
 
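ggml_group_norm is also new; it normalizes over n_groups groups of channels, with epsilon currently fixed inside the op (note the TODO above). A hypothetical call, using the 32 groups typical of SAM- and diffusion-style models (ctx and x assumed as usual):

    struct ggml_tensor * y = ggml_group_norm(ctx, x, 32);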
@@ -6126,7 +6439,7 @@ struct ggml_tensor * ggml_reshape(
         //GGML_ASSERT(false);
     }
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6150,7 +6463,7 @@ struct ggml_tensor * ggml_reshape_1d(
     }
 
     const int64_t ne[1] = { ne0 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6175,7 +6488,7 @@ struct ggml_tensor * ggml_reshape_2d(
     }
 
     const int64_t ne[2] = { ne0, ne1 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6201,7 +6514,7 @@ struct ggml_tensor * ggml_reshape_3d(
     }
 
     const int64_t ne[3] = { ne0, ne1, ne2 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6211,7 +6524,6 @@ struct ggml_tensor * ggml_reshape_3d(
     return result;
 }
 
-
 struct ggml_tensor * ggml_reshape_4d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -6229,7 +6541,7 @@ struct ggml_tensor * ggml_reshape_4d(
     }
 
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op = GGML_OP_RESHAPE;
@@ -6239,46 +6551,40 @@ struct ggml_tensor * ggml_reshape_4d(
     return result;
 }
 
-// ggml_view_1d
-
-static struct ggml_tensor * ggml_view_tensor_offset(
+static struct ggml_tensor * ggml_view_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         int n_dims,
         const int64_t * ne,
         size_t offset) {
-    // don't calculate an offset from an unallocated tensor
-    void * data = NULL;
-    if (a->data != NULL) {
-        data = (char *) a->data + offset;
-    }
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+    bool is_node = false;
+
+    if (a->grad) {
+        is_node = true;
+    }
 
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
     ggml_format_name(result, "%s (view)", a->name);
 
     ggml_set_op_params(result, &offset, sizeof(offset));
 
+    result->op = GGML_OP_VIEW;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
     return result;
 }
 
+// ggml_view_1d
+
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         int64_t ne0,
         size_t offset) {
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
-
-    result->op = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
 
     return result;
 }
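With view_src/view_offs threaded through ggml_new_tensor_impl, chained views now resolve to their root tensor at creation time. A sketch of the bookkeeping (shapes and offsets are illustrative; ctx assumed as usual):

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * v = ggml_view_1d(ctx, t, 8, 4*sizeof(float)); // elements 4..11 of t
    struct ggml_tensor * w = ggml_view_1d(ctx, v, 2, 2*sizeof(float)); // elements 2..3 of v
    // ggml_new_tensor_impl folds the chain: w->view_src == t and
    // w->view_offs == 6*sizeof(float), so w aliases elements 6..7 of t.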
@@ -6293,24 +6599,14 @@ struct ggml_tensor * ggml_view_2d(
         size_t nb1,
         size_t offset) {
 
-    bool is_node = false;
+    const int64_t ne[2] = { ne0, ne1 };
 
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
-
-    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;
     result->nb[3] = result->nb[2];
 
-    result->op = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
     return result;
 }
 
@@ -6326,24 +6622,14 @@ struct ggml_tensor * ggml_view_3d(
         size_t nb2,
         size_t offset) {
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+    const int64_t ne[3] = { ne0, ne1, ne2 };
 
-    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = result->nb[2]*ne2;
 
-    result->op = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
     return result;
 }
 
@@ -6361,24 +6647,14 @@ struct ggml_tensor * ggml_view_4d(
         size_t                nb3,
         size_t                offset) {
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
 
-    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
 
     result->nb[1] = nb1;
    result->nb[2] = nb2;
    result->nb[3] = nb3;
 
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
     return result;
 }
 
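The four `ggml_view_*d` constructors above now funnel through the single `ggml_view_impl` helper, so the `GGML_OP_VIEW` bookkeeping (op, grad, `src[0]`) lives in one place and each wrapper only prepares its `ne` array and strides. A minimal sketch of what a caller sees, assuming an initialized `ggml_context`; the helper function name and the 4x4 shape are illustrative, not from the diff:

    #include "ggml.h"

    // Take a 2x2 sub-view of a 4x4 F32 tensor starting at element (1,1).
    // Strides and the offset are in bytes, taken from the parent tensor.
    static struct ggml_tensor * make_subview(struct ggml_context * ctx) {
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        struct ggml_tensor * v = ggml_view_2d(ctx, t, 2, 2,
                t->nb[1],                 // keep the parent's row stride
                1*t->nb[1] + 1*t->nb[0]); // byte offset of element (1,1)
        return v; // v->op == GGML_OP_VIEW and v->src[0] == t, set by ggml_view_impl
    }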
@@ -6565,7 +6841,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past, inplace ? 1 : 0 };
+    int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_INF;
@@ -6582,7 +6858,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
     return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
 }
 
-
 struct ggml_tensor * ggml_diag_mask_inf_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -6605,7 +6880,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past, inplace ? 1 : 0 };
+    int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_ZERO;
@@ -6711,6 +6986,8 @@ static struct ggml_tensor * ggml_rope_impl(
         int                   n_ctx,
         float                 freq_base,
         float                 freq_scale,
+        float                 xpos_base,
+        bool                  xpos_down,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6721,9 +6998,11 @@ static struct ggml_tensor * ggml_rope_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[6] = { n_past, n_dims, mode, n_ctx };
+    int32_t params[8] = { n_past, n_dims, mode, n_ctx };
     memcpy(params + 4, &freq_base,  sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params + 7, &xpos_down,  sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE;
@@ -6740,7 +7019,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6750,7 +7029,7 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
 }
 
 struct ggml_tensor * ggml_rope_custom(
@@ -6762,7 +7041,7 @@ struct ggml_tensor * ggml_rope_custom(
         int                   n_ctx,
         float                 freq_base,
         float                 freq_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
 }
 
 struct ggml_tensor * ggml_rope_custom_inplace(
@@ -6774,7 +7053,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         int                   n_ctx,
         float                 freq_base,
         float                 freq_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
+}
+
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        float                 base,
+        bool                  down) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
 }
 
 // ggml_rope_back
@@ -6785,7 +7074,11 @@ struct ggml_tensor * ggml_rope_back(
         int                   n_past,
         int                   n_dims,
         int                   mode,
-        int                   n_ctx) {
+        int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 xpos_base,
+        bool                  xpos_down) {
     GGML_ASSERT(n_past >= 0);
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
@@ -6797,7 +7090,11 @@ struct ggml_tensor * ggml_rope_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past, n_dims, mode, n_ctx };
+    int32_t params[8] = { n_past, n_dims, mode, n_ctx };
+    memcpy(params + 4, &freq_base,  sizeof(float));
+    memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params + 7, &xpos_down,  sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE_BACK;
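Both RoPE directions now pack eight op_param slots: four int32 values first, then the floats and the bool bit-copied into slots 4-7 with memcpy. A minimal standalone sketch of that layout; the concrete numbers are illustrative, not taken from the diff:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        int32_t params[8] = { /*n_past*/ 32, /*n_dims*/ 128, /*mode*/ 0, /*n_ctx*/ 0 };
        float xpos_base = 512.0f;
        bool  xpos_down = false;
        memcpy(params + 6, &xpos_base, sizeof(float)); // float stored bit-for-bit in an int32 slot
        memcpy(params + 7, &xpos_down, sizeof(bool));

        float decoded;
        memcpy(&decoded, params + 6, sizeof(float));   // reads back 512.0f
        printf("xpos_base = %f\n", decoded);
        return 0;
    }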
@@ -6904,6 +7201,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
     return result;
 }
 
+// ggml_conv_1d_ph
+
+struct ggml_tensor* ggml_conv_1d_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s,
+        int                   d) {
+    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+}
+
 // ggml_conv_2d
 
 struct ggml_tensor * ggml_conv_2d(
@@ -6944,17 +7252,61 @@ struct ggml_tensor * ggml_conv_2d(
 
 }
 
-//
+// ggml_conv_2d_sk_p0
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_conv_2d_sk_p0(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b
-
-
-
+        struct ggml_tensor  * b) {
+    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+}
+
+// ggml_conv_2d_s1_ph
+
+struct ggml_tensor * ggml_conv_2d_s1_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
+}
+
+// ggml_conv_transpose_2d_p0
+
+static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+    return (ins - 1) * s - 2 * p + ks;
 }
 
+struct ggml_tensor * ggml_conv_transpose_2d_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   stride) {
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = {
+        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
+        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
+        a->ne[2], b->ne[3],
+    };
+
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, stride);
+
+    result->op = GGML_OP_CONV_TRANSPOSE_2D;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_pool_*
 
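The transposed-convolution output size above is the standard formula (ins - 1) * s - 2 * p + ks. A standalone worked example; the local helper name `out_size` and the 16x16/4x4 numbers are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    // Same formula as ggml_calc_conv_transpose_output_size in the diff.
    static int64_t out_size(int64_t ins, int64_t ks, int s, int p) {
        return (ins - 1) * s - 2 * p + ks;
    }

    int main(void) {
        // a 16x16 input, 4x4 kernel, stride 2, padding 0 -> 34x34 output
        printf("%lld\n", (long long) out_size(16, 4, 2, 0));
        return 0;
    }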
@@ -7032,6 +7384,40 @@ struct ggml_tensor * ggml_pool_2d(
     return result;
 }
 
+// ggml_upscale
+
+static struct ggml_tensor * ggml_upscale_impl(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] * scale_factor,
+            a->ne[1] * scale_factor,
+            a->ne[2], a->ne[3]);
+
+    result->op = GGML_OP_UPSCALE;
+    result->op_params[0] = scale_factor;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_upscale(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    return ggml_upscale_impl(ctx, a, scale_factor);
+}
+
 // ggml_flash_attn
 
 struct ggml_tensor * ggml_flash_attn(
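GGML_OP_UPSCALE scales the first two dims by an integer factor; its forward pass (further down in this diff) maps output element (n, m) back to input element (n / f, m / f), i.e. nearest-neighbour repetition. A standalone sketch of that index rule with f = 2 on a 2x2 grid (values illustrative):

    #include <stdio.h>

    int main(void) {
        const int f = 2, ne0 = 2, ne1 = 2;
        const float src[2][2] = { { 1, 2 }, { 3, 4 } };
        for (int m = 0; m < ne1*f; m++) {
            for (int n = 0; n < ne0*f; n++) {
                printf("%.0f ", src[m/f][n/f]); // each source value repeats f times per axis
            }
            printf("\n");
        }
        return 0;
    }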
@@ -7230,6 +7616,87 @@ struct ggml_tensor * ggml_win_unpart(
     return result;
 }
 
+// ggml_get_rel_pos
+
+struct ggml_tensor * ggml_get_rel_pos(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   qh,
+        int                   kh) {
+    GGML_ASSERT(qh == kh);
+    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
+
+    result->op   = GGML_OP_GET_REL_POS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
+
+    return result;
+}
+
+// ggml_add_rel_pos
+
+static struct ggml_tensor * ggml_add_rel_pos_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_are_same_shape(pw, ph));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(pw));
+    GGML_ASSERT(ggml_is_contiguous(ph));
+    GGML_ASSERT(ph->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->ne[3] == a->ne[2]);
+    GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
+    GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || pw->grad || ph->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
+
+    result->op   = GGML_OP_ADD_REL_POS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = pw;
+    result->src[2] = ph;
+
+    return result;
+}
+
+
+struct ggml_tensor * ggml_add_rel_pos(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
+}
+
+struct ggml_tensor * ggml_add_rel_pos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
+}
+
 // gmml_unary
 
 static struct ggml_tensor * ggml_unary_impl(
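These two ops support the decomposed relative-position attention used by SAM's image encoder (the forward implementations later in this diff cite it). The assertions encode a simple shape contract between the score tensor and the per-axis position tables; a standalone sketch with illustrative numbers (W and B are hypothetical, not from the diff):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const int64_t W = 14, B = 12;             // hypothetical window size / batch
        const int64_t pw_ne[4] = { W, W, W, B };  // ph has the same shape as pw
        const int64_t a_ne[3]  = { W*W, W*W, B }; // attention scores
        assert(pw_ne[0]*pw_ne[0] == a_ne[0]);     // same checks as ggml_add_rel_pos_impl
        assert(pw_ne[1]*pw_ne[2] == a_ne[1]);
        assert(pw_ne[3] == a_ne[2]);
        return 0;
    }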
@@ -7745,7 +8212,7 @@ static void ggml_compute_forward_dup_same_cont(
         memcpy(
             ((char *)  dst->data + ie0*nb0),
             ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+            (ie1 - ie0) * ggml_type_size(src0->type));
     }
 
 }
@@ -7779,7 +8246,7 @@ static void ggml_compute_forward_dup_f16(
 
     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +8304,7 @@ static void ggml_compute_forward_dup_f16(
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;
 
                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8517,7 @@ static void ggml_compute_forward_dup_f32(
 
     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8556,7 @@ static void ggml_compute_forward_dup_f32(
                 ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
 
                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;
 
                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8968,7 @@ static void ggml_compute_forward_add_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
@@ -8775,7 +9242,7 @@ static void ggml_compute_forward_add1_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
     // we don't support permuted src0
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 <= nb1);
@@ -9137,6 +9604,8 @@ static void ggml_compute_forward_mul(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
@@ -9179,6 +9648,8 @@ static void ggml_compute_forward_div_f32(
 
 
 #ifdef GGML_USE_ACCELERATE
+                UNUSED(ggml_vec_div_f32);
+
                 vDSP_vdiv(
                         (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
                         (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -9731,6 +10202,72 @@ static void ggml_compute_forward_repeat_back(
     }
 }
 
+// ggml_compute_forward_concat
+
+static void ggml_compute_forward_concat_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    const struct ggml_tensor * src1,
+    struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2++) {
+            if (i2 < ne02) { // src0
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            } // src1
+            else {
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat(
+    const struct ggml_compute_params* params,
+    const struct ggml_tensor* src0,
+    const struct ggml_tensor* src1,
+    struct ggml_tensor* dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_concat_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_abs
 
 static void ggml_compute_forward_abs_f32(
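The new concat kernel joins two tensors along dim 2: output channel i2 is copied from src0 when i2 < ne02, otherwise from src1 at channel i2 - ne02. A plain-C sketch of that selection rule on channel indices (the channel counts are illustrative):

    #include <stdio.h>

    int main(void) {
        const int ne02 = 2, ne12 = 3; // src0 / src1 channel counts
        for (int i2 = 0; i2 < ne02 + ne12; i2++) {
            if (i2 < ne02) {
                printf("dst channel %d <- src0 channel %d\n", i2, i2);
            } else {
                printf("dst channel %d <- src1 channel %d\n", i2, i2 - ne02);
            }
        }
        return 0;
    }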
@@ -10285,7 +10822,8 @@ static void ggml_compute_forward_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS;
 
-    const float eps = 1e-5f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10334,6 +10872,8 @@ static void ggml_compute_forward_norm(
     }
 }
 
+// ggml_compute_forward_group_rms_norm
+
 static void ggml_compute_forward_rms_norm_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10398,7 +10938,6 @@ static void ggml_compute_forward_rms_norm(
     }
 }
 
-
 static void ggml_compute_forward_rms_norm_back_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10417,7 +10956,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
 
     GGML_TENSOR_BINARY_OP_LOCALS;
 
-    const float eps = 1e-6f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10572,54 +11112,144 @@ static void ggml_compute_forward_rms_norm_back(
     }
 }
 
-// ggml_compute_forward_mul_mat
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
+// ggml_compute_forward_group_norm
 
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+static void ggml_compute_forward_group_norm_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
     }
 
-    return false;
-}
-#endif
-
-static void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
 
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const enum ggml_type type = src0->type;
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
-    const bool src1_cont = ggml_is_contiguous(src1);
+    const float eps = 1e-6f; // TODO: make this a parameter
 
-    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
+    // TODO: optimize
+
+    int n_channels = src0->ne[2];
+    int n_groups = dst->op_params[0];
+    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+    for (int i = ith; i < n_groups; i+=nth) {
+        int start = i * n_channels_per_group;
+        int end = start + n_channels_per_group;
+        if (end > n_channels) {
+            end = n_channels;
+        }
+        int step = end - start;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            ggml_float sum = 0.0;
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        sum += (ggml_float)x[i00];
+                    }
+                }
+            }
+            float mean = sum / (ne00 * ne01 * step);
+            ggml_float sum2 = 0.0;
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        float v = x[i00] - mean;
+                        y[i00] = v;
+                        sum2 += (ggml_float)(v * v);
+                    }
+                }
+            }
+            float variance = sum2 / (ne00 * ne01 * step);
+            const float scale = 1.0f / sqrtf(variance + eps);
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+                    ggml_vec_scale_f32(ne00, y, scale);
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_group_norm(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_group_norm_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_mul_mat
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+// helper function to determine if it is better to use BLAS or not
+// for large matrices, BLAS is faster
+static bool ggml_compute_forward_mul_mat_use_blas(
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    //const int64_t ne00 = src0->ne[0];
+    //const int64_t ne01 = src0->ne[1];
+
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    if (ggml_is_contiguous(src0) &&
+        ggml_is_contiguous(src1) &&
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+
+        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
+        return true;
+    }
+
+    return false;
+}
+#endif
+
+static void ggml_compute_forward_mul_mat(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
     enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
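The new group-norm kernel normalizes each group of channels with a mean and variance taken over ne00 * ne01 * step elements, then scales by 1/sqrt(variance + eps) with eps hard-coded to 1e-6. A standalone reference computation for a single group (the four sample values are illustrative):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float eps = 1e-6f; // same hard-coded value as the kernel above
        float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        const int n = 4;

        float sum = 0.0f;
        for (int i = 0; i < n; i++) sum += x[i];
        const float mean = sum / n;

        float sum2 = 0.0f;
        for (int i = 0; i < n; i++) { x[i] -= mean; sum2 += x[i]*x[i]; }
        const float scale = 1.0f / sqrtf(sum2/n + eps);

        for (int i = 0; i < n; i++) printf("%f\n", x[i]*scale);
        return 0;
    }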
@@ -10629,7 +11259,7 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(ne3 == ne13);
 
     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
@@ -10638,6 +11268,10 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
@@ -10657,11 +11291,6 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        //       ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
         if (params->ith != 0) {
             return;
         }
@@ -10674,12 +11303,16 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const void  * x = (char *) src0->data + i03*nb03 + i02*nb02;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+        for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                // broadcast src0 into src1 across 2nd,3rd dimension
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
 
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+                const void  * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+
+                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
|
|
10687
11320
|
|
10688
11321
|
size_t id = 0;
|
10689
11322
|
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
10690
|
-
to_float((char *)
|
11323
|
+
to_float((const char *) x + i01*nb01, wdata + id, ne00);
|
10691
11324
|
id += ne00;
|
10692
11325
|
}
|
10693
11326
|
|
@@ -10712,7 +11345,7 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
|
10732
11365
|
}
|
10733
11366
|
|
10734
11367
|
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
10735
|
-
const size_t row_size = ne10*
|
11368
|
+
const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
|
10736
11369
|
|
10737
11370
|
const int64_t nr0 = ne01; // src0 rows
|
10738
11371
|
const int64_t nr1 = ne11*ne12*ne13; // src1 rows
|
@@ -10767,10 +11400,6 @@ static void ggml_compute_forward_mul_mat(
|
|
10767
11400
|
assert(ne12 % ne02 == 0);
|
10768
11401
|
assert(ne13 % ne03 == 0);
|
10769
11402
|
|
10770
|
-
// broadcast factors
|
10771
|
-
const int64_t r2 = ne12/ne02;
|
10772
|
-
const int64_t r3 = ne13/ne03;
|
10773
|
-
|
10774
11403
|
// block-tiling attempt
|
10775
11404
|
const int64_t blck_0 = 16;
|
10776
11405
|
const int64_t blck_1 = 16;
|
@@ -11205,7 +11834,7 @@ static void ggml_compute_forward_get_rows_q(
|
|
11205
11834
|
|
11206
11835
|
assert( dst->ne[0] == nc);
|
11207
11836
|
assert( dst->ne[1] == nr);
|
11208
|
-
assert(src0->nb[0] ==
|
11837
|
+
assert(src0->nb[0] == ggml_type_size(type));
|
11209
11838
|
|
11210
11839
|
for (int i = 0; i < nr; ++i) {
|
11211
11840
|
const int r = ((int32_t *) src1->data)[i];
|
@@ -11506,8 +12135,8 @@ static void ggml_compute_forward_diag_mask_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int  n_past  = ((int32_t *) dst->op_params)[0];
-    const bool inplace = (bool)((int32_t *) dst->op_params)[1];
+    const int  n_past  = ((int32_t *) dst->op_params)[0];
+    const bool inplace = src0->data == dst->data;
 
     GGML_ASSERT(n_past >= 0);
 
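The inplace flag is no longer serialized in op_params; it is recovered structurally, which works because the inplace variant builds dst as a view of src0 (ggml_view_tensor) and therefore shares its data pointer. A minimal sketch of the convention (the helper name is hypothetical):

    #include <stdbool.h>

    // If dst was created as a view of src0, both point at the same buffer.
    static bool is_inplace_op(const void * src0_data, const void * dst_data) {
        return src0_data == dst_data;
    }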
@@ -11718,6 +12347,7 @@ static void ggml_compute_forward_soft_max_back_f32(
     // dx = J * dy
     // dxk = sum_i(Jki * dyi)
     // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
+    // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
     // dxk = sum_i(-yk*yi * dyi) + yk*dyk
     // dxk = -yk * sum_i(yi * dyi) + yk*dyk
     // dxk = -yk * dot(y, dy) + yk*dyk
@@ -11926,7 +12556,6 @@ static void ggml_compute_forward_alibi(
     }
 }
 
-
 // ggml_compute_forward_clamp
 
 static void ggml_compute_forward_clamp_f32(
@@ -12015,12 +12644,18 @@ static void ggml_compute_forward_rope_f32(
     float freq_base;
     float freq_scale;
 
+    // these two only relevant for xPos RoPE:
+    float xpos_base;
+    bool  xpos_down;
+
     const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode   = ((int32_t *) dst->op_params)[2];
     const int n_ctx  = ((int32_t *) dst->op_params)[3];
     memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
 
     assert(n_past >= 0);
 
@@ -12092,6 +12727,9 @@ static void ggml_compute_forward_rope_f32(
             for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                 const float cos_theta = cosf(theta);
                 const float sin_theta = sinf(theta);
+                // zeta scaling for xPos only:
+                float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+                if (xpos_down) zeta = 1.0f / zeta;
 
                 theta *= theta_scale;
 
@@ -12101,11 +12739,11 @@ static void ggml_compute_forward_rope_f32(
                 const float x0 = src[0];
                 const float x1 = src[1];
 
-                dst_data[0] = x0*cos_theta - x1*sin_theta;
-                dst_data[1] = x0*sin_theta + x1*cos_theta;
+                dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
+                dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
             }
         } else {
-            // TODO: this
+            // TODO: this might be wrong for ne0 != n_dims - need double check
             // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
             for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                 for (int64_t ic = 0; ic < n_dims; ic += 2) {
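The xPos scale in isolation: zeta depends on the lane index i0, the head size ne0, the absolute position n_past + i2 and xpos_base, and the key side uses the reciprocal (xpos_down). A standalone sketch with the same formula as the kernel above; the concrete head size, position, and base are illustrative:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float xpos_base = 512.0f; // hypothetical base
        const int   ne0 = 128;          // head dimension
        const int   p   = 40;           // absolute token position (n_past + i2)
        for (int i0 = 0; i0 < ne0; i0 += 64) {
            const float zeta = powf((i0 + 0.4f*ne0)/(1.4f*ne0), p/xpos_base);
            printf("i0=%3d zeta=%f (key side: %f)\n", i0, zeta, 1.0f/zeta);
        }
        return 0;
    }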
@@ -12234,7 +12872,7 @@ static void ggml_compute_forward_rope_f16(
                 dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
             }
         } else {
-            // TODO: this
+            // TODO: this might be wrong for ne0 != n_dims - need double check
             // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
             for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                 for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12296,9 +12934,21 @@ static void ggml_compute_forward_rope_back_f32(
     // dx = rope_back(dy, src1)
     // src0 is dy, src1 contains options
 
+    float freq_base;
+    float freq_scale;
+
+    // these two only relevant for xPos RoPE:
+    float xpos_base;
+    bool  xpos_down;
+
     const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode   = ((int32_t *) dst->op_params)[2];
+    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
+    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
 
     assert(n_past >= 0);
 
@@ -12324,7 +12974,7 @@ static void ggml_compute_forward_rope_back_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
 
@@ -12335,12 +12985,15 @@ static void ggml_compute_forward_rope_back_f32(
         if (ir++ < ir0) continue;
         if (ir   > ir1) break;
 
-        float theta = (float)p;
+        float theta = freq_scale * (float)p;
 
         if (!is_neox) {
             for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                 const float cos_theta = cosf(theta);
                 const float sin_theta = sinf(theta);
+                // zeta scaling for xPos only:
+                float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+                if (xpos_down) zeta = 1.0f / zeta;
 
                 theta *= theta_scale;
 
@@ -12350,8 +13003,8 @@ static void ggml_compute_forward_rope_back_f32(
                 const float dy0 = dy[0];
                 const float dy1 = dy[1];
 
-                dx[0] =   dy0*cos_theta + dy1*sin_theta;
-                dx[1] = - dy0*sin_theta + dy1*cos_theta;
+                dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
+                dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
             }
         } else {
             for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
@@ -13044,6 +13697,106 @@ static void ggml_compute_forward_conv_2d(
     }
 }
 
+// ggml_compute_forward_conv_transpose_2d
+
+static void ggml_compute_forward_conv_transpose_2d(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02*ne03;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        memset(params->wdata, 0, params->wsize);
+
+        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+                    for (int64_t i01 = 0; i01 < ne01; i01++) {
+                        for (int64_t i00 = 0; i00 < ne00; i00++) {
+                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
+                        }
+                    }
+                }
+            }
+        }
+
+        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            for (int i12 = 0; i12 < ne12; i12++) {
+                for (int i11 = 0; i11 < ne11; i11++) {
+                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
+                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+                    for (int i10 = 0; i10 < ne10; i10++) {
+                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
+                    }
+                }
+            }
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int32_t stride = ggml_get_op_params_i32(dst, 0);
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+    ggml_fp16_t * const wdata_src = wdata + nk;
+
+    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+        for (int i11 = 0; i11 < ne11; i11++) {
+            for (int i10 = 0; i10 < ne10; i10++) {
+                const int i1n = i11*ne10*ne12 + i10*ne12;
+                for (int i01 = 0; i01 < ne01; i01++) {
+                    for (int i00 = 0; i00 < ne00; i00++) {
+                        float v = 0;
+                        ggml_vec_dot_f16(ne03, &v,
+                                wdata_src + i1n,
+                                wdata_kernel + i01*ne00*ne03 + i00*ne03);
+                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
+                    }
+                }
+            }
+        }
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
|
|
13202
13955
|
ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
|
13203
13956
|
}
|
13204
13957
|
|
13958
|
+
// ggml_compute_forward_upscale
|
13959
|
+
|
13960
|
+
static void ggml_compute_forward_upscale_f32(
|
13961
|
+
const struct ggml_compute_params * params,
|
13962
|
+
const struct ggml_tensor * src0,
|
13963
|
+
struct ggml_tensor * dst) {
|
13964
|
+
|
13965
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
13966
|
+
return;
|
13967
|
+
}
|
13968
|
+
|
13969
|
+
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
13970
|
+
|
13971
|
+
const int ith = params->ith;
|
13972
|
+
|
13973
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
13974
|
+
|
13975
|
+
const int scale_factor = dst->op_params[0];
|
13976
|
+
|
13977
|
+
// TODO: optimize
|
13978
|
+
|
13979
|
+
for (int i03 = 0; i03 < ne03; i03++) {
|
13980
|
+
for (int i02 = ith; i02 < ne02; i02++) {
|
13981
|
+
for (int m = 0; m < dst->ne[1]; m++) {
|
13982
|
+
int i01 = m / scale_factor;
|
13983
|
+
for (int n = 0; n < dst->ne[0]; n++) {
|
13984
|
+
int i00 = n / scale_factor;
|
13985
|
+
|
13986
|
+
const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
|
13987
|
+
|
13988
|
+
float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
|
13989
|
+
|
13990
|
+
*y = *x;
|
13991
|
+
}
|
13992
|
+
}
|
13993
|
+
}
|
13994
|
+
}
|
13995
|
+
}
|
13996
|
+
|
13997
|
+
static void ggml_compute_forward_upscale(
|
13998
|
+
const struct ggml_compute_params * params,
|
13999
|
+
const struct ggml_tensor * src0,
|
14000
|
+
struct ggml_tensor * dst) {
|
14001
|
+
switch (src0->type) {
|
14002
|
+
case GGML_TYPE_F32:
|
14003
|
+
{
|
14004
|
+
ggml_compute_forward_upscale_f32(params, src0, dst);
|
14005
|
+
} break;
|
14006
|
+
default:
|
14007
|
+
{
|
14008
|
+
GGML_ASSERT(false);
|
14009
|
+
} break;
|
14010
|
+
}
|
14011
|
+
}
|
13205
14012
|
|
13206
14013
|
// ggml_compute_forward_flash_attn
|
13207
14014
|
|
@@ -13331,7 +14138,7 @@ static void ggml_compute_forward_flash_attn_f32(
         vvexpf(S, S, &Mup);
         ggml_vec_sum_f32(Mup, &sum, S);
 #else
-        uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+        uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
         ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
         for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13341,9 +14148,13 @@ static void ggml_compute_forward_flash_attn_f32(
                 if (SS[j] == -INFINITY) {
                     SS[j] = 0.0f;
                 } else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+                    const float val = expf(SS[j] - max);
+#else
                     ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                     memcpy(&scvt[j], &s, sizeof(uint16_t));
                     const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
                     sump[j] += (ggml_float)val;
                     SS[j] = val;
                 }
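The softmax exponentiation now defaults to a plain expf call, with the old FP16 table lookup kept behind GGML_FLASH_ATTN_EXP_FP16 (a toggle that, per the top of this file, ships commented out). A minimal standalone sketch of the guard's shape; the table lookup is internal to ggml.c, so it is stood in for by expf here:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float x = -0.5f;
    #ifndef GGML_FLASH_ATTN_EXP_FP16
        const float val = expf(x); // default: full-precision exponent
    #else
        const float val = expf(x); // stand-in for the FP16 table lookup path
    #endif
        printf("%f\n", val);
        return 0;
    }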
@@ -13921,7 +14732,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
         vvexpf(SM, SM, &Mup);
         ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-        uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+        uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
         ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
         for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13932,9 +14743,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
                 if (SR[j] == -INFINITY) {
                     SW[j] = 0.0f;
                 } else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+                    const float val = expf(SR[j] - max);
+#else
                     ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
                     memcpy(&scvt[j], &s, sizeof(uint16_t));
                     const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
                     sump[j] += (ggml_float)val;
                     SW[j] = val;
                 }
@@ -14327,38 +15142,169 @@ static void ggml_compute_forward_unary(
     }
 }
 
-// ggml_compute_forward_map_unary
+// ggml_compute_forward_get_rel_pos
 
-static void ggml_compute_forward_map_unary_f32(
+static void ggml_compute_forward_get_rel_pos_f16(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
+        struct ggml_tensor * dst) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
 
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    const int64_t w = ne1;
+
+    ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
+    ggml_fp16_t * dst_data  = (ggml_fp16_t *) dst->data;
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            const int64_t pos = (w - i1 - 1) + i2;
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
+            }
+        }
     }
 }
 
-
-static void ggml_compute_forward_map_unary(
+static void ggml_compute_forward_get_rel_pos(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_add_rel_pos
+
+static void ggml_compute_forward_add_rel_pos_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * src2,
+        struct ggml_tensor * dst) {
+
+    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
+    if (!inplace && params->type == GGML_TASK_INIT) {
+        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
+        return;
+    }
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
+
+    float * src1_data = (float *) src1->data;
+    float * src2_data = (float *) src2->data;
+    float * dst_data  = (float *) dst->data;
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    // total patches in dst
+    const int np = ne13;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+
+    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
+        for (int64_t i12 = 0; i12 < ne12; ++i12) {
+            for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
+                for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                    const int64_t jp0 = jp1 + i10;
+                    const float src1_e = src1_data[jp0];
+                    const float src2_e = src2_data[jp0];
+
+                    const int64_t jdh = jp0 * ne10;
+                    const int64_t jdw = jdh - (ne10 - 1) * i10;
+
+                    for (int64_t j = 0; j < ne10; ++j) {
+                        dst_data[jdh + j     ] += src2_e;
+                        dst_data[jdw + j*ne10] += src1_e;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_add_rel_pos(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        const struct ggml_tensor * src2,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_unary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
@@ -14541,6 +15487,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);
 
+    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
+
     if (params->type == GGML_TASK_INIT) {
         if (ith == 0) {
             memset(sums, 0, sizeof(float) * (nth + nth * nc));
@@ -14552,7 +15500,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         if (ith == 0) {
             float * dp = (float *) dst->data;
             ggml_vec_sum_f32(nth, dp, sums);
-            dp[0] *= -1.0f;
+            dp[0] *= -1.0f / (float) nr;
         }
         return;
     }
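The final reduction changes from a sum to a mean: dividing by nr makes the reported loss independent of how many rows the batch contains. A standalone illustration of that normalization (the per-row values are made up; the sign handling of the real kernel is omitted):

    #include <stdio.h>

    int main(void) {
        const float row_loss[4] = { 0.9f, 1.1f, 1.0f, 1.0f };
        const int nr = 4;
        float sum = 0.0f;
        for (int i = 0; i < nr; i++) sum += row_loss[i];
        printf("old (sum) : %f\n", sum);      // grows with batch size
        printf("new (mean): %f\n", sum / nr); // comparable across batch sizes
        return 0;
    }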
@@ -14569,7 +15517,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float * st = (float *) params->wdata + nth + ith*nc;
+        float * st = ((float *) params->wdata) + nth + ith*nc;
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
@@ -14584,15 +15532,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         float max = -INFINITY;
         ggml_vec_max_f32(nc, &max, s0);
 
-        uint16_t scvt;
+        uint16_t scvt; UNUSED(scvt);
         for (int i = 0; i < nc; i++) {
             if (s0[i] == -INFINITY) {
                 st[i] = 0.0f;
             } else {
-
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
+                const float s = s0[i] - max;
+                const float val = expf(s);
+#else
                 ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+#endif
                 sum += (ggml_float)val;
                 st[i] = val;
             }
@@ -14608,7 +15560,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         ggml_vec_log_f32(nc, st, st);
         ggml_vec_mul_f32(nc, st, st, s1);
 
-        ggml_vec_sum_f32(nc, sums + ith, st);
+        float st_sum = 0;
+        ggml_vec_sum_f32(nc, &st_sum, st);
+        sums[ith] += st_sum;
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
@@ -14658,7 +15612,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
         return;
     }
 
-    const float eps = 1e-9f;
+    const double eps = 1e-9;
     // TODO: handle transposed/permuted matrices
     const int64_t nc = src0->ne[0];
@@ -14677,7 +15631,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
         float * ds0 = (float *)((char *) dst->data  + i1*dst->nb[1]);
         float * s0  = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * s1  = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float * sm  = (float *) params->wdata + ith*nc;
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
@@ -14686,54 +15639,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
             assert(!isnan(s1[i]));
         }
 #endif
-        // step by step explanation:
-        {
-            //float * sums = (float *) params->wdata;
-
-            // forward pass with annotated gradients from backward pass
-            // (built by going in reverse operation order, adding to gradients of current operation args)
-            // st0 = exp(s0-max(s0))                      grad[st0] = grad[st1]*(1.0 - eps)/sum
-            // from softmax_back:                         grad[s0]  = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
-            // ggml_vec_scale_f32(nc, st, sum);           // st1 = st0*/sum = softmax(s0)  grad[st1] = grad[st2]*(1.0 - eps)
-            // ggml_vec_scale_f32(nc, st, (1.0f - eps));  // st2 = st1*(1.0 - eps)         grad[st2] = grad[st3]
-            // ggml_vec_add1_f32(nc, st, st, eps);        // st3 = st2 + eps               grad[st3] = grad[st4]/st3
-            // ggml_vec_log_f32(nc, st, st);              // st4 = log(st3)                grad[st4] = grad[st5] * s1
-            // ggml_vec_mul_f32(nc, st, st, s1);          // st5 = st4 * s1                grad[st5] = grad[sums[ith]]
-            // ggml_vec_sum_f32(nc, sums + ith, st);      // sums[ith] = st5               grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
-
-            // substitute into grad[st1], because we can reuse softmax_back from this point on
-            // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
-            // postorder:
-            // grad[st1] := softmax(s0)
-            // grad[st1] := grad[st1]*(1.0 - eps)
-            // grad[st1] := grad[st1] + eps
-            // grad[st1] := s1 / grad[st1]
-            // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
-
-            // src0 gradients by going through softmax_back
-            // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
-            // from softmax_back:
-            // dxk = yk * (dyk - dot(y, dy))
-            // dot_y_dy := dot(y, dy)
-            // dx := dy
-            // dx := dx - dot_y_dy
-            // dx := dx * y
-            // postorder:
-            // dot_st1_dst1 := dot(st1, grad[st1])
-            // grad[s0] := grad[st1]
-            // grad[s0] := grad[s0] - dot_st1_dst1
-            // grad[s0] := grad[s0] * st1
-
-            // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
-            // sm           := softmax(s0)
-            // grad[s0]     := sm*(1.0 - eps)
-            // grad[s0]     := grad[s0] + eps
-            // grad[s0]     := s1 / grad[s0]
-            // grad[s0]     := grad[s0]*(1.0-eps)*-grad[cel]
-            // dot_st1_dst1 := dot(sm, grad[s0])
-            // grad[s0]     := grad[s0] - dot_st1_dst1
-            // grad[s0]     := grad[s0] * sm
-        }
 
         // soft_max
         ggml_float sum = 0.0;
@@ -14741,39 +15646,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
             float max = -INFINITY;
             ggml_vec_max_f32(nc, &max, s0);
 
-            uint16_t scvt;
+            uint16_t scvt; UNUSED(scvt);
             for (int i = 0; i < nc; i++) {
                 if (s0[i] == -INFINITY) {
-                    sm[i] = 0.0f;
+                    ds0[i] = 0.0f;
                 } else {
-
+#ifndef GGML_CROSS_ENTROPY_EXP_FP16
+                    const float s = s0[i] - max;
+                    const float val = expf(s);
+#else
                     ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
                     memcpy(&scvt, &s, sizeof(scvt));
                     const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
+#endif
                     sum += (ggml_float)val;
-                    sm[i] = val;
+                    ds0[i] = val;
                 }
             }
 
             assert(sum > 0.0);
-            sum = 1.0/sum;
+            sum = (1.0 - eps)/sum;
         }
 
-
-        ggml_vec_scale_f32(nc, sm, sum);
-        ggml_vec_cpy_f32  (nc, ds0, sm);
-        ggml_vec_scale_f32(nc, ds0, (1.0f - eps));
-        ggml_vec_add1_f32 (nc, ds0, ds0, eps);
-        ggml_vec_div_f32  (nc, ds0, s1, ds0);
-        ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
-        ggml_vec_dot_f32  (nc, &dot_st1_dst1, sm, ds0);
-        ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
-        ggml_vec_mul_f32  (nc, ds0, ds0, sm);
+        // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
+        ggml_vec_scale_f32(nc, ds0, sum);
+        ggml_vec_add1_f32(nc, ds0, ds0, eps);
+        ggml_vec_sub_f32(nc, ds0, ds0, s1);
+        ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
+
 
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
-            assert(!isnan(sm[i]));
-            assert(!isinf(sm[i]));
             assert(!isnan(ds0[i]));
             assert(!isinf(ds0[i]));
         }
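The rewritten backward pass drops the staged softmax_back replay (and its `sm` scratch buffer) in favor of the closed-form identity stated in the new comment: for targets that sum to 1 per row, the gradient of the mean cross-entropy with respect to the logits is (softmax(s0) - s1) * d / nr. A standalone sketch of that identity, ignoring the small eps smoothing the real code folds in (`grad_out` plays the role of `d[0]`; the helper name is hypothetical):

    #include <math.h>

    // d/ds0 [ -(1/nr) * sum_j s1_j * log(softmax(s0)_j) ] = (softmax(s0) - s1) / nr
    // assumes each row of s1 sums to 1 (e.g. one-hot targets)
    static void cross_entropy_grad_row(int nc, int nr, float grad_out,
                                       const float * s0, const float * s1, float * ds0) {
        float max = -INFINITY, sum = 0.0f;
        for (int i = 0; i < nc; i++) if (s0[i] > max) max = s0[i];
        for (int i = 0; i < nc; i++) { ds0[i] = expf(s0[i] - max); sum += ds0[i]; }
        for (int i = 0; i < nc; i++) {
            ds0[i] = (ds0[i]/sum - s1[i]) * grad_out / (float) nr;
        }
    }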
@@ -14879,6 +15782,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_CONCAT:
+            {
+                ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
+            } break;
         case GGML_OP_SILU_BACK:
             {
                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14895,6 +15802,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_MUL_MAT:
             {
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
@@ -14987,6 +15898,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
+            } break;
         case GGML_OP_POOL_1D:
             {
                 ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
@@ -14995,6 +15910,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                ggml_compute_forward_upscale(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15025,6 +15944,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_unary(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_GET_REL_POS:
+            {
+                ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
+            } break;
+        case GGML_OP_ADD_REL_POS:
+            {
+                ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+            } break;
         case GGML_OP_MAP_UNARY:
            {
                 ggml_unary_op_f32_t fun;
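These dispatch hunks wire the ops new in this release (CONCAT, GROUP_NORM, CONV_TRANSPOSE_2D, UPSCALE, GET_REL_POS, ADD_REL_POS) into the forward executor. A minimal end-to-end sketch for one of them, assuming the `ggml_concat(ctx, a, b)` prototype this version's ggml.h exposes for GGML_OP_CONCAT (concatenation along the third dimension):

    #include <stdlib.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4, 4, 2);
        struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4, 4, 3);
        struct ggml_tensor * c = ggml_concat(ctx, a, b); // 4 x 4 x 5

        struct ggml_cgraph gf   = ggml_build_forward(c);
        struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads =*/ 1);
        plan.work_data = NULL;
        if (plan.work_size > 0) {
            plan.work_data = malloc(plan.work_size); // required when any op needs scratch
        }
        ggml_graph_compute(&gf, &plan);

        free(plan.work_data);
        ggml_free(ctx);
        return 0;
    }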
@@ -15288,6 +16215,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             inplace);
                 }
             } break;
+        case GGML_OP_CONCAT:
+            {
+                GGML_ASSERT(false); // TODO: implement
+            } break;
         case GGML_OP_SILU_BACK:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15300,9 +16231,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
+                    float eps;
+                    memcpy(&eps, tensor->op_params, sizeof(float));
+
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-                            ggml_rms_norm_back(ctx, src0, tensor->grad),
+                            ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
                             inplace);
                 }
             } break;
@@ -15310,6 +16244,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_MUL_MAT:
             {
                 // https://cs231n.github.io/optimization-2/#staged
@@ -15584,6 +16522,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_dims = ((int32_t *) tensor->op_params)[1];
                 const int mode   = ((int32_t *) tensor->op_params)[2];
                 const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                float freq_base;
+                float freq_scale;
+                float xpos_base;
+                bool  xpos_down;
+                memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+
                 src0->grad = ggml_add_impl(ctx,
                         src0->grad,
                         ggml_rope_back(ctx,
@@ -15591,7 +16538,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             n_past,
                             n_dims,
                             mode,
-                            n_ctx),
+                            n_ctx,
+                            freq_base,
+                            freq_scale,
+                            xpos_base,
+                            xpos_down),
                         inplace);
                 }
             } break;
@@ -15602,14 +16553,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_dims = ((int32_t *) tensor->op_params)[1];
                 const int mode   = ((int32_t *) tensor->op_params)[2];
                 const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                float freq_base;
+                float freq_scale;
+                float xpos_base;
+                bool  xpos_down;
+                memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+
                 src0->grad = ggml_add_impl(ctx,
                         src0->grad,
-                        ggml_rope(ctx,
+                        ggml_rope_impl(ctx,
                             tensor->grad,
                             n_past,
                             n_dims,
                             mode,
-                            n_ctx),
+                            n_ctx,
+                            freq_base,
+                            freq_scale,
+                            xpos_base,
+                            xpos_down,
+                            false),
                         inplace);
                 }
             } break;
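Both rope backward paths now recover `freq_base`, `freq_scale`, `xpos_base`, and `xpos_down` from int32 slots 4..7 of the tensor's `op_params`, so the forward op must pack them the same way. An illustrative writer mirroring the layout the reads above assume (the helper itself is hypothetical):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    // op_params is treated as an int32 array: slots 0..3 hold the integer
    // rope arguments, slots 4..7 hold raw float/bool payloads copied bitwise
    static void pack_rope_op_params(int32_t op_params[8],
                                    int n_past, int n_dims, int mode, int n_ctx,
                                    float freq_base, float freq_scale,
                                    float xpos_base, bool xpos_down) {
        op_params[0] = n_past;
        op_params[1] = n_dims;
        op_params[2] = mode;
        op_params[3] = n_ctx;
        memcpy(op_params + 4, &freq_base,  sizeof(float));
        memcpy(op_params + 5, &freq_scale, sizeof(float));
        memcpy(op_params + 6, &xpos_base,  sizeof(float));
        memcpy(op_params + 7, &xpos_down,  sizeof(bool));
    }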
@@ -15629,6 +16594,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_POOL_1D:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15637,6 +16606,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15878,6 +16851,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 GGML_ASSERT(false);
             }
         } break;
+        case GGML_OP_GET_REL_POS:
+        case GGML_OP_ADD_REL_POS:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1_F32:
@@ -16029,9 +17004,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
     return result;
 }
 
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
-    struct ggml_cgraph result = *gf;
-
+void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
     GGML_ASSERT(gf->n_nodes > 0);
 
     // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16055,15 +17028,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_cg
         }
     }
 
-    for (int i = gf->n_nodes - 1; i >= 0; i--) {
+    for (int i = 0; i < gf->n_nodes; i++) {
         struct ggml_tensor * node = gf->nodes[i];
 
         if (node->is_param) {
             GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            ggml_build_forward_expand(&result, node->grad);
+            ggml_build_forward_expand(gb, node->grad);
         }
     }
+}
 
+struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
+    struct ggml_cgraph result = *gf;
+    ggml_build_backward_expand(ctx, gf, &result, keep);
     return result;
 }
 
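`ggml_build_backward` is now a thin wrapper over the new `ggml_build_backward_expand`, which appends the gradient nodes into a caller-supplied cgraph instead of returning one by value. A minimal usage sketch under that assumption:

    #include "ggml.h"

    // forward graph for f, backward graph built in place via the new API
    void build_graphs_example(struct ggml_context * ctx, struct ggml_tensor * f,
                              struct ggml_cgraph * gf, struct ggml_cgraph * gb) {
        *gf = ggml_build_forward(f);
        *gb = *gf; // the backward graph starts as a copy of the forward graph
        ggml_build_backward_expand(ctx, gf, gb, /*keep =*/ true); // appends grad nodes to gb
    }

Letting the caller own both cgraphs avoids copying the large fixed-size graph struct twice and lets callers place it in their own memory, which is what the optimizer code below relies on.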
@@ -16382,7 +17359,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
                 size_t cur = 0;
                 if (ggml_is_quantized(node->type)) {
-                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
@@ -16395,7 +17372,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 size_t cur = 0;
 
                 if (ggml_is_quantized(node->src[0]->type)) {
-                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks;
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
@@ -16407,7 +17384,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 size_t cur = 0;
 
                 if (ggml_is_quantized(node->src[0]->type)) {
-                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks;
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
@@ -16454,9 +17431,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_NORM:
             case GGML_OP_RMS_NORM:
             case GGML_OP_RMS_NORM_BACK:
+            case GGML_OP_GROUP_NORM:
                 {
                     n_tasks = n_threads;
                 } break;
+            case GGML_OP_CONCAT:
             case GGML_OP_MUL_MAT:
             case GGML_OP_OUT_PROD:
                 {
@@ -16490,12 +17469,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     // the threads are still spinning
                     if (node->src[0]->type != GGML_TYPE_F32) {
                         // here we need memory just for single 2D matrix from src0
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                        cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
                     }
                     } else
 #endif
                     if (node->src[1]->type != vec_dot_type) {
-                        cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type];
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                     } else {
                         cur = 0;
                     }
@@ -16524,6 +17503,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_SOFT_MAX_BACK:
             case GGML_OP_ROPE:
             case GGML_OP_ROPE_BACK:
+            case GGML_OP_ADD_REL_POS:
                 {
                     n_tasks = n_threads;
                 } break;
@@ -16598,6 +17578,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         GGML_ASSERT(false);
                     }
 
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_CONV_TRANSPOSE_2D:
+                {
+                    n_tasks = n_threads;
+
+                    const int64_t ne00 = node->src[0]->ne[0]; // W
+                    const int64_t ne01 = node->src[0]->ne[1]; // H
+                    const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+                    const int64_t ne03 = node->src[0]->ne[3]; // Channels In
+
+                    const int64_t ne10 = node->src[1]->ne[0]; // W
+                    const int64_t ne11 = node->src[1]->ne[1]; // H
+                    const int64_t ne12 = node->src[1]->ne[2]; // Channels In
+
+                    size_t cur = 0;
+                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
+                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_POOL_1D:
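The new CONV_TRANSPOSE_2D branch reserves scratch for one fp16 copy of the kernel plus one fp16 copy of the input. As a worked example, a 3x3 kernel with 64 output and 32 input channels and a 128x128x32 input needs cur = 2*3*3*64*32 + 2*128*128*32 = 36,864 + 1,048,576 = 1,085,440 bytes, roughly 1.04 MiB of work-buffer space for that node.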
@@ -16605,6 +17604,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 {
                     n_tasks = 1;
                 } break;
+            case GGML_OP_UPSCALE:
+                {
+                    n_tasks = n_threads;
+                } break;
             case GGML_OP_FLASH_ATTN:
                 {
                     n_tasks = n_threads;
@@ -16666,6 +17669,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
             case GGML_OP_WIN_PART:
             case GGML_OP_WIN_UNPART:
+            case GGML_OP_GET_REL_POS:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
             case GGML_OP_MAP_CUSTOM1_F32:
@@ -16712,10 +17716,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
                 {
                     n_tasks = n_threads;
-
-                    size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
-
-                    work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_NONE:
                 {
@@ -16783,8 +17783,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 
             const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             GGML_ASSERT(rc == 0);
+            UNUSED(rc);
         }
     }
+
     workers[0].ith = 0;
     workers[0].shared = &state_shared;
 
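`ggml_graph_compute` now takes a `ggml_cplan` whose work buffer the caller provides. The optimizer code later in this diff allocates the buffer from the context via `ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, ...)`; a simpler heap-based sketch of the same two-step flow:

    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"

    // plan first, hand the plan a work buffer, then compute
    void compute_graph_example(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

        void * work = NULL;
        if (cplan.work_size > 0) {
            work = malloc(cplan.work_size);       // malloc stands in for the
            cplan.work_data = (uint8_t *) work;   // GGML_OBJECT_WORK_BUFFER route
        }

        ggml_graph_compute(gf, &cplan);

        free(work);
    }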
@@ -16900,7 +17902,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     // compute size of intermediate results
     // TODO: does not take into account scratch buffers !!!!
     for (int i = 0; i < cgraph->n_nodes; ++i) {
-        size_eval += ggml_nbytes(cgraph->nodes[i]);
+        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
     }
 
     // print
@@ -17591,14 +18593,16 @@ static enum ggml_opt_result ggml_opt_adam(
         struct ggml_opt_params params,
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb) {
+        struct ggml_cgraph * gb,
+        ggml_opt_callback callback,
+        void * callback_data) {
     GGML_ASSERT(ggml_is_scalar(f));
 
     // these will store the parameters we want to optimize
     struct ggml_tensor * ps[GGML_MAX_PARAMS];
 
     int np = 0;
-    int nx = 0;
+    int64_t nx = 0;
     for (int i = 0; i < gf->n_nodes; ++i) {
         if (gf->nodes[i]->is_param) {
             GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -17617,31 +18621,32 @@ static enum ggml_opt_result ggml_opt_adam(
     }
 
     // constants
-    const float sched = params.adam.sched;
-    const float decay = params.adam.decay * sched;
-    const float alpha = params.adam.alpha * sched;
+    float sched = params.adam.sched;
+    const float alpha = params.adam.alpha;
+    const float decay = params.adam.decay * alpha;
     const float beta1 = params.adam.beta1;
     const float beta2 = params.adam.beta2;
     const float eps   = params.adam.eps;
+    const float gclip = params.adam.gclip;
+    const int decay_min_ndim = params.adam.decay_min_ndim;
 
-    float * x  = opt->adam.x->data;  // view of the parameters
-    float * g1 = opt->adam.g1->data; // gradient
-    float * g2 = opt->adam.g2->data; // gradient squared
     float * m  = opt->adam.m->data;  // first moment
     float * v  = opt->adam.v->data;  // second moment
-    float * mh = opt->adam.mh->data; // first moment hat
-    float * vh = opt->adam.vh->data; // second moment hat
 
     float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
 
-
-
+    if (callback) {
+        callback(callback_data, &sched);
+    }
 
     // compute the function value
     ggml_graph_reset  (gf);
     ggml_set_f32      (f->grad, 1.0f);
 
-
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+    ggml_graph_compute(gb, &cplan);
 
     opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
     opt->adam.fx_best = opt->adam.fx_prev;
@@ -17649,6 +18654,9 @@ static enum ggml_opt_result ggml_opt_adam(
         pf[opt->iter % params.past] = opt->adam.fx_prev;
     }
 
+    opt->loss_before = opt->adam.fx_prev;
+    opt->loss_after  = opt->adam.fx_prev;
+
     // initialize
     if (opt->just_initialized) {
         opt->adam.n_no_improvement = 0;
|
|
17681
18689
|
UNUSED(t_start_cpu);
|
17682
18690
|
|
17683
18691
|
{
|
17684
|
-
|
17685
|
-
|
17686
|
-
|
17687
|
-
|
17688
|
-
|
17689
|
-
|
17690
|
-
|
17691
|
-
|
17692
|
-
|
17693
|
-
|
17694
|
-
|
17695
|
-
|
17696
|
-
|
17697
|
-
|
17698
|
-
|
17699
|
-
|
17700
|
-
|
17701
|
-
|
17702
|
-
|
17703
|
-
|
17704
|
-
|
17705
|
-
|
17706
|
-
|
17707
|
-
|
17708
|
-
|
17709
|
-
|
17710
|
-
|
17711
|
-
|
17712
|
-
|
17713
|
-
|
17714
|
-
|
17715
|
-
|
17716
|
-
|
18692
|
+
float gnorm = 1.0f;
|
18693
|
+
if (gclip > 0.0f) {
|
18694
|
+
// gradient clipping
|
18695
|
+
ggml_float sum = 0.0;
|
18696
|
+
for (int p = 0; p < np; ++p) {
|
18697
|
+
const int64_t ne = ggml_nelements(ps[p]);
|
18698
|
+
for (int64_t j = 0; j < ne; ++j) {
|
18699
|
+
float g = ggml_get_f32_1d(ps[p]->grad, j);
|
18700
|
+
sum += (ggml_float)(g*g);
|
18701
|
+
}
|
18702
|
+
}
|
18703
|
+
ggml_float norm = sqrt(sum);
|
18704
|
+
if (norm > (ggml_float) gclip) {
|
18705
|
+
gnorm = (float) ((ggml_float) gclip / norm);
|
18706
|
+
}
|
18707
|
+
}
|
18708
|
+
const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
|
18709
|
+
const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter));
|
18710
|
+
int64_t i = 0;
|
18711
|
+
for (int p = 0; p < np; ++p) {
|
18712
|
+
const int64_t ne = ggml_nelements(ps[p]);
|
18713
|
+
const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
|
18714
|
+
for (int64_t j = 0; j < ne; ++j) {
|
18715
|
+
float x = ggml_get_f32_1d(ps[p], j);
|
18716
|
+
float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
|
18717
|
+
m[i] = m[i]*beta1 + g*(1.0f - beta1);
|
18718
|
+
v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
|
18719
|
+
float mh = m[i]*beta1h;
|
18720
|
+
float vh = v[i]*beta2h;
|
18721
|
+
vh = sqrtf(vh) + eps;
|
18722
|
+
x = x*(1.0f - p_decay) - mh/vh;
|
18723
|
+
ggml_set_f32_1d(ps[p], j, x);
|
18724
|
+
++i;
|
18725
|
+
}
|
18726
|
+
}
|
18727
|
+
}
|
17717
18728
|
|
17718
|
-
|
17719
|
-
|
18729
|
+
if (callback) {
|
18730
|
+
callback(callback_data, &sched);
|
17720
18731
|
}
|
17721
18732
|
|
17722
18733
|
ggml_graph_reset (gf);
|
17723
18734
|
ggml_set_f32 (f->grad, 1.0f);
|
17724
18735
|
|
17725
|
-
|
18736
|
+
ggml_graph_compute(gb, &cplan);
|
17726
18737
|
|
17727
18738
|
const float fx = ggml_get_f32_1d(f, 0);
|
18739
|
+
opt->loss_after = fx;
|
18740
|
+
|
17728
18741
|
|
17729
18742
|
// check convergence
|
17730
18743
|
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
|
@@ -17793,7 +18806,6 @@ struct ggml_lbfgs_iteration_data {
|
|
17793
18806
|
};
|
17794
18807
|
|
17795
18808
|
static enum ggml_opt_result linesearch_backtracking(
|
17796
|
-
struct ggml_context * ctx,
|
17797
18809
|
const struct ggml_opt_params * params,
|
17798
18810
|
int nx,
|
17799
18811
|
float * x,
|
@@ -17805,8 +18817,11 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
17805
18817
|
struct ggml_tensor * f,
|
17806
18818
|
struct ggml_cgraph * gf,
|
17807
18819
|
struct ggml_cgraph * gb,
|
18820
|
+
struct ggml_cplan * cplan,
|
17808
18821
|
const int np,
|
17809
|
-
struct ggml_tensor * ps[]
|
18822
|
+
struct ggml_tensor * ps[],
|
18823
|
+
ggml_opt_callback callback,
|
18824
|
+
void * callback_data) {
|
17810
18825
|
int count = 0;
|
17811
18826
|
|
17812
18827
|
float width = 0.0f;
|
@@ -17835,6 +18850,12 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
17835
18850
|
dgtest = params->lbfgs.ftol*dginit;
|
17836
18851
|
|
17837
18852
|
while (true) {
|
18853
|
+
if (callback) {
|
18854
|
+
// LBFG-S does not support learning rate -> ignore learning schedule
|
18855
|
+
float sched = 0;
|
18856
|
+
callback(callback_data, &sched);
|
18857
|
+
}
|
18858
|
+
|
17838
18859
|
ggml_vec_cpy_f32(nx, x, xp);
|
17839
18860
|
ggml_vec_mad_f32(nx, x, d, *step);
|
17840
18861
|
|
@@ -17845,7 +18866,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
|
17845
18866
|
ggml_graph_reset (gf);
|
17846
18867
|
ggml_set_f32 (f->grad, 1.0f);
|
17847
18868
|
|
17848
|
-
|
18869
|
+
ggml_graph_compute(gb, cplan);
|
17849
18870
|
|
17850
18871
|
ggml_opt_get_grad(np, ps, g);
|
17851
18872
|
|
@@ -17905,7 +18926,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
17905
18926
|
struct ggml_opt_params params,
|
17906
18927
|
struct ggml_tensor * f,
|
17907
18928
|
struct ggml_cgraph * gf,
|
17908
|
-
struct ggml_cgraph * gb
|
18929
|
+
struct ggml_cgraph * gb,
|
18930
|
+
ggml_opt_callback callback,
|
18931
|
+
void * callback_data) {
|
17909
18932
|
if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
|
17910
18933
|
params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
|
17911
18934
|
if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
|
@@ -17937,6 +18960,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
17937
18960
|
opt->iter = iter;
|
17938
18961
|
}
|
17939
18962
|
|
18963
|
+
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
|
18964
|
+
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
|
18965
|
+
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
|
18966
|
+
|
17940
18967
|
float * x = opt->lbfgs.x->data; // current parameters
|
17941
18968
|
float * xp = opt->lbfgs.xp->data; // previous parameters
|
17942
18969
|
float * g = opt->lbfgs.g->data; // current gradient
|
@@ -17958,6 +18985,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
17958
18985
|
float * lm_s = opt->lbfgs.lms->data;
|
17959
18986
|
float * lm_y = opt->lbfgs.lmy->data;
|
17960
18987
|
|
18988
|
+
if (callback) {
|
18989
|
+
// LBFG-S does not support learning rate -> ignore learning schedule
|
18990
|
+
float sched = 0;
|
18991
|
+
callback(callback_data, &sched);
|
18992
|
+
}
|
18993
|
+
|
17961
18994
|
// evaluate the function value and its gradient
|
17962
18995
|
{
|
17963
18996
|
ggml_opt_set_params(np, ps, x);
|
@@ -17965,11 +18998,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
17965
18998
|
ggml_graph_reset (gf);
|
17966
18999
|
ggml_set_f32 (f->grad, 1.0f);
|
17967
19000
|
|
17968
|
-
|
19001
|
+
ggml_graph_compute(gb, &cplan);
|
17969
19002
|
|
17970
19003
|
ggml_opt_get_grad(np, ps, g);
|
17971
19004
|
|
17972
19005
|
fx = ggml_get_f32_1d(f, 0);
|
19006
|
+
|
19007
|
+
opt->loss_before = fx;
|
19008
|
+
opt->loss_after = fx;
|
17973
19009
|
}
|
17974
19010
|
|
17975
19011
|
// search direction = -gradient
|
@@ -18024,7 +19060,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18024
19060
|
ggml_vec_cpy_f32(nx, xp, x);
|
18025
19061
|
ggml_vec_cpy_f32(nx, gp, g);
|
18026
19062
|
|
18027
|
-
ls = linesearch_backtracking(
|
19063
|
+
ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
|
18028
19064
|
|
18029
19065
|
if (ls < 0) {
|
18030
19066
|
// linesearch failed - go back to the previous point and return
|
@@ -18034,6 +19070,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18034
19070
|
return ls;
|
18035
19071
|
}
|
18036
19072
|
|
19073
|
+
opt->loss_after = fx;
|
19074
|
+
|
18037
19075
|
ggml_vec_norm_f32(nx, &xnorm, x);
|
18038
19076
|
ggml_vec_norm_f32(nx, &gnorm, g);
|
18039
19077
|
|
@@ -18091,7 +19129,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|
18091
19129
|
// ys = y^t \cdot s -> 1 / \rho.
|
18092
19130
|
// yy = y^t \cdot y.
|
18093
19131
|
//
|
18094
|
-
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]
|
19132
|
+
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
|
18095
19133
|
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
|
18096
19134
|
|
18097
19135
|
lm_ys[end[0]] = ys;
|
@@ -18154,13 +19192,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
|
|
18154
19192
|
.adam = {
|
18155
19193
|
.n_iter = 10000,
|
18156
19194
|
.sched = 1.000f,
|
18157
|
-
.decay = 0.
|
19195
|
+
.decay = 0.0f,
|
19196
|
+
.decay_min_ndim = 2,
|
18158
19197
|
.alpha = 0.001f,
|
18159
19198
|
.beta1 = 0.9f,
|
18160
19199
|
.beta2 = 0.999f,
|
18161
19200
|
.eps = 1e-8f,
|
18162
19201
|
.eps_f = 1e-5f,
|
18163
19202
|
.eps_g = 1e-3f,
|
19203
|
+
.gclip = 0.0f,
|
18164
19204
|
},
|
18165
19205
|
};
|
18166
19206
|
} break;
|
@@ -18210,23 +19250,13 @@ GGML_API void ggml_opt_init(
|
|
18210
19250
|
switch (opt->params.type) {
|
18211
19251
|
case GGML_OPT_ADAM:
|
18212
19252
|
{
|
18213
|
-
opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
18214
|
-
opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
18215
|
-
opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
18216
19253
|
opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
18217
19254
|
opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
18218
|
-
opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
18219
|
-
opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
|
18220
19255
|
opt->adam.pf = params.past > 0
|
18221
19256
|
? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
|
18222
19257
|
: NULL;
|
18223
|
-
ggml_set_zero(opt->adam.x);
|
18224
|
-
ggml_set_zero(opt->adam.g1);
|
18225
|
-
ggml_set_zero(opt->adam.g2);
|
18226
19258
|
ggml_set_zero(opt->adam.m);
|
18227
19259
|
ggml_set_zero(opt->adam.v);
|
18228
|
-
ggml_set_zero(opt->adam.mh);
|
18229
|
-
ggml_set_zero(opt->adam.vh);
|
18230
19260
|
if (opt->adam.pf) {
|
18231
19261
|
ggml_set_zero(opt->adam.pf);
|
18232
19262
|
}
|
@@ -18301,8 +19331,8 @@ enum ggml_opt_result ggml_opt_resume(
|
|
18301
19331
|
struct ggml_tensor * f) {
|
18302
19332
|
|
18303
19333
|
// build forward + backward compute graphs
|
18304
|
-
struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) /
|
18305
|
-
struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) /
|
19334
|
+
struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
|
19335
|
+
struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
|
18306
19336
|
|
18307
19337
|
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
|
18308
19338
|
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
|
@@ -18310,7 +19340,7 @@ enum ggml_opt_result ggml_opt_resume(
|
|
18310
19340
|
*gf = ggml_build_forward (f);
|
18311
19341
|
*gb = ggml_build_backward(ctx, gf, true);
|
18312
19342
|
|
18313
|
-
return ggml_opt_resume_g(ctx, opt, f, gf, gb);
|
19343
|
+
return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
|
18314
19344
|
}
|
18315
19345
|
|
18316
19346
|
enum ggml_opt_result ggml_opt_resume_g(
|
@@ -18318,7 +19348,9 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
18318
19348
|
struct ggml_opt_context * opt,
|
18319
19349
|
struct ggml_tensor * f,
|
18320
19350
|
struct ggml_cgraph * gf,
|
18321
|
-
struct ggml_cgraph * gb
|
19351
|
+
struct ggml_cgraph * gb,
|
19352
|
+
ggml_opt_callback callback,
|
19353
|
+
void * callback_data) {
|
18322
19354
|
|
18323
19355
|
// build forward + backward compute graphs
|
18324
19356
|
enum ggml_opt_result result = GGML_OPT_OK;
|
@@ -18326,11 +19358,11 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|
18326
19358
|
switch (opt->params.type) {
|
18327
19359
|
case GGML_OPT_ADAM:
|
18328
19360
|
{
|
18329
|
-
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
|
19361
|
+
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
|
18330
19362
|
} break;
|
18331
19363
|
case GGML_OPT_LBFGS:
|
18332
19364
|
{
|
18333
|
-
result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
|
19365
|
+
result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
|
18334
19366
|
} break;
|
18335
19367
|
}
|
18336
19368
|
|
@@ -18561,64 +19593,1164 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
18561
19593
|
|
18562
19594
|
////////////////////////////////////////////////////////////////////////////////
|
18563
19595
|
|
18564
|
-
|
18565
|
-
|
18566
|
-
|
18567
|
-
|
18568
|
-
return 0;
|
18569
|
-
#endif
|
18570
|
-
}
|
19596
|
+
struct gguf_str {
|
19597
|
+
uint64_t n; // GGUFv2
|
19598
|
+
char * data;
|
19599
|
+
};
|
18571
19600
|
|
18572
|
-
|
18573
|
-
|
18574
|
-
|
18575
|
-
|
18576
|
-
|
18577
|
-
|
18578
|
-
|
19601
|
+
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
|
19602
|
+
[GGUF_TYPE_UINT8] = sizeof(uint8_t),
|
19603
|
+
[GGUF_TYPE_INT8] = sizeof(int8_t),
|
19604
|
+
[GGUF_TYPE_UINT16] = sizeof(uint16_t),
|
19605
|
+
[GGUF_TYPE_INT16] = sizeof(int16_t),
|
19606
|
+
[GGUF_TYPE_UINT32] = sizeof(uint32_t),
|
19607
|
+
[GGUF_TYPE_INT32] = sizeof(int32_t),
|
19608
|
+
[GGUF_TYPE_FLOAT32] = sizeof(float),
|
19609
|
+
[GGUF_TYPE_BOOL] = sizeof(bool),
|
19610
|
+
[GGUF_TYPE_STRING] = sizeof(struct gguf_str),
|
19611
|
+
[GGUF_TYPE_UINT64] = sizeof(uint64_t),
|
19612
|
+
[GGUF_TYPE_INT64] = sizeof(int64_t),
|
19613
|
+
[GGUF_TYPE_FLOAT64] = sizeof(double),
|
19614
|
+
[GGUF_TYPE_ARRAY] = 0, // undefined
|
19615
|
+
};
|
19616
|
+
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
|
19617
|
+
|
19618
|
+
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
|
19619
|
+
[GGUF_TYPE_UINT8] = "u8",
|
19620
|
+
[GGUF_TYPE_INT8] = "i8",
|
19621
|
+
[GGUF_TYPE_UINT16] = "u16",
|
19622
|
+
[GGUF_TYPE_INT16] = "i16",
|
19623
|
+
[GGUF_TYPE_UINT32] = "u32",
|
19624
|
+
[GGUF_TYPE_INT32] = "i32",
|
19625
|
+
[GGUF_TYPE_FLOAT32] = "f32",
|
19626
|
+
[GGUF_TYPE_BOOL] = "bool",
|
19627
|
+
[GGUF_TYPE_STRING] = "str",
|
19628
|
+
[GGUF_TYPE_ARRAY] = "arr",
|
19629
|
+
[GGUF_TYPE_UINT64] = "u64",
|
19630
|
+
[GGUF_TYPE_INT64] = "i64",
|
19631
|
+
[GGUF_TYPE_FLOAT64] = "f64",
|
19632
|
+
};
|
19633
|
+
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
|
19634
|
+
|
19635
|
+
union gguf_value {
|
19636
|
+
uint8_t uint8;
|
19637
|
+
int8_t int8;
|
19638
|
+
uint16_t uint16;
|
19639
|
+
int16_t int16;
|
19640
|
+
uint32_t uint32;
|
19641
|
+
int32_t int32;
|
19642
|
+
float float32;
|
19643
|
+
uint64_t uint64;
|
19644
|
+
int64_t int64;
|
19645
|
+
double float64;
|
19646
|
+
bool bool_;
|
19647
|
+
|
19648
|
+
struct gguf_str str;
|
19649
|
+
|
19650
|
+
struct {
|
19651
|
+
enum gguf_type type;
|
19652
|
+
|
19653
|
+
uint64_t n; // GGUFv2
|
19654
|
+
void * data;
|
19655
|
+
} arr;
|
19656
|
+
};
|
18579
19657
|
|
18580
|
-
|
18581
|
-
|
18582
|
-
return 1;
|
18583
|
-
#else
|
18584
|
-
return 0;
|
18585
|
-
#endif
|
18586
|
-
}
|
19658
|
+
struct gguf_kv {
|
19659
|
+
struct gguf_str key;
|
18587
19660
|
|
18588
|
-
|
18589
|
-
|
18590
|
-
|
18591
|
-
#else
|
18592
|
-
return 0;
|
18593
|
-
#endif
|
18594
|
-
}
|
19661
|
+
enum gguf_type type;
|
19662
|
+
union gguf_value value;
|
19663
|
+
};
|
18595
19664
|
|
18596
|
-
|
18597
|
-
|
18598
|
-
|
18599
|
-
|
18600
|
-
|
18601
|
-
|
18602
|
-
}
|
19665
|
+
struct gguf_header {
|
19666
|
+
uint32_t magic;
|
19667
|
+
uint32_t version;
|
19668
|
+
uint64_t n_tensors; // GGUFv2
|
19669
|
+
uint64_t n_kv; // GGUFv2
|
19670
|
+
};
|
18603
19671
|
|
18604
|
-
|
18605
|
-
|
18606
|
-
return 1;
|
18607
|
-
#else
|
18608
|
-
return 0;
|
18609
|
-
#endif
|
18610
|
-
}
|
19672
|
+
struct gguf_tensor_info {
|
19673
|
+
struct gguf_str name;
|
18611
19674
|
|
18612
|
-
|
18613
|
-
|
18614
|
-
return 1;
|
18615
|
-
#else
|
18616
|
-
return 0;
|
18617
|
-
#endif
|
18618
|
-
}
|
19675
|
+
uint32_t n_dims;
|
19676
|
+
uint64_t ne[GGML_MAX_DIMS];
|
18619
19677
|
|
18620
|
-
|
18621
|
-
|
19678
|
+
enum ggml_type type;
|
19679
|
+
|
19680
|
+
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
|
19681
|
+
|
19682
|
+
// for writing API
|
19683
|
+
const void * data;
|
19684
|
+
size_t size;
|
19685
|
+
};
|
19686
|
+
|
19687
|
+
struct gguf_context {
|
19688
|
+
struct gguf_header header;
|
19689
|
+
|
19690
|
+
struct gguf_kv * kv;
|
19691
|
+
struct gguf_tensor_info * infos;
|
19692
|
+
|
19693
|
+
size_t alignment;
|
19694
|
+
size_t offset; // offset of `data` from beginning of file
|
19695
|
+
size_t size; // size of `data` in bytes
|
19696
|
+
|
19697
|
+
//uint8_t * padding;
|
19698
|
+
void * data;
|
19699
|
+
};
|
19700
|
+
|
19701
|
+
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
19702
|
+
const size_t n = fread(dst, 1, size, file);
|
19703
|
+
*offset += n;
|
19704
|
+
return n == size;
|
19705
|
+
}
|
19706
|
+
|
19707
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19708
|
+
static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
|
19709
|
+
p->n = 0;
|
19710
|
+
p->data = NULL;
|
19711
|
+
|
19712
|
+
bool ok = true;
|
19713
|
+
|
19714
|
+
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
19715
|
+
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
19716
|
+
|
19717
|
+
return ok;
|
19718
|
+
}
|
19719
|
+
|
19720
|
+
static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
|
19721
|
+
p->n = 0;
|
19722
|
+
p->data = NULL;
|
19723
|
+
|
19724
|
+
bool ok = true;
|
19725
|
+
|
19726
|
+
uint32_t n = 0;
|
19727
|
+
ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
|
19728
|
+
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
19729
|
+
|
19730
|
+
return ok;
|
19731
|
+
}
|
19732
|
+
|
19733
|
+
struct gguf_context * gguf_init_empty(void) {
|
19734
|
+
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
19735
|
+
|
19736
|
+
ctx->header.magic = GGUF_MAGIC;
|
19737
|
+
ctx->header.version = GGUF_VERSION;
|
19738
|
+
ctx->header.n_tensors = 0;
|
19739
|
+
ctx->header.n_kv = 0;
|
19740
|
+
|
19741
|
+
ctx->kv = NULL;
|
19742
|
+
ctx->infos = NULL;
|
19743
|
+
|
19744
|
+
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
|
19745
|
+
ctx->offset = 0;
|
19746
|
+
ctx->size = 0;
|
19747
|
+
|
19748
|
+
ctx->data = NULL;
|
19749
|
+
|
19750
|
+
return ctx;
|
19751
|
+
}
|
19752
|
+
|
19753
|
+
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
19754
|
+
FILE * file = fopen(fname, "rb");
|
19755
|
+
if (!file) {
|
19756
|
+
return NULL;
|
19757
|
+
}
|
19758
|
+
|
19759
|
+
// offset from start of file
|
19760
|
+
size_t offset = 0;
|
19761
|
+
|
19762
|
+
uint32_t magic = 0;
|
19763
|
+
|
19764
|
+
// check the magic before making allocations
|
19765
|
+
{
|
19766
|
+
gguf_fread_el(file, &magic, sizeof(magic), &offset);
|
19767
|
+
|
19768
|
+
if (magic != GGUF_MAGIC) {
|
19769
|
+
fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
|
19770
|
+
fclose(file);
|
19771
|
+
return NULL;
|
19772
|
+
}
|
19773
|
+
}
|
19774
|
+
|
19775
|
+
bool ok = true;
|
19776
|
+
|
19777
|
+
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
19778
|
+
|
19779
|
+
// read the header
|
19780
|
+
{
|
19781
|
+
ctx->header.magic = magic;
|
19782
|
+
|
19783
|
+
ctx->kv = NULL;
|
19784
|
+
ctx->infos = NULL;
|
19785
|
+
ctx->data = NULL;
|
19786
|
+
|
19787
|
+
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
19788
|
+
|
19789
|
+
if (ctx->header.version == 1) {
|
19790
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19791
|
+
uint32_t n_tensors = 0;
|
19792
|
+
uint32_t n_kv = 0;
|
19793
|
+
|
19794
|
+
ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
|
19795
|
+
ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
|
19796
|
+
|
19797
|
+
ctx->header.n_tensors = n_tensors;
|
19798
|
+
ctx->header.n_kv = n_kv;
|
19799
|
+
} else {
|
19800
|
+
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
19801
|
+
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
19802
|
+
}
|
19803
|
+
|
19804
|
+
if (!ok) {
|
19805
|
+
fprintf(stderr, "%s: failed to read header\n", __func__);
|
19806
|
+
fclose(file);
|
19807
|
+
gguf_free(ctx);
|
19808
|
+
return NULL;
|
19809
|
+
}
|
19810
|
+
}
|
19811
|
+
|
19812
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19813
|
+
bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
|
19814
|
+
if (ctx->header.version == 1) {
|
19815
|
+
gguf_fread_str = gguf_fread_str_v1;
|
19816
|
+
}
|
19817
|
+
|
19818
|
+
// read the kv pairs
|
19819
|
+
{
|
19820
|
+
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
19821
|
+
|
19822
|
+
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
19823
|
+
struct gguf_kv * kv = &ctx->kv[i];
|
19824
|
+
|
19825
|
+
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
19826
|
+
|
19827
|
+
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
19828
|
+
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
19829
|
+
|
19830
|
+
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
19831
|
+
|
19832
|
+
switch (kv->type) {
|
19833
|
+
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
|
19834
|
+
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
|
19835
|
+
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
|
19836
|
+
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
|
19837
|
+
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
|
19838
|
+
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
|
19839
|
+
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
|
19840
|
+
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
|
19841
|
+
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
|
19842
|
+
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
|
19843
|
+
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
|
19844
|
+
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
|
19845
|
+
case GGUF_TYPE_ARRAY:
|
19846
|
+
{
|
19847
|
+
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
19848
|
+
|
19849
|
+
if (ctx->header.version == 1) {
|
19850
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19851
|
+
uint32_t n = 0;
|
19852
|
+
ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
|
19853
|
+
kv->value.arr.n = n;
|
19854
|
+
} else {
|
19855
|
+
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
19856
|
+
}
|
19857
|
+
|
19858
|
+
switch (kv->value.arr.type) {
|
19859
|
+
case GGUF_TYPE_UINT8:
|
19860
|
+
case GGUF_TYPE_INT8:
|
19861
|
+
case GGUF_TYPE_UINT16:
|
19862
|
+
case GGUF_TYPE_INT16:
|
19863
|
+
case GGUF_TYPE_UINT32:
|
19864
|
+
case GGUF_TYPE_INT32:
|
19865
|
+
case GGUF_TYPE_FLOAT32:
|
19866
|
+
case GGUF_TYPE_UINT64:
|
19867
|
+
case GGUF_TYPE_INT64:
|
19868
|
+
case GGUF_TYPE_FLOAT64:
|
19869
|
+
case GGUF_TYPE_BOOL:
|
19870
|
+
{
|
19871
|
+
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
19872
|
+
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
|
19873
|
+
} break;
|
19874
|
+
case GGUF_TYPE_STRING:
|
19875
|
+
{
|
19876
|
+
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
19877
|
+
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
19878
|
+
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
19879
|
+
}
|
19880
|
+
} break;
|
19881
|
+
case GGUF_TYPE_ARRAY:
|
19882
|
+
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
19883
|
+
};
|
19884
|
+
} break;
|
19885
|
+
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
19886
|
+
};
|
19887
|
+
|
19888
|
+
if (!ok) {
|
19889
|
+
break;
|
19890
|
+
}
|
19891
|
+
}
|
19892
|
+
|
19893
|
+
if (!ok) {
|
19894
|
+
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
|
19895
|
+
fclose(file);
|
19896
|
+
gguf_free(ctx);
|
19897
|
+
return NULL;
|
19898
|
+
}
|
19899
|
+
}
|
19900
|
+
|
19901
|
+
// read the tensor infos
|
19902
|
+
{
|
19903
|
+
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
19904
|
+
|
19905
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19906
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
19907
|
+
|
19908
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
19909
|
+
info->ne[j] = 1;
|
19910
|
+
}
|
19911
|
+
|
19912
|
+
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
19913
|
+
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
19914
|
+
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
19915
|
+
if (ctx->header.version == 1) {
|
19916
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19917
|
+
uint32_t t = 0;
|
19918
|
+
ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
|
19919
|
+
info->ne[j] = t;
|
19920
|
+
} else {
|
19921
|
+
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
19922
|
+
}
|
19923
|
+
}
|
19924
|
+
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
19925
|
+
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
19926
|
+
|
19927
|
+
if (!ok) {
|
19928
|
+
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
19929
|
+
fclose(file);
|
19930
|
+
gguf_free(ctx);
|
19931
|
+
return NULL;
|
19932
|
+
}
|
19933
|
+
}
|
19934
|
+
}
|
19935
|
+
|
19936
|
+
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
|
19937
|
+
|
19938
|
+
int alignment_idx = gguf_find_key(ctx, "general.alignment");
|
19939
|
+
if (alignment_idx != -1) {
|
19940
|
+
ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
|
19941
|
+
}
|
19942
|
+
|
19943
|
+
// we require the data section to be aligned, so take into account any padding
|
19944
|
+
{
|
19945
|
+
const size_t offset_pad = offset % ctx->alignment;
|
19946
|
+
|
19947
|
+
if (offset_pad != 0) {
|
19948
|
+
offset += ctx->alignment - offset_pad;
|
19949
|
+
fseek(file, offset, SEEK_SET);
|
19950
|
+
}
|
19951
|
+
}
|
19952
|
+
|
19953
|
+
// store the current file offset - this is where the data section starts
|
19954
|
+
ctx->offset = offset;
|
19955
|
+
|
19956
|
+
// compute the total size of the data section, taking into account the alignment
|
19957
|
+
{
|
19958
|
+
ctx->size = 0;
|
19959
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19960
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
19961
|
+
|
19962
|
+
const int64_t ne =
|
19963
|
+
(int64_t) info->ne[0] *
|
19964
|
+
(int64_t) info->ne[1] *
|
19965
|
+
(int64_t) info->ne[2] *
|
19966
|
+
(int64_t) info->ne[3];
|
19967
|
+
|
19968
|
+
if (ne % ggml_blck_size(info->type) != 0) {
|
19969
|
+
fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
|
19970
|
+
__func__, info->name.data, ne, ggml_blck_size(info->type));
|
19971
|
+
fclose(file);
|
19972
|
+
gguf_free(ctx);
|
19973
|
+
return NULL;
|
19974
|
+
}
|
19975
|
+
|
19976
|
+
const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
|
19977
|
+
|
19978
|
+
ctx->size += GGML_PAD(size_cur, ctx->alignment);
|
19979
|
+
}
|
19980
|
+
}
|
19981
|
+
|
19982
|
+
// load the tensor data only if requested
|
19983
|
+
if (params.ctx != NULL) {
|
19984
|
+
// if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
|
19985
|
+
// otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
|
19986
|
+
// the ggml_tensor structs to the appropriate locations in the binary blob
|
19987
|
+
|
19988
|
+
// compute the exact size needed for the new ggml_context
|
19989
|
+
const size_t mem_size =
|
19990
|
+
params.no_alloc ?
|
19991
|
+
(ctx->header.n_tensors )*ggml_tensor_overhead() :
|
19992
|
+
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
|
19993
|
+
|
19994
|
+
struct ggml_init_params pdata = {
|
19995
|
+
.mem_size = mem_size,
|
19996
|
+
.mem_buffer = NULL,
|
19997
|
+
.no_alloc = params.no_alloc,
|
19998
|
+
};
|
19999
|
+
|
20000
|
+
*params.ctx = ggml_init(pdata);
|
20001
|
+
|
20002
|
+
struct ggml_context * ctx_data = *params.ctx;
|
20003
|
+
|
20004
|
+
struct ggml_tensor * data = NULL;
|
20005
|
+
|
20006
|
+
if (params.no_alloc == false) {
|
20007
|
+
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
|
20008
|
+
|
20009
|
+
ok = ok && data != NULL;
|
20010
|
+
|
20011
|
+
// read the binary blob with the tensor data
|
20012
|
+
ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
|
20013
|
+
|
20014
|
+
if (!ok) {
|
20015
|
+
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
|
20016
|
+
fclose(file);
|
20017
|
+
ggml_free(ctx_data);
|
20018
|
+
gguf_free(ctx);
|
20019
|
+
return NULL;
|
20020
|
+
}
|
20021
|
+
|
20022
|
+
ctx->data = data->data;
|
20023
|
+
}
|
20024
|
+
|
20025
|
+
ggml_set_no_alloc(ctx_data, true);
|
20026
|
+
|
20027
|
+
// create the tensors
|
20028
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
20029
|
+
const int64_t ne[GGML_MAX_DIMS] = {
|
20030
|
+
ctx->infos[i].ne[0],
|
20031
|
+
ctx->infos[i].ne[1],
|
20032
|
+
ctx->infos[i].ne[2],
|
20033
|
+
ctx->infos[i].ne[3],
|
20034
|
+
};
|
20035
|
+
|
20036
|
+
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
|
20037
|
+
|
20038
|
+
ok = ok && cur != NULL;
|
20039
|
+
|
20040
|
+
ggml_set_name(cur, ctx->infos[i].name.data);
|
20041
|
+
|
20042
|
+
if (!ok) {
|
20043
|
+
break;
|
20044
|
+
}
|
20045
|
+
|
20046
|
+
// point the data member to the appropriate location in the binary blob using the tensor infos
|
20047
|
+
if (params.no_alloc == false) {
|
20048
|
+
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
|
20049
|
+
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
|
20050
|
+
}
|
20051
|
+
}
|
20052
|
+
|
20053
|
+
if (!ok) {
|
20054
|
+
fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
|
20055
|
+
fclose(file);
|
20056
|
+
ggml_free(ctx_data);
|
20057
|
+
gguf_free(ctx);
|
20058
|
+
return NULL;
|
20059
|
+
}
|
20060
|
+
|
20061
|
+
ggml_set_no_alloc(ctx_data, params.no_alloc);
|
20062
|
+
}
|
20063
|
+
|
20064
|
+
fclose(file);
|
20065
|
+
|
20066
|
+
return ctx;
|
20067
|
+
}
|
20068
|
+
|
20069
|
+
void gguf_free(struct gguf_context * ctx) {
|
20070
|
+
if (ctx == NULL) {
|
20071
|
+
return;
|
20072
|
+
}
|
20073
|
+
|
20074
|
+
if (ctx->kv) {
|
20075
|
+
// free string memory - not great..
|
20076
|
+
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
20077
|
+
struct gguf_kv * kv = &ctx->kv[i];
|
20078
|
+
|
20079
|
+
if (kv->key.data) {
|
20080
|
+
free(kv->key.data);
|
20081
|
+
}
|
20082
|
+
|
20083
|
+
if (kv->type == GGUF_TYPE_STRING) {
|
20084
|
+
if (kv->value.str.data) {
|
20085
|
+
free(kv->value.str.data);
|
20086
|
+
}
|
20087
|
+
}
|
20088
|
+
|
20089
|
+
if (kv->type == GGUF_TYPE_ARRAY) {
|
20090
|
+
if (kv->value.arr.data) {
|
20091
|
+
if (kv->value.arr.type == GGUF_TYPE_STRING) {
|
20092
|
+
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
20093
|
+
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
20094
|
+
if (str->data) {
|
20095
|
+
free(str->data);
|
20096
|
+
}
|
20097
|
+
}
|
20098
|
+
}
|
20099
|
+
free(kv->value.arr.data);
|
20100
|
+
}
|
20101
|
+
}
|
20102
|
+
}
|
20103
|
+
|
20104
|
+
free(ctx->kv);
|
20105
|
+
}
|
20106
|
+
|
20107
|
+
if (ctx->infos) {
|
20108
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
20109
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
20110
|
+
|
20111
|
+
if (info->name.data) {
|
20112
|
+
free(info->name.data);
|
20113
|
+
}
|
20114
|
+
}
|
20115
|
+
|
20116
|
+
free(ctx->infos);
|
20117
|
+
}
|
20118
|
+
|
20119
|
+
GGML_ALIGNED_FREE(ctx);
|
20120
|
+
}
|
20121
|
+
|
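The loader above reads both GGUFv1 and GGUFv2 files (v1 used 32-bit counts and lengths, v2 widens them to 64-bit, hence the `// GGUFv2` fields and the `gguf_fread_str_v1` shim). A minimal metadata-only inspection sketch using the functions this diff introduces ("model.gguf" is a placeholder path; `general.alignment` is the key the loader itself consults):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_context * meta = NULL;
        struct gguf_init_params params = { .no_alloc = true, .ctx = &meta };

        struct gguf_context * gctx = gguf_init_from_file("model.gguf", params);
        if (!gctx) {
            fprintf(stderr, "failed to open model.gguf\n");
            return 1;
        }

        printf("gguf version : %d\n", gguf_get_version(gctx));
        printf("kv pairs     : %d\n", gguf_get_n_kv(gctx));

        const int idx = gguf_find_key(gctx, "general.alignment");
        if (idx != -1 && gguf_get_kv_type(gctx, idx) == GGUF_TYPE_UINT32) {
            printf("alignment    : %u\n", gguf_get_val_u32(gctx, idx));
        }

        gguf_free(gctx);
        ggml_free(meta);
        return 0;
    }

The accessor functions that make this possible follow in the remainder of the hunk: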
20122
|
+
+const char * gguf_type_name(enum gguf_type type) {
+    return GGUF_TYPE_NAME[type];
+}
+
+int gguf_get_version(struct gguf_context * ctx) {
+    return ctx->header.version;
+}
+
+size_t gguf_get_alignment(struct gguf_context * ctx) {
+    return ctx->alignment;
+}
+
+size_t gguf_get_data_offset(struct gguf_context * ctx) {
+    return ctx->offset;
+}
+
+void * gguf_get_data(struct gguf_context * ctx) {
+    return ctx->data;
+}
+
+int gguf_get_n_kv(struct gguf_context * ctx) {
+    return ctx->header.n_kv;
+}
+
+int gguf_find_key(struct gguf_context * ctx, const char * key) {
+    // return -1 if key not found
+    int keyfound = -1;
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    for (int i = 0; i < n_kv; ++i) {
+        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
+            keyfound = i;
+            break;
+        }
+    }
+
+    return keyfound;
+}
+
+const char * gguf_get_key(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].key.data;
+}
+
+enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].type;
+}
+
+enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.type;
+}
+
+const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.data;
+}
+
+const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+    struct gguf_kv * kv = &ctx->kv[key_id];
+    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
+    return str->data;
+}
+
+int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.n;
+}
+
+uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint8;
+}
+
+int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int8;
+}
+
+uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint16;
+}
+
+int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int16;
+}
+
+uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint32;
+}
+
+int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int32;
+}
+
+float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.float32;
+}
+
+uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint64;
+}
+
+int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int64;
+}
+
+double gguf_get_val_f64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.float64;
+}
+
+bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.bool_;
+}
+
+const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.str.data;
+}
+
+int gguf_get_n_tensors(struct gguf_context * ctx) {
+    return ctx->header.n_tensors;
+}
+
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int tensorfound = -1;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensorfound = i;
+            break;
+        }
+    }
+
+    return tensorfound;
+}
+
+size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+    return ctx->infos[i].offset;
+}
+
+char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+    return ctx->infos[i].name.data;
+}
+
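The getters above are indexed and untyped, so the usual pattern is: find the key, check its type, then call the matching accessor. A sketch for one scalar and one string array (key names are illustrative):

    // sketch only
    int idx = gguf_find_key(ctx, "llama.block_count"); // illustrative key
    if (idx >= 0 && gguf_get_kv_type(ctx, idx) == GGUF_TYPE_UINT32) {
        const uint32_t n_layer = gguf_get_val_u32(ctx, idx);
        (void) n_layer;
    }

    idx = gguf_find_key(ctx, "tokenizer.ggml.tokens"); // illustrative key
    if (idx >= 0 && gguf_get_kv_type(ctx, idx) == GGUF_TYPE_ARRAY &&
        gguf_get_arr_type(ctx, idx) == GGUF_TYPE_STRING) {
        for (int j = 0; j < gguf_get_arr_n(ctx, idx); ++j) {
            const char * tok = gguf_get_arr_str(ctx, idx, j);
            (void) tok; // use the token
        }
    }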
+// returns the index
+static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        return idx;
+    }
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
+    ctx->kv[n_kv].key.n = strlen(key);
+    ctx->kv[n_kv].key.data = strdup(key);
+    ctx->header.n_kv++;
+
+    return n_kv;
+}
+
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_UINT8;
+    ctx->kv[idx].value.uint8 = val;
+}
+
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_INT8;
+    ctx->kv[idx].value.int8 = val;
+}
+
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_UINT16;
+    ctx->kv[idx].value.uint16 = val;
+}
+
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_INT16;
+    ctx->kv[idx].value.int16 = val;
+}
+
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_UINT32;
+    ctx->kv[idx].value.uint32 = val;
+}
+
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_INT32;
+    ctx->kv[idx].value.int32 = val;
+}
+
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
+    ctx->kv[idx].value.float32 = val;
+}
+
+void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_UINT64;
+    ctx->kv[idx].value.uint64 = val;
+}
+
+void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_INT64;
+    ctx->kv[idx].value.int64 = val;
+}
+
+void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
+    ctx->kv[idx].value.float64 = val;
+}
+
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_BOOL;
+    ctx->kv[idx].value.bool_ = val;
+}
+
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.str.n = strlen(val);
+    ctx->kv[idx].value.str.data = strdup(val);
+}
+
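Every setter funnels through gguf_get_or_add_key, so writing a key twice updates the existing slot rather than appending a duplicate. A sketch (keys and values are illustrative):

    // sketch only - set or overwrite scalar KV pairs
    gguf_set_val_str (ctx, "general.name", "my-model");
    gguf_set_val_u32 (ctx, "my.block_count", 32);
    gguf_set_val_f32 (ctx, "my.rope_freq_base", 10000.0f);
    gguf_set_val_bool(ctx, "my.parallel_residual", false);

    gguf_set_val_u32 (ctx, "my.block_count", 40); // same slot, no duplicate key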
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = type;
+    ctx->kv[idx].value.arr.n = n;
+    ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
+    memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
+}
+
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.arr.n = n;
+    ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
+    for (int i = 0; i < n; i++) {
+        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
+        str->n = strlen(data[i]);
+        str->data = strdup(data[i]);
+    }
+}
+
+// set or add KV pairs from another context
+void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
+    for (uint32_t i = 0; i < src->header.n_kv; i++) {
+        switch (src->kv[i].type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);    break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);     break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16);   break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16);    break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
+            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64);   break;
+            case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64);    break;
+            case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64);  break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
+                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
+                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
+                        }
+                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
+                        free(data);
+                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
+                        GGML_ASSERT(false && "nested arrays not supported");
+                    } else {
+                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
+                    }
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+        }
+    }
+}
+
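gguf_set_arr_data copies n fixed-size elements, gguf_set_arr_str duplicates each string, and gguf_set_kv replays all pairs from another context through these same setters (nested arrays are rejected by the assert above). A sketch (key names are illustrative):

    // sketch only - array-valued KV pairs
    const float scales[4] = {1.0f, 0.5f, 0.25f, 0.125f};
    gguf_set_arr_data(dst, "my.scales", GGUF_TYPE_FLOAT32, scales, 4);

    const char * labels[2] = {"train", "eval"};
    gguf_set_arr_str(dst, "my.labels", labels, 2); // each string is strdup'ed

    gguf_set_kv(dst, src); // merge every KV pair from src into dst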
+void gguf_add_tensor(
+        struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    const int idx = ctx->header.n_tensors;
+    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+
+    ctx->infos[idx].name.n = strlen(tensor->name);
+    ctx->infos[idx].name.data = strdup(tensor->name);
+
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        ctx->infos[idx].ne[i] = 1;
+    }
+
+    ctx->infos[idx].n_dims = tensor->n_dims;
+    for (int i = 0; i < tensor->n_dims; i++) {
+        ctx->infos[idx].ne[i] = tensor->ne[i];
+    }
+
+    ctx->infos[idx].type = tensor->type;
+    ctx->infos[idx].offset = 0;
+    ctx->infos[idx].data = tensor->data;
+    ctx->infos[idx].size = ggml_nbytes(tensor);
+
+    if (ctx->header.n_tensors > 0) {
+        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
+    }
+
+    ctx->header.n_tensors++;
+}
+
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
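gguf_add_tensor records the shape, type, and current data pointer and derives the offset by padding the previous tensor's end to ctx->alignment; gguf_set_tensor_data can later swap the payload, which is why it recomputes every downstream offset. A sketch, assuming a ggml context with a prepared tensor (names and buffers are illustrative):

    // sketch only
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, 4096, 4096);
    ggml_set_name(w, "blk.0.attn_q.weight"); // illustrative tensor name
    // ... fill w->data ...

    gguf_add_tensor(gguf, w); // offset = GGML_PAD(prev offset + prev size, alignment)

    // later: replace the payload (q_buf/q_size are hypothetical placeholders);
    // offsets of all subsequent tensors are recomputed
    gguf_set_tensor_data(gguf, "blk.0.attn_q.weight", q_buf, q_size);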
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n, sizeof(val->n), 1, file);
+//    fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    }
+    buf->offset += sizeof(val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    }
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val, el_size);
+    }
+    buf->offset += el_size;
+}
+
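Note the pattern in gguf_bwrite_str and gguf_bwrite_el: the memcpy is skipped when buf->data is NULL, but buf->offset always advances. One serializer therefore serves two passes, a dry run that only measures and a real run that copies, which is what gguf_get_meta_size below relies on:

    // sketch only, assuming <stdlib.h>
    size_t n = gguf_get_meta_size(ctx); // dry run: gguf_buf_init(0) => data == NULL
    void * meta = malloc(n);
    gguf_get_meta_data(ctx, meta);      // second pass copies the same bytes for real
    // ... use meta ...
    free(meta);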
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+    // write header
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+
+    // write key-value pairs
+    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        struct gguf_kv * kv = &ctx->kv[i];
+
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
+
+        switch (kv->type) {
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
+            case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
+            case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+
+                    switch (kv->value.arr.type) {
+                        case GGUF_TYPE_UINT8:
+                        case GGUF_TYPE_INT8:
+                        case GGUF_TYPE_UINT16:
+                        case GGUF_TYPE_INT16:
+                        case GGUF_TYPE_UINT32:
+                        case GGUF_TYPE_INT32:
+                        case GGUF_TYPE_FLOAT32:
+                        case GGUF_TYPE_UINT64:
+                        case GGUF_TYPE_INT64:
+                        case GGUF_TYPE_FLOAT64:
+                        case GGUF_TYPE_BOOL:
+                            {
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                            } break;
+                        case GGUF_TYPE_STRING:
+                            {
+                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                }
+                            } break;
+                        case GGUF_TYPE_ARRAY:
+                        case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                    };
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+        };
+    }
+
+    // write tensor infos
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
+
+        gguf_bwrite_str(buf, &info->name);
+        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
+        for (uint32_t j = 0; j < info->n_dims; ++j) {
+            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
+        }
+        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset     = buf->offset;
+        const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
+
+        if (offset_pad != offset) {
+            uint8_t pad = 0;
+            for (size_t i = 0; i < offset_pad - offset; ++i) {
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
+            }
+        }
+    }
+
+    if (only_meta) {
+        return;
+    }
+
+    size_t offset = 0;
+
+    // write tensor data
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
+
+        const size_t size     = info->size;
+        const size_t size_pad = GGML_PAD(size, ctx->alignment);
+
+        gguf_bwrite_el(buf, info->data, size);
+
+        if (size_pad != size) {
+            uint8_t pad = 0;
+            for (size_t j = 0; j < size_pad - size; ++j) {
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
+            }
+        }
+
+        GGML_ASSERT(offset == info->offset);
+
+        offset += size_pad;
+    }
+}
+
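Both padding blocks in gguf_write_to_buf lean on GGML_PAD rounding an offset up to the next multiple of the alignment; with the bit-twiddling definition from ggml.h this assumes a power-of-two alignment:

    // as defined in ggml.h: round x up to a multiple of n (n must be a power of two)
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    // e.g. with a 32-byte alignment:
    //   GGML_PAD(100, 32) == 128  -> 28 zero bytes of padding are emitted
    //   GGML_PAD(128, 32) == 128  ->  0 bytes, already aligned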
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
+
+    fclose(file);
+}
+
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
+
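Putting the write path together: tensor data is recorded by pointer only, so every payload must stay valid until gguf_write_to_file returns. A sketch that assumes gguf_init_empty() from earlier in this file (keys, names, and the output path are illustrative):

    // sketch only - build and write a GGUF file
    struct gguf_context * out = gguf_init_empty(); // assumed ctor from earlier in ggml.c

    gguf_set_val_str(out, "general.architecture", "llama"); // illustrative KV
    gguf_add_tensor(out, w); // w: a prepared ggml_tensor, data kept alive

    gguf_write_to_file(out, "out.gguf", /*only_meta =*/ false);
    gguf_free(out);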
+////////////////////////////////////////////////////////////////////////////////
+
+int ggml_cpu_has_avx(void) {
+#if defined(__AVX__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx2(void) {
+#if defined(__AVX2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512(void) {
+#if defined(__AVX512F__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vbmi(void) {
+#if defined(__AVX512VBMI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vnni(void) {
+#if defined(__AVX512VNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_fma(void) {
+#if defined(__FMA__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_neon(void) {
+#if defined(__ARM_NEON)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_arm_fma(void) {
+#if defined(__ARM_FEATURE_FMA)
     return 1;
 #else
     return 0;
@@ -18685,6 +20817,14 @@ int ggml_cpu_has_sse3(void) {
 #endif
 }
 
+int ggml_cpu_has_ssse3(void) {
+#if defined(__SSSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
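These probes are resolved at compile time from the same macros that gate the SIMD kernels, so they report what the binary was built with rather than what the host CPU happens to support. A sketch:

    // sketch only - report compiled-in SIMD support
    printf("AVX=%d AVX2=%d AVX512=%d FMA=%d NEON=%d SSSE3=%d\n",
           ggml_cpu_has_avx(), ggml_cpu_has_avx2(), ggml_cpu_has_avx512(),
           ggml_cpu_has_fma(), ggml_cpu_has_neon(), ggml_cpu_has_ssse3());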
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;