llama_cpp 0.3.8 → 0.5.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/examples/chat.rb +4 -6
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +129 -124
- data/ext/llama_cpp/src/ggml-alloc.c +90 -113
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +350 -77
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +226 -121
- data/ext/llama_cpp/src/ggml-metal.metal +157 -35
- data/ext/llama_cpp/src/ggml.c +2724 -584
- data/ext/llama_cpp/src/ggml.h +282 -31
- data/ext/llama_cpp/src/k_quants.c +112 -56
- data/ext/llama_cpp/src/llama.cpp +4857 -2986
- data/ext/llama_cpp/src/llama.h +180 -126
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +12 -11
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -123,6 +123,8 @@ typedef void * thread_ret_t;
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
 #define GGML_SILU_FP16
+// #define GGML_CROSS_ENTROPY_EXP_FP16
+// #define GGML_FLASH_ATTN_EXP_FP16

 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL 2
@@ -157,12 +159,6 @@ typedef void * thread_ret_t;
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif

-#if UINTPTR_MAX == 0xFFFFFFFF
-    #define GGML_MEM_ALIGN 4
-#else
-    #define GGML_MEM_ALIGN 16
-#endif
-
 //
 // logging
 //
@@ -192,8 +188,8 @@ typedef void * thread_ret_t;
 //

 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size)
-#define GGML_ALIGNED_FREE(ptr)
+#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
+#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
     void * aligned_memory = NULL;
@@ -213,14 +209,13 @@ inline static void * ggml_aligned_malloc(size_t size) {
             error_desc = "insufficient memory";
             break;
     }
-    GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
-        __func__, error_desc, size/(1024.0*1024.0));
+    GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
     return NULL;
 }
 return aligned_memory;
 }
-#define GGML_ALIGNED_MALLOC(size)
-#define GGML_ALIGNED_FREE(ptr)
+#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#define GGML_ALIGNED_FREE(ptr)    free(ptr)
 #endif

 #define UNUSED GGML_UNUSED
@@ -306,6 +301,10 @@ typedef double ggml_float;
 #endif
 #endif

+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
 #ifdef __F16C__

 #ifdef _MSC_VER
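Note: these first hunks remove the GGML_MEM_ALIGN definition from ggml.c (it is now defined outside this file), fill in the aligned-allocation macros for both the MSVC/MinGW and POSIX branches, and pull in <riscv_vector.h> when RISC-V vector intrinsics are available. A minimal standalone sketch of the POSIX aligned-allocation pattern (names here are illustrative, not ggml's):

#include <stdio.h>
#include <stdlib.h>

#define MEM_ALIGN 16  // stand-in for GGML_MEM_ALIGN, now defined outside this file

int main(void) {
    void * ptr = NULL;
    // posix_memalign reports failure through its return value, not errno
    int res = posix_memalign(&ptr, MEM_ALIGN, 1024);
    if (res != 0) {
        fprintf(stderr, "aligned allocation of 1024 bytes failed (%d)\n", res);
        return 1;
    }
    printf("allocated %d-byte aligned block at %p\n", MEM_ALIGN, ptr);
    free(ptr);  // memory from posix_memalign is released with plain free()
    return 0;
}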
@@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);

 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_I8] = {
+        .type_name    = "i8",
+        .blck_size    = 1,
+        .type_size    = sizeof(int8_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_I16] = {
+        .type_name    = "i16",
+        .blck_size    = 1,
+        .type_size    = sizeof(int16_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_I32] = {
+        .type_name    = "i32",
+        .blck_size    = 1,
+        .type_size    = sizeof(int32_t),
+        .is_quantized = false,
+    },
     [GGML_TYPE_F32] = {
+        .type_name    = "f32",
+        .blck_size    = 1,
+        .type_size    = sizeof(float),
+        .is_quantized = false,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
     },
     [GGML_TYPE_F16] = {
+        .type_name    = "f16",
+        .blck_size    = 1,
+        .type_size    = sizeof(ggml_fp16_t),
+        .is_quantized = false,
         .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
         .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
@@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_F16,
     },
     [GGML_TYPE_Q4_0] = {
+        .type_name    = "q4_0",
+        .blck_size    = QK4_0,
+        .type_size    = sizeof(block_q4_0),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_0,
         .from_float = quantize_row_q4_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
@@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
+        .type_name    = "q4_1",
+        .blck_size    = QK4_1,
+        .type_size    = sizeof(block_q4_1),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float = quantize_row_q4_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
@@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q5_0] = {
+        .type_name    = "q5_0",
+        .blck_size    = QK5_0,
+        .type_size    = sizeof(block_q5_0),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_0,
         .from_float = quantize_row_q5_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
@@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q5_1] = {
+        .type_name    = "q5_1",
+        .blck_size    = QK5_1,
+        .type_size    = sizeof(block_q5_1),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_1,
         .from_float = quantize_row_q5_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
@@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q8_0] = {
+        .type_name    = "q8_0",
+        .blck_size    = QK8_0,
+        .type_size    = sizeof(block_q8_0),
+        .is_quantized = true,
         .to_float = dequantize_row_q8_0,
         .from_float = quantize_row_q8_0,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
@@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q8_1] = {
+        .type_name    = "q8_1",
+        .blck_size    = QK8_1,
+        .type_size    = sizeof(block_q8_1),
+        .is_quantized = true,
         .from_float = quantize_row_q8_1,
         .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
         .vec_dot_type = GGML_TYPE_Q8_1,
     },
 #ifdef GGML_USE_K_QUANTS
     [GGML_TYPE_Q2_K] = {
+        .type_name    = "q2_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q2_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q2_K,
         .from_float = quantize_row_q2_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
@@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q3_K] = {
+        .type_name    = "q3_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q3_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q3_K,
         .from_float = quantize_row_q3_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
@@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q4_K] = {
+        .type_name    = "q4_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q4_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q4_K,
         .from_float = quantize_row_q4_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
@@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q5_K] = {
+        .type_name    = "q5_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q5_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q5_K,
         .from_float = quantize_row_q5_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
@@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q6_K] = {
+        .type_name    = "q6_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q6_K),
+        .is_quantized = true,
         .to_float = (ggml_to_float_t) dequantize_row_q6_K,
         .from_float = quantize_row_q6_K,
         .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
@@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
     },
     [GGML_TYPE_Q8_K] = {
+        .type_name    = "q8_K",
+        .blck_size    = QK_K,
+        .type_size    = sizeof(block_q8_K),
+        .is_quantized = true,
         .from_float = quantize_row_q8_K,
     }
 #endif
 };

 // For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type
-    GGML_ASSERT(
-    return type_traits[
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
+    GGML_ASSERT(type < GGML_TYPE_COUNT);
+    return type_traits[type];
 }

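Note: this block folds the per-type metadata (name, block size, byte size, quantized flag) that previously lived in four parallel arrays into the single type_traits table, and completes ggml_internal_get_type_traits. A hedged sketch of how the table is meant to be queried through that accessor (which the code exports "for internal test use"):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    // for q4_0: QK4_0 (32) elements per block, sizeof(block_q4_0) bytes per block
    printf("%s: blck_size=%d type_size=%zu quantized=%d\n",
           tt.type_name, tt.blck_size, tt.type_size, (int) tt.is_quantized);
    return 0;
}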
@@ -2363,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;

     assert(n % qk == 0);
-    assert(nb % 2 == 0);

     const block_q4_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -2372,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);

+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q4_0 * restrict x0 = &x[i + 0];
         const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2550,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     }

     // Main loop
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 2; i < nb; i+=2) {
         _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
         _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2607,6 +2681,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     }

     *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
+        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
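Note: the new __riscv_v_intrinsic branch above processes one q4_0 block per iteration: the packed 4-bit values are split into low/high nibbles, recentered by subtracting 8, widen-multiplied against the q8_0 values, and reduced to a scalar that is then scaled by the two fp16 block scales. A scalar model of what one iteration computes (a sketch; block layout assumed from the surrounding code, qk == 32):

#include <stdint.h>

// scalar equivalent of one q4_0 x q8_0 block dot product
static int block_dot_q4_0_q8_0(const uint8_t qs[16], const int8_t ys[32]) {
    int sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int v0 = (qs[j] & 0x0F) - 8;  // low nibble, recentered to [-8, 7]
        const int v1 = (qs[j] >>   4) - 8;  // high nibble
        sumi += v0*ys[j] + v1*ys[j + 16];   // y is consumed as two halves of 16
    }
    return sumi;  // the caller scales by d_x * d_y (both fp16 -> fp32)
}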
@@ -2633,7 +2742,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     const int nb = n / qk;

     assert(n % qk == 0);
-    assert(nb % 2 == 0);

     const block_q4_1 * restrict x = vx;
     const block_q8_1 * restrict y = vy;
@@ -2645,6 +2753,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *

     float summs = 0;

+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q4_1 * restrict x0 = &x[i + 0];
         const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2733,6 +2842,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     }

     *s = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -2759,7 +2900,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;

     assert(n % qk == 0);
-    assert(nb % 2 == 0);
     assert(qk == QK5_0);

     const block_q5_0 * restrict x = vx;
@@ -2775,6 +2915,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     uint64_t tmp0[4];
     uint64_t tmp1[4];

+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q5_0 * restrict x0 = &x[i];
         const block_q5_0 * restrict x1 = &x[i + 1];
@@ -2967,6 +3108,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     }

     *s = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    uint32_t qh;
+
+    // These temp values are for masking and shift operations
+    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    uint32_t temp_2[16] = {0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80,
+                           0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+        // temporary registers
+        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
+        vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
+        vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
+        vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
+
+        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
+        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
+        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+
+        // ((qh & (1u << (j + 16))) >> (j + 12));
+        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
+        vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+
+        // narrowing
+        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
+        vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
+        vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+        // load
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
+        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -2999,7 +3210,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     const int nb = n / qk;

     assert(n % qk == 0);
-    assert(nb % 2 == 0);
     assert(qk == QK5_1);

     const block_q5_1 * restrict x = vx;
@@ -3018,6 +3228,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     uint64_t tmp0[4];
     uint64_t tmp1[4];

+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q5_1 * restrict x0 = &x[i];
         const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3223,6 +3434,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     }

     *s = hsum_float_8(acc) + summs;
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+
+    uint32_t qh;
+
+    // These temp values are for shift operations
+    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+
+    for (int i = 0; i < nb; i++) {
+        memcpy(&qh, x[i].qh, sizeof(uint32_t));
+
+        // temporary registers
+        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
+        vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
+
+        // load qh
+        vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+
+        // ((qh >> (j + 0)) << 4) & 0x10;
+        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
+        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+
+        // ((qh >> (j + 12)) ) & 0x10;
+        vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
+        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+
+        // narrowing
+        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
+        vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+
+        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
+        vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+
+        // load
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+
+        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+
+        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
+        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+
+        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+
+        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
+
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
+        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
@@ -3255,7 +3532,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     const int nb = n / qk;

     assert(n % qk == 0);
-    assert(nb % 2 == 0);

     const block_q8_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -3264,6 +3540,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);

+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
     for (int i = 0; i < nb; i += 2) {
         const block_q8_0 * restrict x0 = &x[i + 0];
         const block_q8_0 * restrict x1 = &x[i + 1];
@@ -3334,6 +3611,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     }

     *s = hsum_float_8(acc);
+#elif defined(__riscv_v_intrinsic)
+    float sumf = 0.0;
+    size_t vl = __riscv_vsetvl_e8m1(qk);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
+        vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+
+        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
+        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+
+        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
+
+        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
+    }
+
+    *s = sumf;
 #else
     // scalar
     float sumf = 0.0;
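Note: the q8_0 x q8_0 path is the simplest of the new RISC-V branches since no nibble unpacking is needed: one widening multiply and one widening reduction per block. Its per-block contribution, written out as plain C (a sketch; x[i].d and y[i].d are fp16 block scales in the real code):

#include <stdint.h>

// scalar model of one iteration of the q8_0 RISC-V loop
static float block_contrib_q8_0(const int8_t xs[32], const int8_t ys[32],
                                float dx, float dy) {
    int sumi = 0;
    for (int j = 0; j < 32; ++j) {
        sumi += xs[j]*ys[j];   // widening multiply + reduction in the RVV path
    }
    return sumi*dx*dy;         // scale the integer dot product once per block
}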
@@ -3481,9 +3778,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

-static const float GELU_COEF_A
-static const float GELU_QUICK_COEF
-static const float SQRT_2_OVER_PI
+static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
+static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;

 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -3652,95 +3949,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
 // data types
 //

-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = QK4_0,
-    [GGML_TYPE_Q4_1] = QK4_1,
-    [GGML_TYPE_Q5_0] = QK5_0,
-    [GGML_TYPE_Q5_1] = QK5_1,
-    [GGML_TYPE_Q8_0] = QK8_0,
-    [GGML_TYPE_Q8_1] = QK8_1,
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = QK_K,
-    [GGML_TYPE_Q3_K] = QK_K,
-    [GGML_TYPE_Q4_K] = QK_K,
-    [GGML_TYPE_Q5_K] = QK_K,
-    [GGML_TYPE_Q6_K] = QK_K,
-    [GGML_TYPE_Q8_K] = QK_K,
-#endif
-    [GGML_TYPE_I8]  = 1,
-    [GGML_TYPE_I16] = 1,
-    [GGML_TYPE_I32] = 1,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
-    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-#ifdef GGML_USE_K_QUANTS
-    [GGML_TYPE_Q2_K] = sizeof(block_q2_K),
-    [GGML_TYPE_Q3_K] = sizeof(block_q3_K),
-    [GGML_TYPE_Q4_K] = sizeof(block_q4_K),
-    [GGML_TYPE_Q5_K] = sizeof(block_q5_K),
-    [GGML_TYPE_Q6_K] = sizeof(block_q6_K),
-    [GGML_TYPE_Q8_K] = sizeof(block_q8_K),
-#endif
-    [GGML_TYPE_I8]  = sizeof(int8_t),
-    [GGML_TYPE_I16] = sizeof(int16_t),
-    [GGML_TYPE_I32] = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated");
-
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_Q2_K] = "q2_K",
-    [GGML_TYPE_Q3_K] = "q3_K",
-    [GGML_TYPE_Q4_K] = "q4_K",
-    [GGML_TYPE_Q5_K] = "q5_K",
-    [GGML_TYPE_Q6_K] = "q6_K",
-    [GGML_TYPE_Q8_K] = "q8_K",
-    [GGML_TYPE_I8]  = "i8",
-    [GGML_TYPE_I16] = "i16",
-    [GGML_TYPE_I32] = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_Q2_K] = true,
-    [GGML_TYPE_Q3_K] = true,
-    [GGML_TYPE_Q4_K] = true,
-    [GGML_TYPE_Q5_K] = true,
-    [GGML_TYPE_Q6_K] = true,
-    [GGML_TYPE_Q8_K] = true,
-    [GGML_TYPE_I8]  = false,
-    [GGML_TYPE_I16] = false,
-    [GGML_TYPE_I32] = false,
-};
-static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated");
-
 static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",

@@ -3760,10 +3968,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ARGMAX",
     "REPEAT",
     "REPEAT_BACK",
+    "CONCAT",
     "SILU_BACK",
     "NORM",
     "RMS_NORM",
     "RMS_NORM_BACK",
+    "GROUP_NORM",

     "MUL_MAT",
     "OUT_PROD",
@@ -3789,20 +3999,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CLAMP",
     "CONV_1D",
     "CONV_2D",
+    "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
+    "UPSCALE",

     "FLASH_ATTN",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
     "WIN_PART",
     "WIN_UNPART",
+    "GET_REL_POS",
+    "ADD_REL_POS",

     "UNARY",

     "MAP_UNARY",
     "MAP_BINARY",

+    "MAP_CUSTOM1_F32",
+    "MAP_CUSTOM2_F32",
+    "MAP_CUSTOM3_F32",
+
     "MAP_CUSTOM1",
     "MAP_CUSTOM2",
     "MAP_CUSTOM3",
@@ -3811,7 +4029,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3832,10 +4050,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "argmax(x)",
     "repeat(x)",
     "repeat_back(x)",
+    "concat(x, y)",
     "silu_back(x)",
     "norm(x)",
     "rms_norm(x)",
     "rms_norm_back(x)",
+    "group_norm(x)",

     "X*Y",
     "X*Y",
@@ -3861,20 +4081,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "clamp(x)",
     "conv_1d(x)",
     "conv_2d(x)",
+    "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
+    "upscale(x)",

     "flash_attn(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
     "win_part(x)",
     "win_unpart(x)",
+    "get_rel_pos(x)",
+    "add_rel_pos(x)",

     "unary(x)",

     "f(x)",
     "f(x,y)",

+    "custom_f32(x)",
+    "custom_f32(x,y)",
+    "custom_f32(x,y,z)",
+
     "custom(x)",
     "custom(x,y)",
     "custom(x,y,z)",
@@ -3883,7 +4111,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

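Note: both name tables grow by the same eight entries (CONCAT, GROUP_NORM, CONV_TRANSPOSE_2D, UPSCALE, GET_REL_POS, ADD_REL_POS, and the three MAP_CUSTOM*_F32 ops), and the static_asserts are bumped to the new GGML_OP_COUNT of 68. The guard pattern in isolation, for reference (counts here are illustrative, not ggml's):

#include <assert.h>   // C11 static_assert

enum op { OP_NONE, OP_ADD, OP_MUL, OP_COUNT };

static const char * OP_NAME[OP_COUNT] = {
    "NONE",
    "ADD",
    "MUL",
};

// fails to compile whenever the enum grows but the table is not extended
static_assert(OP_COUNT == 3, "OP_NAME is outdated");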
@@ -3913,8 +4141,10 @@ static void ggml_setup_op_has_task_pass(void) {
         p[GGML_OP_DIAG_MASK_ZERO     ] = true;
         p[GGML_OP_CONV_1D            ] = true;
         p[GGML_OP_CONV_2D            ] = true;
+        p[GGML_OP_CONV_TRANSPOSE_2D  ] = true;
         p[GGML_OP_FLASH_ATTN_BACK    ] = true;
         p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+        p[GGML_OP_ADD_REL_POS        ] = true;
     }

     { // FINALIZE
@@ -4101,38 +4331,41 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 }

 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-
-
-
-
-
-
-    //
-    // is enough, but just in case, adding the second part
+    size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+        nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
+    }
+    return nbytes;
+}

-
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }

 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-    return (nrows_split*tensor->ne[0]*
+    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
 }

 int ggml_blck_size(enum ggml_type type) {
-    return
+    return type_traits[type].blck_size;
 }

 size_t ggml_type_size(enum ggml_type type) {
-    return
+    return type_traits[type].type_size;
 }

 float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(
+    return ((float)(type_traits[type].type_size))/type_traits[type].blck_size;
 }

 const char * ggml_type_name(enum ggml_type type) {
-    return
+    return type_traits[type].type_name;
+}
+
+bool ggml_is_quantized(enum ggml_type type) {
+    return type_traits[type].is_quantized;
 }

 const char * ggml_op_name(enum ggml_op op) {
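Note: ggml_nbytes is rewritten to measure the byte span from element 0 to one past the last element, which stays correct for strided (non-contiguous) views, and the type accessors now read from type_traits. A standalone model of the new formula with a worked example:

#include <stdio.h>
#include <stdint.h>

// same computation as the new ggml_nbytes(), with the inputs made explicit
static size_t nbytes(const int64_t ne[4], const size_t nb[4], int blck_size) {
    size_t n = ne[0]*nb[0]/blck_size;      // bytes spanned by one row
    for (int i = 1; i < 4; ++i) {
        n += (ne[i] - 1)*nb[i];            // stride to the last slice in dim i
    }
    return n;
}

int main(void) {
    const int64_t ne[4] = {4, 3, 1, 1};    // contiguous 4x3 f32 tensor
    const size_t  nb[4] = {4, 16, 48, 48}; // strides: 16 + 2*16 = 48
    printf("%zu bytes\n", nbytes(ne, nb, /*blck_size=*/1));  // prints 48
    return 0;
}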
@@ -4144,7 +4377,7 @@ const char * ggml_op_symbol(enum ggml_op op) {
 }

 size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return
+    return ggml_type_size(tensor->type);
 }

 static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
@@ -4182,10 +4415,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct
         (t0->ne[3] == t1->ne[3]);
 }

-bool ggml_is_quantized(enum ggml_type type) {
-    return GGML_IS_QUANTIZED[type];
-}
-
 enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     enum ggml_type wtype = GGML_TYPE_COUNT;

@@ -4223,8 +4452,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] ==
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4233,7 +4462,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] ==
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4248,7 +4477,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

     return
-        tensor->nb[0] ==
+        tensor->nb[0] == ggml_type_size(tensor->type) &&
         tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
@@ -4560,36 +4789,51 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         enum ggml_type type,
         int n_dims,
         const int64_t * ne,
-
+        struct ggml_tensor * view_src,
+        size_t view_offs) {

     assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);

-
+    // find the base tensor and absolute offset
+    if (view_src != NULL && view_src->view_src != NULL) {
+        view_offs += view_src->view_offs;
+        view_src   = view_src->view_src;
+    }
+
+    size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+    for (int i = 1; i < n_dims; i++) {
+        data_size *= ne[i];
+    }
+
+    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));

-
-
-
-        data_size *= ne[i];
-    }
+    void * data = view_src != NULL ? view_src->data : NULL;
+    if (data != NULL) {
+        data = (char *) data + view_offs;
     }

-
-    // allocate tensor data in the scratch buffer
-    if (ctx->scratch.offs + data_size > ctx->scratch.size) {
-        GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
-            __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
-        assert(false);
-        return NULL;
-    }
+    size_t obj_alloc_size = 0;

-
+    if (view_src == NULL && ctx->no_alloc == false) {
+        if (ctx->scratch.data != NULL) {
+            // allocate tensor data in the scratch buffer
+            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+                GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
+                assert(false);
+                return NULL;
+            }

-
+            data = (char * const) ctx->scratch.data + ctx->scratch.offs;

-
+            ctx->scratch.offs += data_size;
+        } else {
+            // allocate tensor data in the context's memory pool
+            obj_alloc_size = data_size;
+        }
     }

-    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE +
+    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);

     // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here

@@ -4609,7 +4853,9 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.
+        /*.view_src     =*/ view_src,
+        /*.view_offs    =*/ view_offs,
+        /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
         /*.padding      =*/ { 0 },
@@ -4622,8 +4868,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         result->ne[i] = ne[i];
     }

-    result->nb[0] =
-    result->nb[1] = result->nb[0]*(result->ne[0]/
+    result->nb[0] = ggml_type_size(type);
+    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
     for (int i = 2; i < GGML_MAX_DIMS; i++) {
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
@@ -4633,28 +4879,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     return result;
 }

-static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
-    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
-    assert(params_size <= GGML_MAX_OP_PARAMS);
-    memcpy(tensor->op_params, params, params_size);
-}
-
-static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    return ((const int32_t *)(tensor->op_params))[i];
-}
-
-static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    ((int32_t *)(tensor->op_params))[i] = value;
-}
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
         int n_dims,
         const int64_t * ne) {
-    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
+    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
 }

 struct ggml_tensor * ggml_new_tensor_1d(
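Note: ggml_new_tensor_impl now threads view_src/view_offs through tensor creation: a view records its base tensor and absolute byte offset, and a view of a view is flattened immediately. A standalone model of that flattening step (hypothetical names; a single step suffices in the real code because view_src of an existing tensor always points at a base tensor):

#include <stddef.h>

struct tensor_view { struct tensor_view * view_src; size_t view_offs; };

static void resolve_view(struct tensor_view ** src, size_t * offs) {
    if (*src != NULL && (*src)->view_src != NULL) {
        *offs += (*src)->view_offs;   // accumulate the byte offset
        *src   = (*src)->view_src;    // re-point at the base tensor
    }
}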
@@ -4719,7 +4949,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 }

 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return
+    return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+}
+
+static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
+    assert(params_size <= GGML_MAX_OP_PARAMS);
+    memcpy(tensor->op_params, params, params_size);
+}
+
+static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    return ((const int32_t *)(tensor->op_params))[i];
+}
+
+static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
+    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
+    ((int32_t *)(tensor->op_params))[i] = value;
 }

 struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
@@ -5005,14 +5251,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *

 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src
+        struct ggml_tensor * src) {
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
     ggml_format_name(result, "%s (view)", src->name);

-
-
-
-    result->nb[3] = src->nb[3];
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = src->nb[i];
+    }

     return result;
 }
@@ -5545,10 +5790,6 @@ struct ggml_tensor * ggml_repeat(
         is_node = true;
     }

-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);

     result->op = GGML_OP_REPEAT;
@@ -5587,6 +5828,30 @@ struct ggml_tensor * ggml_repeat_back(
     return result;
 }

+// ggml_concat
+
+struct ggml_tensor * ggml_concat(
+    struct ggml_context* ctx,
+    struct ggml_tensor* a,
+    struct ggml_tensor* b) {
+    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
+
+    result->op = GGML_OP_CONCAT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_abs

 struct ggml_tensor * ggml_abs(
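Note: a hypothetical call site for the new ggml_concat op (a sketch, assuming a valid ggml_context ctx): concatenation happens along dimension 2, and ne[0], ne[1], ne[3] must match per the GGML_ASSERT above.

// sketch: concatenate two activations along dim 2
struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 320);
struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 320);
struct ggml_tensor * c = ggml_concat(ctx, a, b);   // result ne = {64, 64, 640, 1}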
@@ -5755,6 +6020,7 @@ struct ggml_tensor * ggml_silu_back(
 static struct ggml_tensor * ggml_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
+        float eps,
         bool inplace) {
     bool is_node = false;

@@ -5765,7 +6031,7 @@ static struct ggml_tensor * ggml_norm_impl(

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-
+    ggml_set_op_params(result, &eps, sizeof(eps));

     result->op = GGML_OP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5776,16 +6042,20 @@ static struct ggml_tensor * ggml_norm_impl(

 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
-        struct ggml_tensor * a
-
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, false);
 }

 struct ggml_tensor * ggml_norm_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor * a
-
+        struct ggml_tensor * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, true);
 }

+// ggml_rms_norm
+
 static struct ggml_tensor * ggml_rms_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -5822,10 +6092,13 @@ struct ggml_tensor * ggml_rms_norm_inplace(
     return ggml_rms_norm_impl(ctx, a, eps, true);
 }

+// ggml_rms_norm_back
+
 struct ggml_tensor * ggml_rms_norm_back(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
-        struct ggml_tensor * b
+        struct ggml_tensor * b,
+        float eps) {
     bool is_node = false;

     if (a->grad) {
@@ -5835,6 +6108,8 @@ struct ggml_tensor * ggml_rms_norm_back(

     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

+    ggml_set_op_params(result, &eps, sizeof(eps));
+
     result->op = GGML_OP_RMS_NORM_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
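Note: ggml_norm and ggml_norm_inplace now take the epsilon explicitly and stash it in op_params instead of hard-coding it inside the op; ggml_rms_norm_back gains the same parameter. Call-site impact, as a sketch (the eps value shown is illustrative and model-dependent):

// sketch: epsilon is now a caller-supplied argument
const float eps = 1e-5f;
cur = ggml_norm(ctx, cur, eps);   // was: ggml_norm(ctx, cur)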
@@ -5843,6 +6118,44 @@ struct ggml_tensor * ggml_rms_norm_back(
     return result;
 }

+// ggml_group_norm
+
+static struct ggml_tensor * ggml_group_norm_impl(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups,
+    bool inplace) {
+
+    bool is_node = false;
+    if (!inplace && (a->grad)) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_GROUP_NORM;
+    result->op_params[0] = n_groups;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL; // TODO: maybe store epsilon here?
+
+    return result;
+}
+
+struct ggml_tensor * ggml_group_norm(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups) {
+    return ggml_group_norm_impl(ctx, a, n_groups, false);
+}
+
+struct ggml_tensor * ggml_group_norm_inplace(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int n_groups) {
+    return ggml_group_norm_impl(ctx, a, n_groups, true);
+}

 // ggml_mul_mat

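Note: a sketch of using the new group-normalization op on a 4-D activation tensor (assuming a valid ggml_context ctx; 32 groups is a common choice, and per the TODO above the epsilon is currently fixed inside the op rather than passed in):

// sketch: normalize 320 channels in 32 groups of 10
struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 320, 1);
struct ggml_tensor * y = ggml_group_norm(ctx, x, 32);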
@@ -6126,7 +6439,7 @@ struct ggml_tensor * ggml_reshape(
         //GGML_ASSERT(false);
     }

-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);

     result->op = GGML_OP_RESHAPE;
@@ -6150,7 +6463,7 @@ struct ggml_tensor * ggml_reshape_1d(
     }

     const int64_t ne[1] = { ne0 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);

     result->op = GGML_OP_RESHAPE;
@@ -6175,7 +6488,7 @@ struct ggml_tensor * ggml_reshape_2d(
     }

     const int64_t ne[2] = { ne0, ne1 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);

     result->op = GGML_OP_RESHAPE;
@@ -6201,7 +6514,7 @@ struct ggml_tensor * ggml_reshape_3d(
     }

     const int64_t ne[3] = { ne0, ne1, ne2 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);

     result->op = GGML_OP_RESHAPE;
@@ -6211,7 +6524,6 @@ struct ggml_tensor * ggml_reshape_3d(
     return result;
 }

-
 struct ggml_tensor * ggml_reshape_4d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
@@ -6229,7 +6541,7 @@ struct ggml_tensor * ggml_reshape_4d(
     }

     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
     ggml_format_name(result, "%s (reshaped)", a->name);

     result->op = GGML_OP_RESHAPE;
@@ -6239,46 +6551,40 @@ struct ggml_tensor * ggml_reshape_4d(
     return result;
 }

-
-
-static struct ggml_tensor * ggml_view_tensor_offset(
+static struct ggml_tensor * ggml_view_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         int n_dims,
         const int64_t * ne,
         size_t offset) {
-    // don't calculate an offset from an unallocated tensor
-    void * data = NULL;
-    if (a->data != NULL) {
-        data = (char *) a->data + offset;
-    }

-
+    bool is_node = false;
+
+    if (a->grad) {
+        is_node = true;
+    }

+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
     ggml_format_name(result, "%s (view)", a->name);

     ggml_set_op_params(result, &offset, sizeof(offset));

+    result->op = GGML_OP_VIEW;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
     return result;
 }

+// ggml_view_1d
+
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         int64_t ne0,
         size_t offset) {

-
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
-
-    result->op = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);

     return result;
 }
@@ -6293,24 +6599,14 @@ struct ggml_tensor * ggml_view_2d(
|
|
6293
6599
|
size_t nb1,
|
6294
6600
|
size_t offset) {
|
6295
6601
|
|
6296
|
-
|
6602
|
+
const int64_t ne[2] = { ne0, ne1 };
|
6297
6603
|
|
6298
|
-
|
6299
|
-
is_node = true;
|
6300
|
-
}
|
6301
|
-
|
6302
|
-
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
6303
|
-
|
6304
|
-
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
|
6604
|
+
struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
|
6305
6605
|
|
6306
6606
|
result->nb[1] = nb1;
|
6307
6607
|
result->nb[2] = result->nb[1]*ne1;
|
6308
6608
|
result->nb[3] = result->nb[2];
|
6309
6609
|
|
6310
|
-
result->op = GGML_OP_VIEW;
|
6311
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6312
|
-
result->src[0] = a;
|
6313
|
-
|
6314
6610
|
return result;
|
6315
6611
|
}
|
6316
6612
|
|
@@ -6326,24 +6622,14 @@ struct ggml_tensor * ggml_view_3d(
         size_t                nb2,
         size_t                offset) {
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
+    const int64_t ne[3] = { ne0, ne1, ne2 };
 
-    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = result->nb[2]*ne2;
 
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
     return result;
 }
 
@@ -6361,24 +6647,14 @@ struct ggml_tensor * ggml_view_4d(
         size_t                nb3,
         size_t                offset) {
 
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
+    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
 
-    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
     result->nb[3] = nb3;
 
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
     return result;
 }
 
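
All four view constructors now funnel through the shared ggml_view_impl above, which stores the byte offset in op_params and wires up op/grad/src itself. A minimal usage sketch (hypothetical shapes and context; illustrative only):

    // take a 2-row window starting at row 3 of a 10x8 f32 matrix
    struct ggml_tensor * m   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 10);
    struct ggml_tensor * win = ggml_view_2d(ctx, m,
            8, 2,            // ne0, ne1 of the view
            m->nb[1],        // nb1: keep the parent's row stride
            3*m->nb[1]);     // offset in bytes: skip three rows
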
@@ -6565,7 +6841,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past, inplace ? 1 : 0 };
+    int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_INF;
@@ -6582,7 +6858,6 @@ struct ggml_tensor * ggml_diag_mask_inf(
     return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
 }
 
-
 struct ggml_tensor * ggml_diag_mask_inf_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -6605,7 +6880,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past, inplace ? 1 : 0 };
+    int32_t params[] = { n_past };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_ZERO;
@@ -6711,6 +6986,8 @@ static struct ggml_tensor * ggml_rope_impl(
         int                   n_ctx,
         float                 freq_base,
         float                 freq_scale,
+        float                 xpos_base,
+        bool                  xpos_down,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6721,9 +6998,11 @@ static struct ggml_tensor * ggml_rope_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[6] = { n_past, n_dims, mode, n_ctx };
+    int32_t params[8] = { n_past, n_dims, mode, n_ctx };
     memcpy(params + 4, &freq_base,  sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params + 7, &xpos_down,  sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE;
@@ -6740,7 +7019,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6750,7 +7029,7 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_dims,
         int                   mode,
         int                   n_ctx) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
 }
 
 struct ggml_tensor * ggml_rope_custom(
@@ -6762,7 +7041,7 @@ struct ggml_tensor * ggml_rope_custom(
         int                   n_ctx,
         float                 freq_base,
         float                 freq_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
 }
 
 struct ggml_tensor * ggml_rope_custom_inplace(
@@ -6774,7 +7053,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         int                   n_ctx,
         float                 freq_base,
         float                 freq_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
+}
+
+struct ggml_tensor * ggml_rope_xpos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        float                 base,
+        bool                  down) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
 }
 
 // ggml_rope_back
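
ggml_rope_xpos_inplace wires the two new parameters straight into ggml_rope_impl. As a reference for the compute kernels further down, the position-dependent decay factor the code applies per rotated pair is, in LaTeX form (derived from the code, not stated in this diff):

    \zeta_{i_0, m} = \left( \frac{i_0 + 0.4\,n_0}{1.4\,n_0} \right)^{m / \xi}

where n_0 is ne0, m = n_past + i2 is the absolute position, and \xi is xpos_base; when xpos_down is set, the kernels use 1/\zeta instead.
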
@@ -6785,7 +7074,11 @@ struct ggml_tensor * ggml_rope_back(
         int                   n_past,
         int                   n_dims,
         int                   mode,
-        int                   n_ctx) {
+        int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 xpos_base,
+        bool                  xpos_down) {
     GGML_ASSERT(n_past >= 0);
     GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
@@ -6797,7 +7090,11 @@ struct ggml_tensor * ggml_rope_back(
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
-    int32_t params[] = { n_past, n_dims, mode, n_ctx };
+    int32_t params[8] = { n_past, n_dims, mode, n_ctx };
+    memcpy(params + 4, &freq_base,  sizeof(float));
+    memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params + 7, &xpos_down,  sizeof(bool));
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE_BACK;
@@ -6904,6 +7201,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
     return result;
 }
 
+// ggml_conv_1d_ph
+
+struct ggml_tensor* ggml_conv_1d_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s,
+        int                   d) {
+    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+}
+
 // ggml_conv_2d
 
 struct ggml_tensor * ggml_conv_2d(
@@ -6944,17 +7252,61 @@ struct ggml_tensor * ggml_conv_2d(
 
 }
 
-// ggml_conv_1d_ph
+// ggml_conv_2d_sk_p0
 
-struct ggml_tensor * ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_2d_sk_p0(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   s,
-        int                   d) {
-    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
+        struct ggml_tensor  * b) {
+    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+}
+
+// ggml_conv_2d_s1_ph
+
+struct ggml_tensor * ggml_conv_2d_s1_ph(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
+}
+
+// ggml_conv_transpose_2d_p0
+
+static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
+    return (ins - 1) * s - 2 * p + ks;
 }
 
+struct ggml_tensor * ggml_conv_transpose_2d_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   stride) {
+    GGML_ASSERT(a->ne[3] == b->ne[2]);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = {
+        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
+        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
+        a->ne[2], b->ne[3],
+    };
+
+    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, stride);
+
+    result->op = GGML_OP_CONV_TRANSPOSE_2D;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
 
 // ggml_pool_*
 
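
ggml_calc_conv_transpose_output_size is the standard transposed-convolution size rule, out = (in - 1)*s - 2*p + k. Worked with hypothetical numbers: an 8-wide input, 3-wide kernel, stride 2, and zero padding gives (8 - 1)*2 - 0 + 3 = 17 output columns.
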
@@ -7032,6 +7384,40 @@ struct ggml_tensor * ggml_pool_2d(
     return result;
 }
 
+// ggml_upscale
+
+static struct ggml_tensor * ggml_upscale_impl(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] * scale_factor,
+            a->ne[1] * scale_factor,
+            a->ne[2], a->ne[3]);
+
+    result->op = GGML_OP_UPSCALE;
+    result->op_params[0] = scale_factor;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_upscale(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    return ggml_upscale_impl(ctx, a, scale_factor);
+}
+
 // ggml_flash_attn
 
 struct ggml_tensor * ggml_flash_attn(
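
ggml_upscale records only the integer scale factor; the compute kernel later in this file maps output element (n, m) back to input (n / scale_factor, m / scale_factor), i.e. nearest-neighbour. A sketch (hypothetical tensor; illustrative only):

    // 2x nearest-neighbour upscale of a 4x4 f32 map with 3 channels
    struct ggml_tensor * x  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4, 4, 3);
    struct ggml_tensor * x2 = ggml_upscale(ctx, x, 2);  // -> 8 x 8 x 3
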
@@ -7230,6 +7616,87 @@ struct ggml_tensor * ggml_win_unpart(
     return result;
 }
 
+// ggml_get_rel_pos
+
+struct ggml_tensor * ggml_get_rel_pos(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   qh,
+        int                   kh) {
+    GGML_ASSERT(qh == kh);
+    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
+
+    result->op = GGML_OP_GET_REL_POS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = NULL;
+
+    return result;
+}
+
+// ggml_add_rel_pos
+
+static struct ggml_tensor * ggml_add_rel_pos_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph,
+        bool                  inplace) {
+    GGML_ASSERT(ggml_are_same_shape(pw, ph));
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(ggml_is_contiguous(pw));
+    GGML_ASSERT(ggml_is_contiguous(ph));
+    GGML_ASSERT(ph->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->type == GGML_TYPE_F32);
+    GGML_ASSERT(pw->ne[3] == a->ne[2]);
+    GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
+    GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || pw->grad || ph->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
+
+    result->op = GGML_OP_ADD_REL_POS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = pw;
+    result->src[2] = ph;
+
+    return result;
+}
+
+
+struct ggml_tensor * ggml_add_rel_pos(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
+}
+
+struct ggml_tensor * ggml_add_rel_pos_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pw,
+        struct ggml_tensor  * ph) {
+    return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
+}
+
 // gmml_unary
 
 static struct ggml_tensor * ggml_unary_impl(
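
The asserts in ggml_add_rel_pos_impl encode the decomposed relative-position layout used by SAM's image encoder: for a W x W attention window, pw and ph carry W offsets per axis while each row of a holds W*W scores, hence pw->ne[0]*pw->ne[0] == a->ne[0]. A quick worked check of ggml_get_rel_pos with, say, qh == kh == 14: it requires a->ne[1] == 2*14 - 1 == 27 rows of relative offsets.
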
@@ -7745,7 +8212,7 @@ static void ggml_compute_forward_dup_same_cont(
         memcpy(
             ((char *)  dst->data + ie0*nb0),
             ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
+            (ie1 - ie0) * ggml_type_size(src0->type));
     }
 
 }
@@ -7779,7 +8246,7 @@ static void ggml_compute_forward_dup_f16(
 
     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7837,7 +8304,7 @@ static void ggml_compute_forward_dup_f16(
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
 
                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
                 char * dst_ptr = (char *) dst->data;
 
                 for (int i03 = 0; i03 < ne03; i03++) {
@@ -8050,7 +8517,7 @@ static void ggml_compute_forward_dup_f32(
 
     if (src0->type == dst->type &&
         ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
+        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
         // copy by rows
         const size_t rs = ne00*nb00;
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -8089,7 +8556,7 @@ static void ggml_compute_forward_dup_f32(
             ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
 
             size_t id = 0;
-            size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
+            size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
             char * dst_ptr = (char *) dst->data;
 
             for (int i03 = 0; i03 < ne03; i03++) {
@@ -8501,7 +8968,7 @@ static void ggml_compute_forward_add_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
@@ -8775,7 +9242,7 @@ static void ggml_compute_forward_add1_q_f32(
     ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
 
     // we don't support permuted src0
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
 
     // dst cannot be transposed or permuted
     GGML_ASSERT(nb0 <= nb1);
@@ -9137,6 +9604,8 @@ static void ggml_compute_forward_mul(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
@@ -9179,6 +9648,8 @@ static void ggml_compute_forward_div_f32(
 
 
 #ifdef GGML_USE_ACCELERATE
+            UNUSED(ggml_vec_div_f32);
+
             vDSP_vdiv(
                     (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
                     (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
@@ -9731,6 +10202,72 @@ static void ggml_compute_forward_repeat_back(
     }
 }
 
+// ggml_compute_forward_concat
+
+static void ggml_compute_forward_concat_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    const struct ggml_tensor * src1,
+    struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0  == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2++) {
+            if (i2 < ne02) { // src0
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            } // src1
+            else {
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
+
+                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
+                        *y = *x;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat(
+    const struct ggml_compute_params* params,
+    const struct ggml_tensor* src0,
+    const struct ggml_tensor* src1,
+    struct ggml_tensor* dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_concat_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_abs
 
 static void ggml_compute_forward_abs_f32(
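
ggml_compute_forward_concat_f32 stitches src1 behind src0 along dim 2: rows with i2 < ne02 copy from src0, the rest read src1 at i2 - ne02. Sketch of the shape arithmetic at the API level (hypothetical tensors; assumes the ggml_concat op introduced alongside this kernel):

    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 5, 7, 3);
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 5, 7, 4);
    struct ggml_tensor * c = ggml_concat(ctx, a, b);  // 5 x 7 x (3 + 4)
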
@@ -10285,7 +10822,8 @@ static void ggml_compute_forward_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS;
 
-    const float eps = 1e-5f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10334,6 +10872,8 @@ static void ggml_compute_forward_norm(
     }
 }
 
+// ggml_compute_forward_group_rms_norm
+
 static void ggml_compute_forward_rms_norm_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10398,7 +10938,6 @@ static void ggml_compute_forward_rms_norm(
     }
 }
 
-
 static void ggml_compute_forward_rms_norm_back_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
@@ -10417,7 +10956,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
 
     GGML_TENSOR_BINARY_OP_LOCALS;
 
-    const float eps = 1e-6f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
     for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -10572,54 +11112,144 @@ static void ggml_compute_forward_rms_norm_back(
     }
 }
 
-// ggml_compute_forward_mul_mat
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
+// ggml_compute_forward_group_norm
 
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+static void ggml_compute_forward_group_norm_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
 
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
     }
 
-    return false;
-}
-#endif
-
-static void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
 
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const enum ggml_type type = src0->type;
+    GGML_TENSOR_UNARY_OP_LOCALS;
 
-    const bool src1_cont = ggml_is_contiguous(src1);
+    const float eps = 1e-6f; // TODO: make this a parameter
 
-    ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+    // TODO: optimize
+
+    int n_channels = src0->ne[2];
+    int n_groups = dst->op_params[0];
+    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
+    for (int i = ith; i < n_groups; i+=nth) {
+        int start = i * n_channels_per_group;
+        int end = start + n_channels_per_group;
+        if (end > n_channels) {
+            end = n_channels;
+        }
+        int step = end - start;
+
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            ggml_float sum = 0.0;
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        sum += (ggml_float)x[i00];
+                    }
+                }
+            }
+            float mean = sum / (ne00 * ne01 * step);
+            ggml_float sum2 = 0.0;
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        float v = x[i00] - mean;
+                        y[i00] = v;
+                        sum2 += (ggml_float)(v * v);
+                    }
+                }
+            }
+            float variance = sum2 / (ne00 * ne01 * step);
+            const float scale = 1.0f / sqrtf(variance + eps);
+
+            for (int64_t i02 = start; i02 < end; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
+                    ggml_vec_scale_f32(ne00, y, scale);
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_group_norm(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_group_norm_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_mul_mat
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+// helper function to determine if it is better to use BLAS or not
+// for large matrices, BLAS is faster
+static bool ggml_compute_forward_mul_mat_use_blas(
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    //const int64_t ne00 = src0->ne[0];
+    //const int64_t ne01 = src0->ne[1];
+
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    if (ggml_is_contiguous(src0) &&
+        ggml_is_contiguous(src1) &&
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+
+        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
+        return true;
+    }
+
+    return false;
+}
+#endif
+
+static void ggml_compute_forward_mul_mat(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
     enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
     ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
 
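
The new group-norm kernel above normalizes each of n_groups channel slices independently. Per group g spanning `step` channels, the three loop passes implement (sketch of the math, with eps fixed at 1e-6):

    \mu_g = \frac{1}{N_g} \sum_{x \in g} x, \qquad
    y = \frac{x - \mu_g}{\sqrt{\sigma_g^2 + \varepsilon}}, \qquad
    N_g = ne00 \cdot ne01 \cdot step
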
@@ -10629,7 +11259,7 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(ne3 == ne13);
 
     // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(nb00 == ggml_type_size(type));
     GGML_ASSERT(nb10 == sizeof(float));
 
     // dst cannot be transposed or permuted
@@ -10638,6 +11268,10 @@ static void ggml_compute_forward_mul_mat(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
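
The broadcast factors let a single src0 plane serve several src1 planes: with, say, ne12 = 8 src1 matrices over ne02 = 2 src0 matrices, r2 = 4 and plane i12 reads src0 plane i02 = i12/r2 = i12/4 — the access pattern needed for grouped and multi-query attention.
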
@@ -10657,11 +11291,6 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        //       ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
         if (params->ith != 0) {
             return;
         }
@@ -10674,12 +11303,16 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+        for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                // broadcast src0 into src1 across 2nd,3rd dimension
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
 
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+
+                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
                     float * const wdata = params->wdata;
@@ -10687,7 +11320,7 @@ static void ggml_compute_forward_mul_mat(
 
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((char *) x + i01*nb01, wdata + id, ne00);
+                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }
 
@@ -10712,7 +11345,7 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -10732,7 +11365,7 @@ static void ggml_compute_forward_mul_mat(
     }
 
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
 
     const int64_t nr0 = ne01;           // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows
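
row_size is the byte length of one row after conversion to vec_dot_type: ne10 elements grouped into blocks of ggml_blck_size(t), each block occupying ggml_type_size(t) bytes. Sketch under the assumption that vec_dot_type is Q8_0 (32-element blocks of 34 bytes: one fp16 scale plus 32 int8 quants):

    // hypothetical numbers: ne10 = 4096 f32 values quantized to Q8_0 per row
    // row_size = 4096 * 34 / 32 = 4352 bytes per row in wdata
    const size_t row_size = (size_t) 4096 * 34 / 32;
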
@@ -10767,10 +11400,6 @@ static void ggml_compute_forward_mul_mat(
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
 
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
     // block-tiling attempt
     const int64_t blck_0 = 16;
     const int64_t blck_1 = 16;
@@ -11205,7 +11834,7 @@ static void ggml_compute_forward_get_rows_q(
 
     assert( dst->ne[0] == nc);
     assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
+    assert(src0->nb[0] == ggml_type_size(type));
 
     for (int i = 0; i < nr; ++i) {
         const int r = ((int32_t *) src1->data)[i];
@@ -11506,8 +12135,8 @@ static void ggml_compute_forward_diag_mask_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    const int  n_past  =       ((int32_t *) dst->op_params)[0];
-    const bool inplace = (bool)((int32_t *) dst->op_params)[1];
+    const int  n_past  = ((int32_t *) dst->op_params)[0];
+    const bool inplace = src0->data == dst->data;
 
     GGML_ASSERT(n_past >= 0);
 
@@ -11718,6 +12347,7 @@ static void ggml_compute_forward_soft_max_back_f32(
     // dx = J * dy
     // dxk = sum_i(Jki * dyi)
     // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
+    // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
     // dxk = sum_i(-yk*yi * dyi) + yk*dyk
     // dxk = -yk * sum_i(yi * dyi) + yk*dyk
     // dxk = -yk * dot(y, dy) + yk*dyk
@@ -11926,7 +12556,6 @@ static void ggml_compute_forward_alibi(
     }
 }
 
-
 // ggml_compute_forward_clamp
 
 static void ggml_compute_forward_clamp_f32(
@@ -12015,12 +12644,18 @@ static void ggml_compute_forward_rope_f32(
     float freq_base;
     float freq_scale;
 
+    // these two only relevant for xPos RoPE:
+    float xpos_base;
+    bool  xpos_down;
+
     const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode   = ((int32_t *) dst->op_params)[2];
     const int n_ctx  = ((int32_t *) dst->op_params)[3];
     memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
 
     assert(n_past >= 0);
 
@@ -12092,6 +12727,9 @@ static void ggml_compute_forward_rope_f32(
             for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                 const float cos_theta = cosf(theta);
                 const float sin_theta = sinf(theta);
+                // zeta scaling for xPos only:
+                float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+                if (xpos_down) zeta = 1.0f / zeta;
 
                 theta *= theta_scale;
 
@@ -12101,11 +12739,11 @@ static void ggml_compute_forward_rope_f32(
                 const float x0 = src[0];
                 const float x1 = src[1];
 
-                dst_data[0] = x0*cos_theta - x1*sin_theta;
-                dst_data[1] = x0*sin_theta + x1*cos_theta;
+                dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
+                dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
             }
         } else {
-            // TODO: this is probably wrong, but I can't figure it out ??
+            // TODO: this might be wrong for ne0 != n_dims - need double check
             // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
             for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                 for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
@@ -12234,7 +12872,7 @@ static void ggml_compute_forward_rope_f16(
|
|
12234
12872
|
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
12235
12873
|
}
|
12236
12874
|
} else {
|
12237
|
-
// TODO: this
|
12875
|
+
// TODO: this might be wrong for ne0 != n_dims - need double check
|
12238
12876
|
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
12239
12877
|
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
12240
12878
|
for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12296,9 +12934,21 @@ static void ggml_compute_forward_rope_back_f32(
     // dx = rope_back(dy, src1)
     // src0 is dy, src1 contains options
 
+    float freq_base;
+    float freq_scale;
+
+    // these two only relevant for xPos RoPE:
+    float xpos_base;
+    bool xpos_down;
+
     const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode   = ((int32_t *) dst->op_params)[2];
+    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
+    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
 
     assert(n_past >= 0);
 
@@ -12324,7 +12974,7 @@ static void ggml_compute_forward_rope_back_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
     const bool is_neox = mode & 2;
 
@@ -12335,12 +12985,15 @@ static void ggml_compute_forward_rope_back_f32(
         if (ir++ < ir0) continue;
         if (ir   > ir1) break;
 
-        float theta = (float)p;
+        float theta = freq_scale * (float)p;
 
         if (!is_neox) {
             for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                 const float cos_theta = cosf(theta);
                 const float sin_theta = sinf(theta);
+                // zeta scaling for xPos only:
+                float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f;
+                if (xpos_down) zeta = 1.0f / zeta;
 
                 theta *= theta_scale;
 
@@ -12350,8 +13003,8 @@ static void ggml_compute_forward_rope_back_f32(
                 const float dy0 = dy[0];
                 const float dy1 = dy[1];
 
-                dx[0] =   dy0*cos_theta + dy1*sin_theta;
-                dx[1] = - dy0*sin_theta + dy1*cos_theta;
+                dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
+                dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
             }
         } else {
             for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
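
The backward rotation is the transpose of the forward one: if the forward pass applies the rotation R(θ) to each (x0, x1) pair, the gradient flows through R(θ)^T, which is why the sign moves onto the sin term here (sketch, with the xPos ζ factor applied the same way the forward kernel applies it):

    \begin{pmatrix} dx_0 \\ dx_1 \end{pmatrix} =
    \begin{pmatrix} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{pmatrix}
    \begin{pmatrix} dy_0 \\ dy_1 \end{pmatrix}
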
@@ -13044,6 +13697,106 @@ static void ggml_compute_forward_conv_2d(
     }
 }
 
+// ggml_compute_forward_conv_transpose_2d
+
+static void ggml_compute_forward_conv_transpose_2d(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+              struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk = ne00*ne01*ne02*ne03;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        memset(params->wdata, 0, params->wsize);
+
+        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int64_t i03 = 0; i03 < ne03; i03++) {
+                for (int64_t i02 = 0; i02 < ne02; i02++) {
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+                    for (int64_t i01 = 0; i01 < ne01; i01++) {
+                        for (int64_t i00 = 0; i00 < ne00; i00++) {
+                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
+                        }
+                    }
+                }
+            }
+        }
+
+        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            for (int i12 = 0; i12 < ne12; i12++) {
+                for (int i11 = 0; i11 < ne11; i11++) {
+                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
+                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+                    for (int i10 = 0; i10 < ne10; i10++) {
+                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
+                    }
+                }
+            }
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int32_t stride = ggml_get_op_params_i32(dst, 0);
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata     = (ggml_fp16_t *) params->wdata + 0;
+    ggml_fp16_t * const wdata_src = wdata + nk;
+
+    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+        for (int i11 = 0; i11 < ne11; i11++) {
+            for (int i10 = 0; i10 < ne10; i10++) {
+                const int i1n = i11*ne10*ne12 + i10*ne12;
+                for (int i01 = 0; i01 < ne01; i01++) {
+                    for (int i00 = 0; i00 < ne00; i00++) {
+                        float v = 0;
+                        ggml_vec_dot_f16(ne03, &v,
+                                wdata_src + i1n,
+                                wdata_kernel + i01*ne00*ne03 + i00*ne03);
+                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
+                    }
+                }
+            }
+        }
+    }
+}
+
 // ggml_compute_forward_pool_1d_sk_p0
 
 static void ggml_compute_forward_pool_1d_sk_p0(
@@ -13202,6 +13955,60 @@ static void ggml_compute_forward_pool_2d(
     ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
 }
 
+// ggml_compute_forward_upscale
+
+static void ggml_compute_forward_upscale_f32(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
+    const int scale_factor = dst->op_params[0];
+
+    // TODO: optimize
+
+    for (int i03 = 0; i03 < ne03; i03++) {
+        for (int i02 = ith; i02 < ne02; i02++) {
+            for (int m = 0; m < dst->ne[1]; m++) {
+                int i01 = m / scale_factor;
+                for (int n = 0; n < dst->ne[0]; n++) {
+                    int i00 = n / scale_factor;
+
+                    const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
+
+                    float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_upscale(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * src0,
+    struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_upscale_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_attn
 
@@ -13331,7 +14138,7 @@ static void ggml_compute_forward_flash_attn_f32(
                 vvexpf(S, S, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, S);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
                 ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
                 for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13341,9 +14148,13 @@ static void ggml_compute_forward_flash_attn_f32(
                         if (SS[j] == -INFINITY) {
                             SS[j] = 0.0f;
                         } else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+                            const float val = expf(SS[j] - max);
+#else
                             ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
                             memcpy(&scvt[j], &s, sizeof(uint16_t));
                             const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
                             sump[j] += (ggml_float)val;
                             SS[j] = val;
                         }
@@ -13921,7 +14732,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                 vvexpf(SM, SM, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
+                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
                 ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
 
                 for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
@@ -13932,9 +14743,13 @@ static void ggml_compute_forward_flash_attn_back_f32(
                         if (SR[j] == -INFINITY) {
                             SW[j] = 0.0f;
                         } else {
+#ifndef GGML_FLASH_ATTN_EXP_FP16
+                            const float val = expf(SR[j] - max);
+#else
                             ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
                             memcpy(&scvt[j], &s, sizeof(uint16_t));
                             const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
+#endif
                             sump[j] += (ggml_float)val;
                             SW[j] = val;
                         }
|
|
14327
15142
|
}
|
14328
15143
|
}
|
14329
15144
|
|
14330
|
-
//
|
15145
|
+
// ggml_compute_forward_get_rel_pos
|
14331
15146
|
|
14332
|
-
static void
|
15147
|
+
static void ggml_compute_forward_get_rel_pos_f16(
|
14333
15148
|
const struct ggml_compute_params * params,
|
14334
15149
|
const struct ggml_tensor * src0,
|
14335
|
-
struct ggml_tensor * dst
|
14336
|
-
const ggml_unary_op_f32_t fun) {
|
14337
|
-
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
14338
|
-
|
15150
|
+
struct ggml_tensor * dst) {
|
14339
15151
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14340
15152
|
return;
|
14341
15153
|
}
|
14342
15154
|
|
14343
|
-
|
14344
|
-
const int nc = src0->ne[0];
|
15155
|
+
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
|
14345
15156
|
|
14346
|
-
|
14347
|
-
assert(src0->nb[0] == sizeof(float));
|
15157
|
+
GGML_TENSOR_UNARY_OP_LOCALS;
|
14348
15158
|
|
14349
|
-
|
14350
|
-
|
14351
|
-
|
14352
|
-
|
15159
|
+
const int64_t w = ne1;
|
15160
|
+
|
15161
|
+
ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
|
15162
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data;
|
15163
|
+
|
15164
|
+
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
15165
|
+
for (int64_t i1 = 0; i1 < ne1; ++i1) {
|
15166
|
+
const int64_t pos = (w - i1 - 1) + i2;
|
15167
|
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
15168
|
+
dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
|
15169
|
+
}
|
15170
|
+
}
|
14353
15171
|
}
|
14354
15172
|
}
|
14355
15173
|
|
14356
|
-
|
14357
|
-
static void ggml_compute_forward_map_unary(
|
15174
|
+
static void ggml_compute_forward_get_rel_pos(
|
14358
15175
|
const struct ggml_compute_params * params,
|
14359
15176
|
const struct ggml_tensor * src0,
|
14360
|
-
struct ggml_tensor * dst
|
14361
|
-
|
15177
|
+
struct ggml_tensor * dst) {
|
15178
|
+
switch (src0->type) {
|
15179
|
+
case GGML_TYPE_F16:
|
15180
|
+
{
|
15181
|
+
ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
|
15182
|
+
} break;
|
15183
|
+
default:
|
15184
|
+
{
|
15185
|
+
GGML_ASSERT(false);
|
15186
|
+
} break;
|
15187
|
+
}
|
15188
|
+
}
|
15189
|
+
|
15190
|
+
// ggml_compute_forward_add_rel_pos
|
15191
|
+
|
15192
|
+
static void ggml_compute_forward_add_rel_pos_f32(
|
15193
|
+
const struct ggml_compute_params * params,
|
15194
|
+
const struct ggml_tensor * src0,
|
15195
|
+
const struct ggml_tensor * src1,
|
15196
|
+
const struct ggml_tensor * src2,
|
15197
|
+
struct ggml_tensor * dst) {
|
15198
|
+
|
15199
|
+
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
|
15200
|
+
if (!inplace && params->type == GGML_TASK_INIT) {
|
15201
|
+
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
|
15202
|
+
return;
|
15203
|
+
}
|
15204
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
15205
|
+
return;
|
15206
|
+
}
|
15207
|
+
|
15208
|
+
int64_t t0 = ggml_perf_time_us();
|
15209
|
+
UNUSED(t0);
|
15210
|
+
|
15211
|
+
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
|
15212
|
+
|
15213
|
+
float * src1_data = (float *) src1->data;
|
15214
|
+
float * src2_data = (float *) src2->data;
|
15215
|
+
float * dst_data = (float *) dst->data;
|
15216
|
+
|
15217
|
+
const int64_t ne10 = src1->ne[0];
|
15218
|
+
const int64_t ne11 = src1->ne[1];
|
15219
|
+
const int64_t ne12 = src1->ne[2];
|
15220
|
+
const int64_t ne13 = src1->ne[3];
|
15221
|
+
|
15222
|
+
const int ith = params->ith;
|
15223
|
+
const int nth = params->nth;
|
15224
|
+
|
15225
|
+
// total patches in dst
|
15226
|
+
const int np = ne13;
|
15227
|
+
|
15228
|
+
// patches per thread
|
15229
|
+
const int dp = (np + nth - 1)/nth;
|
15230
|
+
|
15231
|
+
// patch range for this thread
|
15232
|
+
const int ip0 = dp*ith;
|
15233
|
+
const int ip1 = MIN(ip0 + dp, np);
|
15234
|
+
|
15235
|
+
|
15236
|
+
for (int64_t i13 = ip0; i13 < ip1; ++i13) {
|
15237
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
15238
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
15239
|
+
const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
|
15240
|
+
for (int64_t i10 = 0; i10 < ne10; ++i10) {
|
15241
|
+
const int64_t jp0 = jp1 + i10;
|
15242
|
+
const float src1_e = src1_data[jp0];
|
15243
|
+
const float src2_e = src2_data[jp0];
|
15244
|
+
|
15245
|
+
const int64_t jdh = jp0 * ne10;
|
15246
|
+
const int64_t jdw = jdh - (ne10 - 1) * i10;
|
15247
|
+
|
15248
|
+
for (int64_t j = 0; j < ne10; ++j) {
|
15249
|
+
dst_data[jdh + j ] += src2_e;
|
15250
|
+
dst_data[jdw + j*ne10] += src1_e;
|
15251
|
+
}
|
15252
|
+
}
|
15253
|
+
}
|
15254
|
+
}
|
15255
|
+
}
|
15256
|
+
}
|
15257
|
+
|
15258
|
+
static void ggml_compute_forward_add_rel_pos(
|
15259
|
+
const struct ggml_compute_params * params,
|
15260
|
+
const struct ggml_tensor * src0,
|
15261
|
+
const struct ggml_tensor * src1,
|
15262
|
+
const struct ggml_tensor * src2,
|
15263
|
+
struct ggml_tensor * dst) {
|
15264
|
+
switch (src0->type) {
|
15265
|
+
case GGML_TYPE_F32:
|
15266
|
+
{
|
15267
|
+
ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
|
15268
|
+
} break;
|
15269
|
+
default:
|
15270
|
+
{
|
15271
|
+
GGML_ASSERT(false);
|
15272
|
+
} break;
|
15273
|
+
}
|
15274
|
+
}
|
15275
|
+
|
15276
|
+
// ggml_compute_forward_map_unary
|
15277
|
+
|
15278
|
+
static void ggml_compute_forward_map_unary_f32(
|
15279
|
+
const struct ggml_compute_params * params,
|
15280
|
+
const struct ggml_tensor * src0,
|
15281
|
+
struct ggml_tensor * dst,
|
15282
|
+
const ggml_unary_op_f32_t fun) {
|
15283
|
+
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
15284
|
+
|
15285
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
15286
|
+
return;
|
15287
|
+
}
|
15288
|
+
|
15289
|
+
const int n = ggml_nrows(src0);
|
15290
|
+
const int nc = src0->ne[0];
|
15291
|
+
|
15292
|
+
assert( dst->nb[0] == sizeof(float));
|
15293
|
+
assert(src0->nb[0] == sizeof(float));
|
15294
|
+
|
15295
|
+
for (int i = 0; i < n; i++) {
|
15296
|
+
fun(nc,
|
15297
|
+
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
15298
|
+
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
15299
|
+
}
|
15300
|
+
}
|
15301
|
+
|
15302
|
+
|
15303
|
+
static void ggml_compute_forward_map_unary(
|
15304
|
+
const struct ggml_compute_params * params,
|
15305
|
+
const struct ggml_tensor * src0,
|
15306
|
+
struct ggml_tensor * dst,
|
15307
|
+
const ggml_unary_op_f32_t fun) {
|
14362
15308
|
switch (src0->type) {
|
14363
15309
|
case GGML_TYPE_F32:
|
14364
15310
|
{
|
@@ -14541,6 +15487,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
14541
15487
|
const int nc = src0->ne[0];
|
14542
15488
|
const int nr = ggml_nrows(src0);
|
14543
15489
|
|
15490
|
+
GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
|
15491
|
+
|
14544
15492
|
if (params->type == GGML_TASK_INIT) {
|
14545
15493
|
if (ith == 0) {
|
14546
15494
|
memset(sums, 0, sizeof(float) * (nth + nth * nc));
|
@@ -14552,7 +15500,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
14552
15500
|
if (ith == 0) {
|
14553
15501
|
float * dp = (float *) dst->data;
|
14554
15502
|
ggml_vec_sum_f32(nth, dp, sums);
|
14555
|
-
dp[0] *= -1.0f;
|
15503
|
+
dp[0] *= -1.0f / (float) nr;
|
14556
15504
|
}
|
14557
15505
|
return;
|
14558
15506
|
}
|
@@ -14569,7 +15517,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
14569
15517
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
14570
15518
|
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
14571
15519
|
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
14572
|
-
float * st = (float *) params->wdata + nth + ith*nc;
|
15520
|
+
float * st = ((float *) params->wdata) + nth + ith*nc;
|
14573
15521
|
|
14574
15522
|
#ifndef NDEBUG
|
14575
15523
|
for (int i = 0; i < nc; ++i) {
|
@@ -14584,15 +15532,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
14584
15532
|
float max = -INFINITY;
|
14585
15533
|
ggml_vec_max_f32(nc, &max, s0);
|
14586
15534
|
|
14587
|
-
uint16_t scvt;
|
15535
|
+
uint16_t scvt; UNUSED(scvt);
|
14588
15536
|
for (int i = 0; i < nc; i++) {
|
14589
15537
|
if (s0[i] == -INFINITY) {
|
14590
15538
|
st[i] = 0.0f;
|
14591
15539
|
} else {
|
14592
|
-
|
15540
|
+
#ifndef GGML_CROSS_ENTROPY_EXP_FP16
|
15541
|
+
const float s = s0[i] - max;
|
15542
|
+
const float val = expf(s);
|
15543
|
+
#else
|
14593
15544
|
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
|
14594
15545
|
memcpy(&scvt, &s, sizeof(scvt));
|
14595
15546
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
15547
|
+
#endif
|
14596
15548
|
sum += (ggml_float)val;
|
14597
15549
|
st[i] = val;
|
14598
15550
|
}
|
@@ -14608,7 +15560,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
|
|
14608
15560
|
ggml_vec_log_f32(nc, st, st);
|
14609
15561
|
ggml_vec_mul_f32(nc, st, st, s1);
|
14610
15562
|
|
14611
|
-
|
15563
|
+
float st_sum = 0;
|
15564
|
+
ggml_vec_sum_f32(nc, &st_sum, st);
|
15565
|
+
sums[ith] += st_sum;
|
14612
15566
|
|
14613
15567
|
#ifndef NDEBUG
|
14614
15568
|
for (int i = 0; i < nc; ++i) {
|
@@ -14658,7 +15612,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
14658
15612
|
return;
|
14659
15613
|
}
|
14660
15614
|
|
14661
|
-
const
|
15615
|
+
const double eps = 1e-9;
|
14662
15616
|
|
14663
15617
|
// TODO: handle transposed/permuted matrices
|
14664
15618
|
const int64_t nc = src0->ne[0];
|
@@ -14677,7 +15631,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
14677
15631
|
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
|
14678
15632
|
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
|
14679
15633
|
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
|
14680
|
-
float * sm = (float *) params->wdata + ith*nc;
|
14681
15634
|
|
14682
15635
|
#ifndef NDEBUG
|
14683
15636
|
for (int i = 0; i < nc; ++i) {
|
@@ -14686,54 +15639,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
14686
15639
|
assert(!isnan(s1[i]));
|
14687
15640
|
}
|
14688
15641
|
#endif
|
14689
|
-
// step by step explanation:
|
14690
|
-
{
|
14691
|
-
//float * sums = (float *) params->wdata;
|
14692
|
-
|
14693
|
-
// forward pass with annotated gradients from backward pass
|
14694
|
-
// (built by going in reverse operation order, adding to gradients of current operation args)
|
14695
|
-
// st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum
|
14696
|
-
// from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
|
14697
|
-
// ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps)
|
14698
|
-
// ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3]
|
14699
|
-
// ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3
|
14700
|
-
// ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1
|
14701
|
-
// ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]]
|
14702
|
-
// ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel]
|
14703
|
-
|
14704
|
-
// substitute into grad[st1], because we can reuse softmax_back from this point on
|
14705
|
-
// grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps))
|
14706
|
-
// postorder:
|
14707
|
-
// grad[st1] := softmax(s0)
|
14708
|
-
// grad[st1] := grad[st1]*(1.0 - eps)
|
14709
|
-
// grad[st1] := grad[st1] + eps
|
14710
|
-
// grad[st1] := s1 / grad[st1]
|
14711
|
-
// grad[st1] := grad[st1]*(1.0-eps)*-grad[cel]
|
14712
|
-
|
14713
|
-
// src0 gradients by going through softmax_back
|
14714
|
-
// grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1]))
|
14715
|
-
// from softmax_back:
|
14716
|
-
// dxk = yk * (dyk - dot(y, dy))
|
14717
|
-
// dot_y_dy := dot(y, dy)
|
14718
|
-
// dx := dy
|
14719
|
-
// dx := dx - dot_y_dy
|
14720
|
-
// dx := dx * y
|
14721
|
-
// postorder:
|
14722
|
-
// dot_st1_dst1 := dot(st1, grad[st1])
|
14723
|
-
// grad[s0] := grad[st1]
|
14724
|
-
// grad[s0] := grad[s0] - dot_st1_dst1
|
14725
|
-
// grad[s0] := grad[s0] * st1
|
14726
|
-
|
14727
|
-
// prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1]
|
14728
|
-
// sm := softmax(s0)
|
14729
|
-
// grad[s0] := sm*(1.0 - eps)
|
14730
|
-
// grad[s0] := grad[s0] + eps
|
14731
|
-
// grad[s0] := s1 / grad[s0]
|
14732
|
-
// grad[s0] := grad[s0]*(1.0-eps)*-grad[cel]
|
14733
|
-
// dot_st1_dst1 := dot(sm, grad[s0])
|
14734
|
-
// grad[s0] := grad[s0] - dot_st1_dst1
|
14735
|
-
// grad[s0] := grad[s0] * sm
|
14736
|
-
}
|
14737
15642
|
|
14738
15643
|
// soft_max
|
14739
15644
|
ggml_float sum = 0.0;
|
@@ -14741,39 +15646,37 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|
14741
15646
|
float max = -INFINITY;
|
14742
15647
|
ggml_vec_max_f32(nc, &max, s0);
|
14743
15648
|
|
14744
|
-
uint16_t scvt;
|
15649
|
+
uint16_t scvt; UNUSED(scvt);
|
14745
15650
|
for (int i = 0; i < nc; i++) {
|
14746
15651
|
if (s0[i] == -INFINITY) {
|
14747
|
-
|
15652
|
+
ds0[i] = 0.0f;
|
14748
15653
|
} else {
|
14749
|
-
|
15654
|
+
#ifndef GGML_CROSS_ENTROPY_EXP_FP16
|
15655
|
+
const float s = s0[i] - max;
|
15656
|
+
const float val = expf(s);
|
15657
|
+
#else
|
14750
15658
|
ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
|
14751
15659
|
memcpy(&scvt, &s, sizeof(scvt));
|
14752
15660
|
const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
|
15661
|
+
#endif
|
14753
15662
|
sum += (ggml_float)val;
|
14754
|
-
|
15663
|
+
ds0[i] = val;
|
14755
15664
|
}
|
14756
15665
|
}
|
14757
15666
|
|
14758
15667
|
assert(sum > 0.0);
|
14759
|
-
sum = 1.0/sum;
|
15668
|
+
sum = (1.0 - eps)/sum;
|
14760
15669
|
}
|
14761
15670
|
|
14762
|
-
|
14763
|
-
ggml_vec_scale_f32(nc,
|
14764
|
-
|
14765
|
-
|
14766
|
-
|
14767
|
-
|
14768
|
-
ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]);
|
14769
|
-
ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0);
|
14770
|
-
ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1);
|
14771
|
-
ggml_vec_mul_f32 (nc, ds0, ds0, sm);
|
15671
|
+
// grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
|
15672
|
+
ggml_vec_scale_f32(nc, ds0, sum);
|
15673
|
+
ggml_vec_add1_f32(nc, ds0, ds0, eps);
|
15674
|
+
ggml_vec_sub_f32(nc, ds0, ds0, s1);
|
15675
|
+
ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
|
15676
|
+
|
14772
15677
|
|
14773
15678
|
#ifndef NDEBUG
|
14774
15679
|
for (int i = 0; i < nc; ++i) {
|
14775
|
-
assert(!isnan(sm[i]));
|
14776
|
-
assert(!isinf(sm[i]));
|
14777
15680
|
assert(!isnan(ds0[i]));
|
14778
15681
|
assert(!isinf(ds0[i]));
|
14779
15682
|
}
|
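The backward kernel no longer routes through softmax_back; as the new inline comment states, it computes grad(src0) = (softmax(src0) - src1) * grad(loss) / nr directly, reusing ds0 as scratch for the softmax values. A scalar sketch of the simplified update (eps omitted for clarity; names hypothetical):

    // p[i] = softmax(s0)[i], d = incoming gradient of the scalar loss.
    static void cel_back_row(float * grad_s0, const float * p, const float * s1,
                             int nc, float d, int nr) {
        for (int i = 0; i < nc; i++) {
            grad_s0[i] = (p[i] - s1[i]) * d / (float) nr;
        }
    }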
@@ -14879,6 +15782,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_CONCAT:
+            {
+                ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
+            } break;
         case GGML_OP_SILU_BACK:
             {
                 ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
@@ -14895,6 +15802,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_MUL_MAT:
             {
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
@@ -14987,6 +15898,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
+            } break;
         case GGML_OP_POOL_1D:
             {
                 ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
@@ -14995,6 +15910,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                ggml_compute_forward_upscale(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -15025,6 +15944,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_unary(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_GET_REL_POS:
+            {
+                ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
+            } break;
+        case GGML_OP_ADD_REL_POS:
+            {
+                ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
+            } break;
         case GGML_OP_MAP_UNARY:
             {
                 ggml_unary_op_f32_t fun;
@@ -15288,6 +16215,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         inplace);
             }
         } break;
+        case GGML_OP_CONCAT:
+            {
+                GGML_ASSERT(false); // TODO: implement
+            } break;
         case GGML_OP_SILU_BACK:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15300,9 +16231,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
+                    float eps;
+                    memcpy(&eps, tensor->op_params, sizeof(float));
+
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-                            ggml_rms_norm_back(ctx, src0, tensor->grad),
+                            ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
                             inplace);
                 }
             } break;
@@ -15310,6 +16244,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_GROUP_NORM:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_MUL_MAT:
             {
                 // https://cs231n.github.io/optimization-2/#staged
@@ -15584,6 +16522,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_dims = ((int32_t *) tensor->op_params)[1];
                 const int mode   = ((int32_t *) tensor->op_params)[2];
                 const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                float freq_base;
+                float freq_scale;
+                float xpos_base;
+                bool  xpos_down;
+                memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+
                 src0->grad = ggml_add_impl(ctx,
                         src0->grad,
                         ggml_rope_back(ctx,
@@ -15591,7 +16538,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                 n_past,
                                 n_dims,
                                 mode,
-                                n_ctx
+                                n_ctx,
+                                freq_base,
+                                freq_scale,
+                                xpos_base,
+                                xpos_down),
                         inplace);
             }
         } break;
@@ -15602,14 +16553,28 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 const int n_dims = ((int32_t *) tensor->op_params)[1];
                 const int mode   = ((int32_t *) tensor->op_params)[2];
                 const int n_ctx  = ((int32_t *) tensor->op_params)[3];
+                float freq_base;
+                float freq_scale;
+                float xpos_base;
+                bool  xpos_down;
+                memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
+                memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
+                memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
+                memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+
                 src0->grad = ggml_add_impl(ctx,
                         src0->grad,
-
+                        ggml_rope_impl(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
                                 mode,
-                                n_ctx
+                                n_ctx,
+                                freq_base,
+                                freq_scale,
+                                xpos_base,
+                                xpos_down,
+                                false),
                         inplace);
             }
         } break;
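Both rope cases recover the extended parameters from the op's packed op_params blob: four int32 slots (n_past, n_dims, mode, n_ctx) followed by three floats and a bool, read back with memcpy rather than pointer casts. A sketch of the packing side this layout implies (the helper is hypothetical, not a ggml API):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    static void pack_rope_params(int32_t op_params[8],
                                 int32_t n_past, int32_t n_dims, int32_t mode, int32_t n_ctx,
                                 float freq_base, float freq_scale,
                                 float xpos_base, bool xpos_down) {
        op_params[0] = n_past;
        op_params[1] = n_dims;
        op_params[2] = mode;
        op_params[3] = n_ctx;
        memcpy(op_params + 4, &freq_base,  sizeof(float));
        memcpy(op_params + 5, &freq_scale, sizeof(float));
        memcpy(op_params + 6, &xpos_base,  sizeof(float));
        memcpy(op_params + 7, &xpos_down,  sizeof(bool));
    }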
@@ -15629,6 +16594,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_POOL_1D:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15637,6 +16606,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15878,6 +16851,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 GGML_ASSERT(false);
             }
         } break;
+        case GGML_OP_GET_REL_POS:
+        case GGML_OP_ADD_REL_POS:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1_F32:
@@ -16029,9 +17004,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
     return result;
 }
 
-
-    struct ggml_cgraph result = *gf;
-
+void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
     GGML_ASSERT(gf->n_nodes > 0);
 
     // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
@@ -16055,15 +17028,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
         }
     }
 
-    for (int i =
+    for (int i = 0; i < gf->n_nodes; i++) {
         struct ggml_tensor * node = gf->nodes[i];
 
         if (node->is_param) {
            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            ggml_build_forward_expand(
+            ggml_build_forward_expand(gb, node->grad);
        }
    }
+}
 
+struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
+    struct ggml_cgraph result = *gf;
+    ggml_build_backward_expand(ctx, gf, &result, keep);
     return result;
 }
 
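With this refactor, ggml_build_backward is a thin wrapper: the traversal that expands gradient nodes moved into ggml_build_backward_expand, so callers that manage their own cgraph storage can build the backward graph in place. A minimal usage sketch (assumes ctx is a ggml_context and f is a scalar loss whose parameters were marked with ggml_set_param):

    struct ggml_cgraph gf = ggml_build_forward(f);
    struct ggml_cgraph gb = gf; // backward graph starts as a copy of the forward graph
    ggml_build_backward_expand(ctx, &gf, &gb, /*keep =*/ true);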
@@ -16382,7 +17359,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
 
                 size_t cur = 0;
                 if (ggml_is_quantized(node->type)) {
-                    cur =
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
@@ -16395,7 +17372,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 size_t cur = 0;
 
                 if (ggml_is_quantized(node->src[0]->type)) {
-                    cur =
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
@@ -16407,7 +17384,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 size_t cur = 0;
 
                 if (ggml_is_quantized(node->src[0]->type)) {
-                    cur =
+                    cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                 }
 
                 work_size = MAX(work_size, cur);
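All three branches now reserve one dequantized F32 row per task so each thread can work on its slice of a quantized tensor independently; the first branch sizes by node->ne[0], the other two by node->src[0]->ne[0] and node->src[1]->ne[0] respectively. A worked instance of the same expression (the values are only an example):

    // e.g. ne0 = 4096 elements, n_tasks = 8:
    // 4 bytes * 4096 * 8 = 131072 bytes of scratch
    size_t cur = ggml_type_size(GGML_TYPE_F32) * (size_t) ne0 * (size_t) n_tasks;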
@@ -16454,9 +17431,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_RMS_NORM_BACK:
+        case GGML_OP_GROUP_NORM:
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_CONCAT:
         case GGML_OP_MUL_MAT:
         case GGML_OP_OUT_PROD:
             {
@@ -16490,12 +17469,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 // the threads are still spinning
                 if (node->src[0]->type != GGML_TYPE_F32) {
                     // here we need memory just for single 2D matrix from src0
-                    cur =
+                    cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
                 }
                 } else
 #endif
                 if (node->src[1]->type != vec_dot_type) {
-                    cur =
+                    cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                 } else {
                     cur = 0;
                 }
@@ -16524,6 +17503,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
         case GGML_OP_SOFT_MAX_BACK:
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK:
+        case GGML_OP_ADD_REL_POS:
             {
                 n_tasks = n_threads;
             } break;
@@ -16598,6 +17578,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     GGML_ASSERT(false);
                 }
 
+                work_size = MAX(work_size, cur);
+            } break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            {
+                n_tasks = n_threads;
+
+                const int64_t ne00 = node->src[0]->ne[0]; // W
+                const int64_t ne01 = node->src[0]->ne[1]; // H
+                const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+                const int64_t ne03 = node->src[0]->ne[3]; // Channels In
+
+                const int64_t ne10 = node->src[1]->ne[0]; // W
+                const int64_t ne11 = node->src[1]->ne[1]; // H
+                const int64_t ne12 = node->src[1]->ne[2]; // Channels In
+
+                size_t cur = 0;
+                cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
+                cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+
                 work_size = MAX(work_size, cur);
             } break;
         case GGML_OP_POOL_1D:
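The CONV_TRANSPOSE_2D branch reserves work memory for fp16 copies of both the kernel (ne00*ne01*ne02*ne03) and the input (ne10*ne11*ne12). As a worked example: a 3x3 kernel with 64 input and 32 output channels on a 128x128x64 input needs 2*(3*3*32*64) + 2*(128*128*64) = 36864 + 2097152 bytes, roughly 2.0 MiB of scratch.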
@@ -16605,6 +17604,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             {
                 n_tasks = 1;
             } break;
+        case GGML_OP_UPSCALE:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 n_tasks = n_threads;
@@ -16666,6 +17669,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             } break;
         case GGML_OP_WIN_PART:
         case GGML_OP_WIN_UNPART:
+        case GGML_OP_GET_REL_POS:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
         case GGML_OP_MAP_CUSTOM1_F32:
@@ -16712,10 +17716,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             {
                 n_tasks = n_threads;
-
-                size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks;
-
-                work_size = MAX(work_size, cur);
             } break;
         case GGML_OP_NONE:
             {
@@ -16783,8 +17783,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 
             const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             GGML_ASSERT(rc == 0);
+            UNUSED(rc);
         }
     }
+
     workers[0].ith = 0;
     workers[0].shared = &state_shared;
 
@@ -16900,7 +17902,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     // compute size of intermediate results
     // TODO: does not take into account scratch buffers !!!!
     for (int i = 0; i < cgraph->n_nodes; ++i) {
-        size_eval +=
+        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
     }
 
     // print
@@ -17591,14 +18593,16 @@ static enum ggml_opt_result ggml_opt_adam(
         struct ggml_opt_params params,
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb
+        struct ggml_cgraph * gb,
+        ggml_opt_callback callback,
+        void * callback_data) {
     GGML_ASSERT(ggml_is_scalar(f));
 
     // these will store the parameters we want to optimize
     struct ggml_tensor * ps[GGML_MAX_PARAMS];
 
     int np = 0;
-
+    int64_t nx = 0;
     for (int i = 0; i < gf->n_nodes; ++i) {
         if (gf->nodes[i]->is_param) {
             GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
@@ -17617,31 +18621,32 @@ static enum ggml_opt_result ggml_opt_adam(
     }
 
     // constants
-
-    const float
-    const float
+    float sched = params.adam.sched;
+    const float alpha = params.adam.alpha;
+    const float decay = params.adam.decay * alpha;
     const float beta1 = params.adam.beta1;
     const float beta2 = params.adam.beta2;
     const float eps   = params.adam.eps;
+    const float gclip = params.adam.gclip;
+    const int decay_min_ndim = params.adam.decay_min_ndim;
 
-    float * x  = opt->adam.x->data;  // view of the parameters
-    float * g1 = opt->adam.g1->data; // gradient
-    float * g2 = opt->adam.g2->data; // gradient squared
     float * m  = opt->adam.m->data;  // first moment
     float * v  = opt->adam.v->data;  // second moment
-    float * mh = opt->adam.mh->data; // first moment hat
-    float * vh = opt->adam.vh->data; // second moment hat
 
     float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
 
-
-
+    if (callback) {
+        callback(callback_data, &sched);
+    }
 
     // compute the function value
     ggml_graph_reset  (gf);
     ggml_set_f32      (f->grad, 1.0f);
 
-
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+    ggml_graph_compute(gb, &cplan);
 
     opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
     opt->adam.fx_best = opt->adam.fx_prev;
@@ -17649,6 +18654,9 @@ static enum ggml_opt_result ggml_opt_adam(
         pf[opt->iter % params.past] = opt->adam.fx_prev;
     }
 
+    opt->loss_before = opt->adam.fx_prev;
+    opt->loss_after  = opt->adam.fx_prev;
+
     // initialize
     if (opt->just_initialized) {
         opt->adam.n_no_improvement = 0;
@@ -17681,50 +18689,55 @@ static enum ggml_opt_result ggml_opt_adam(
         UNUSED(t_start_cpu);
 
         {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            float gnorm = 1.0f;
+            if (gclip > 0.0f) {
+                // gradient clipping
+                ggml_float sum = 0.0;
+                for (int p = 0; p < np; ++p) {
+                    const int64_t ne = ggml_nelements(ps[p]);
+                    for (int64_t j = 0; j < ne; ++j) {
+                        float g = ggml_get_f32_1d(ps[p]->grad, j);
+                        sum += (ggml_float)(g*g);
+                    }
+                }
+                ggml_float norm = sqrt(sum);
+                if (norm > (ggml_float) gclip) {
+                    gnorm = (float) ((ggml_float) gclip / norm);
+                }
+            }
+            const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
+            const float beta2h =        1.0f/(1.0f - powf(beta2, opt->iter));
+            int64_t i = 0;
+            for (int p = 0; p < np; ++p) {
+                const int64_t ne = ggml_nelements(ps[p]);
+                const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+                for (int64_t j = 0; j < ne; ++j) {
+                    float x = ggml_get_f32_1d(ps[p], j);
+                    float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm;
+                    m[i] = m[i]*beta1 +    g*(1.0f - beta1);
+                    v[i] = v[i]*beta2 + g*g*(1.0f - beta2);
+                    float mh = m[i]*beta1h;
+                    float vh = v[i]*beta2h;
+                    vh = sqrtf(vh) + eps;
+                    x  = x*(1.0f - p_decay) - mh/vh;
+                    ggml_set_f32_1d(ps[p], j, x);
+                    ++i;
+                }
+            }
+        }
 
-
-
+        if (callback) {
+            callback(callback_data, &sched);
         }
 
         ggml_graph_reset  (gf);
         ggml_set_f32      (f->grad, 1.0f);
 
-
+        ggml_graph_compute(gb, &cplan);
 
         const float fx = ggml_get_f32_1d(f, 0);
+        opt->loss_after = fx;
+
 
         // check convergence
         if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
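The rewritten inner loop folds the bias corrections into two scalars (beta1h, beta2h) instead of materializing mh/vh tensors, clips by global gradient norm when gclip > 0, and applies decoupled weight decay only to tensors with at least decay_min_ndim dimensions. A scalar restatement of a single parameter update, mirroring the loop above (sketch, not library code):

    #include <math.h>

    // One AdamW-style update for scalar x with gradient g at iteration t >= 1.
    static float adam_step(float x, float g, float * m, float * v, int t,
                           float alpha, float beta1, float beta2, float eps,
                           float decay, float sched) {
        const float beta1h = alpha * sched / (1.0f - powf(beta1, t));
        const float beta2h =          1.0f / (1.0f - powf(beta2, t));
        *m = *m * beta1 + g     * (1.0f - beta1);
        *v = *v * beta2 + g * g * (1.0f - beta2);
        const float mh = *m * beta1h;
        const float vh = sqrtf(*v * beta2h) + eps;
        return x * (1.0f - decay * sched) - mh / vh;
    }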
@@ -17793,7 +18806,6 @@ struct ggml_lbfgs_iteration_data {
 };
 
 static enum ggml_opt_result linesearch_backtracking(
-        struct ggml_context * ctx,
         const struct ggml_opt_params * params,
         int nx,
         float * x,
@@ -17805,8 +18817,11 @@ static enum ggml_opt_result linesearch_backtracking(
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
         struct ggml_cgraph * gb,
+        struct ggml_cplan  * cplan,
         const int np,
-        struct ggml_tensor * ps[]
+        struct ggml_tensor * ps[],
+        ggml_opt_callback callback,
+        void * callback_data) {
     int count = 0;
 
     float width = 0.0f;
@@ -17835,6 +18850,12 @@ static enum ggml_opt_result linesearch_backtracking(
     dgtest = params->lbfgs.ftol*dginit;
 
     while (true) {
+        if (callback) {
+            // LBFG-S does not support learning rate -> ignore learning schedule
+            float sched = 0;
+            callback(callback_data, &sched);
+        }
+
         ggml_vec_cpy_f32(nx, x, xp);
         ggml_vec_mad_f32(nx, x, d, *step);
 
@@ -17845,7 +18866,7 @@ static enum ggml_opt_result linesearch_backtracking(
         ggml_graph_reset  (gf);
         ggml_set_f32      (f->grad, 1.0f);
 
-
+        ggml_graph_compute(gb, cplan);
 
         ggml_opt_get_grad(np, ps, g);
 
@@ -17905,7 +18926,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         struct ggml_opt_params params,
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb
+        struct ggml_cgraph * gb,
+        ggml_opt_callback callback,
+        void * callback_data) {
     if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
         params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
         if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
@@ -17937,6 +18960,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         opt->iter = iter;
     }
 
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
+    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
+
     float * x  = opt->lbfgs.x->data;  // current parameters
     float * xp = opt->lbfgs.xp->data; // previous parameters
     float * g  = opt->lbfgs.g->data;  // current gradient
@@ -17958,6 +18985,12 @@ static enum ggml_opt_result ggml_opt_lbfgs(
     float * lm_s = opt->lbfgs.lms->data;
     float * lm_y = opt->lbfgs.lmy->data;
 
+    if (callback) {
+        // LBFG-S does not support learning rate -> ignore learning schedule
+        float sched = 0;
+        callback(callback_data, &sched);
+    }
+
     // evaluate the function value and its gradient
     {
         ggml_opt_set_params(np, ps, x);
@@ -17965,11 +18998,14 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_graph_reset  (gf);
         ggml_set_f32      (f->grad, 1.0f);
 
-
+        ggml_graph_compute(gb, &cplan);
 
         ggml_opt_get_grad(np, ps, g);
 
         fx = ggml_get_f32_1d(f, 0);
+
+        opt->loss_before = fx;
+        opt->loss_after  = fx;
     }
 
     // search direction = -gradient
@@ -18024,7 +19060,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);
 
-        ls = linesearch_backtracking(
+        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data);
 
         if (ls < 0) {
             // linesearch failed - go back to the previous point and return
@@ -18034,6 +19070,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
             return ls;
         }
 
+        opt->loss_after = fx;
+
         ggml_vec_norm_f32(nx, &xnorm, x);
         ggml_vec_norm_f32(nx, &gnorm, g);
 
@@ -18091,7 +19129,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         // ys = y^t \cdot s -> 1 / \rho.
         // yy = y^t \cdot y.
         //
-        ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]
+        ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
         ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
 
         lm_ys[end[0]] = ys;
@@ -18154,13 +19192,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
                 .adam = {
                     .n_iter = 10000,
                     .sched  = 1.000f,
-                    .decay  = 0.
+                    .decay  = 0.0f,
+                    .decay_min_ndim = 2,
                     .alpha  = 0.001f,
                     .beta1  = 0.9f,
                     .beta2  = 0.999f,
                     .eps    = 1e-8f,
                     .eps_f  = 1e-5f,
                     .eps_g  = 1e-3f,
+                    .gclip  = 0.0f,
                 },
             };
         } break;
@@ -18210,23 +19250,13 @@ GGML_API void ggml_opt_init(
     switch (opt->params.type) {
         case GGML_OPT_ADAM:
             {
-                opt->adam.x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-                opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-                opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                 opt->adam.m  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                 opt->adam.v  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-                opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
-                opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
                 opt->adam.pf = params.past > 0
                     ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)
                     : NULL;
-                ggml_set_zero(opt->adam.x);
-                ggml_set_zero(opt->adam.g1);
-                ggml_set_zero(opt->adam.g2);
                 ggml_set_zero(opt->adam.m);
                 ggml_set_zero(opt->adam.v);
-                ggml_set_zero(opt->adam.mh);
-                ggml_set_zero(opt->adam.vh);
                 if (opt->adam.pf) {
                     ggml_set_zero(opt->adam.pf);
                 }
@@ -18301,8 +19331,8 @@ enum ggml_opt_result ggml_opt_resume(
         struct ggml_tensor * f) {
 
     // build forward + backward compute graphs
-    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) /
-    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) /
+    struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
+    struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
 
     struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
     struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
@@ -18310,7 +19340,7 @@ enum ggml_opt_result ggml_opt_resume(
     *gf = ggml_build_forward (f);
     *gb = ggml_build_backward(ctx, gf, true);
 
-    return ggml_opt_resume_g(ctx, opt, f, gf, gb);
+    return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
 }
 
 enum ggml_opt_result ggml_opt_resume_g(
@@ -18318,7 +19348,9 @@ enum ggml_opt_result ggml_opt_resume_g(
         struct ggml_opt_context * opt,
         struct ggml_tensor * f,
         struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb
+        struct ggml_cgraph * gb,
+        ggml_opt_callback callback,
+        void * callback_data) {
 
     // build forward + backward compute graphs
     enum ggml_opt_result result = GGML_OPT_OK;
@@ -18326,11 +19358,11 @@ enum ggml_opt_result ggml_opt_resume_g(
     switch (opt->params.type) {
         case GGML_OPT_ADAM:
             {
-                result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
+                result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
            } break;
        case GGML_OPT_LBFGS:
            {
-                result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb);
+                result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
            } break;
    }
 
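The optimizer entry points now thread a ggml_opt_callback and an opaque callback_data pointer through to ggml_opt_adam and ggml_opt_lbfgs. Judging by the call sites above, the callback receives a mutable pointer to the schedule, so Adam callers can adjust the learning-rate schedule between evaluations, while L-BFGS passes a dummy sched of 0. A minimal caller sketch (the callback body is illustrative only):

    // Linear warm-up over the first 100 invocations; cb_data is caller-owned.
    static void my_sched_cb(void * cb_data, float * sched) {
        int * n_calls = (int *) cb_data;
        if (*n_calls < 100) {
            *sched = (float)(*n_calls + 1) / 100.0f;
        }
        (*n_calls)++;
    }

    // int n_calls = 0;
    // ggml_opt_resume_g(ctx, opt, f, gf, gb, my_sched_cb, &n_calls);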
@@ -18561,64 +19593,1164 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 
 ////////////////////////////////////////////////////////////////////////////////
 
-
-
-
-
-    return 0;
-#endif
-}
+struct gguf_str {
+    uint64_t n;  // GGUFv2
+    char * data;
+};
 
-
-
-
-
-
-
-
+static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
+    [GGUF_TYPE_INT8]    = sizeof(int8_t),
+    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
+    [GGUF_TYPE_INT16]   = sizeof(int16_t),
+    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
+    [GGUF_TYPE_INT32]   = sizeof(int32_t),
+    [GGUF_TYPE_FLOAT32] = sizeof(float),
+    [GGUF_TYPE_BOOL]    = sizeof(bool),
+    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
+    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
+    [GGUF_TYPE_INT64]   = sizeof(int64_t),
+    [GGUF_TYPE_FLOAT64] = sizeof(double),
+    [GGUF_TYPE_ARRAY]   = 0, // undefined
+};
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
+
+static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8]   = "u8",
+    [GGUF_TYPE_INT8]    = "i8",
+    [GGUF_TYPE_UINT16]  = "u16",
+    [GGUF_TYPE_INT16]   = "i16",
+    [GGUF_TYPE_UINT32]  = "u32",
+    [GGUF_TYPE_INT32]   = "i32",
+    [GGUF_TYPE_FLOAT32] = "f32",
+    [GGUF_TYPE_BOOL]    = "bool",
+    [GGUF_TYPE_STRING]  = "str",
+    [GGUF_TYPE_ARRAY]   = "arr",
+    [GGUF_TYPE_UINT64]  = "u64",
+    [GGUF_TYPE_INT64]   = "i64",
+    [GGUF_TYPE_FLOAT64] = "f64",
+};
+static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
+
|
+
union gguf_value {
|
19636
|
+
uint8_t uint8;
|
19637
|
+
int8_t int8;
|
19638
|
+
uint16_t uint16;
|
19639
|
+
int16_t int16;
|
19640
|
+
uint32_t uint32;
|
19641
|
+
int32_t int32;
|
19642
|
+
float float32;
|
19643
|
+
uint64_t uint64;
|
19644
|
+
int64_t int64;
|
19645
|
+
double float64;
|
19646
|
+
bool bool_;
|
19647
|
+
|
19648
|
+
struct gguf_str str;
|
19649
|
+
|
19650
|
+
struct {
|
19651
|
+
enum gguf_type type;
|
19652
|
+
|
19653
|
+
uint64_t n; // GGUFv2
|
19654
|
+
void * data;
|
19655
|
+
} arr;
|
19656
|
+
};
|
18579
19657
|
|
18580
|
-
|
18581
|
-
|
18582
|
-
return 1;
|
18583
|
-
#else
|
18584
|
-
return 0;
|
18585
|
-
#endif
|
18586
|
-
}
|
19658
|
+
struct gguf_kv {
|
19659
|
+
struct gguf_str key;
|
18587
19660
|
|
18588
|
-
|
18589
|
-
|
18590
|
-
|
18591
|
-
#else
|
18592
|
-
return 0;
|
18593
|
-
#endif
|
18594
|
-
}
|
19661
|
+
enum gguf_type type;
|
19662
|
+
union gguf_value value;
|
19663
|
+
};
|
18595
19664
|
|
18596
|
-
|
18597
|
-
|
18598
|
-
|
18599
|
-
|
18600
|
-
|
18601
|
-
|
18602
|
-
}
|
19665
|
+
struct gguf_header {
|
19666
|
+
uint32_t magic;
|
19667
|
+
uint32_t version;
|
19668
|
+
uint64_t n_tensors; // GGUFv2
|
19669
|
+
uint64_t n_kv; // GGUFv2
|
19670
|
+
};
|
18603
19671
|
|
18604
|
-
|
18605
|
-
|
18606
|
-
return 1;
|
18607
|
-
#else
|
18608
|
-
return 0;
|
18609
|
-
#endif
|
18610
|
-
}
|
19672
|
+
struct gguf_tensor_info {
|
19673
|
+
struct gguf_str name;
|
18611
19674
|
|
18612
|
-
|
18613
|
-
|
18614
|
-
return 1;
|
18615
|
-
#else
|
18616
|
-
return 0;
|
18617
|
-
#endif
|
18618
|
-
}
|
19675
|
+
uint32_t n_dims;
|
19676
|
+
uint64_t ne[GGML_MAX_DIMS];
|
18619
19677
|
|
18620
|
-
|
18621
|
-
|
19678
|
+
enum ggml_type type;
|
19679
|
+
|
19680
|
+
uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
|
19681
|
+
|
19682
|
+
// for writing API
|
19683
|
+
const void * data;
|
19684
|
+
size_t size;
|
19685
|
+
};
|
19686
|
+
|
19687
|
+
struct gguf_context {
|
19688
|
+
struct gguf_header header;
|
19689
|
+
|
19690
|
+
struct gguf_kv * kv;
|
19691
|
+
struct gguf_tensor_info * infos;
|
19692
|
+
|
19693
|
+
size_t alignment;
|
19694
|
+
size_t offset; // offset of `data` from beginning of file
|
19695
|
+
size_t size; // size of `data` in bytes
|
19696
|
+
|
19697
|
+
//uint8_t * padding;
|
19698
|
+
void * data;
|
19699
|
+
};
|
19700
|
+
|
19701
|
+
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
19702
|
+
const size_t n = fread(dst, 1, size, file);
|
19703
|
+
*offset += n;
|
19704
|
+
return n == size;
|
19705
|
+
}
|
19706
|
+
|
19707
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19708
|
+
static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
|
19709
|
+
p->n = 0;
|
19710
|
+
p->data = NULL;
|
19711
|
+
|
19712
|
+
bool ok = true;
|
19713
|
+
|
19714
|
+
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
19715
|
+
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
19716
|
+
|
19717
|
+
return ok;
|
19718
|
+
}
|
19719
|
+
|
19720
|
+
static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
|
19721
|
+
p->n = 0;
|
19722
|
+
p->data = NULL;
|
19723
|
+
|
19724
|
+
bool ok = true;
|
19725
|
+
|
19726
|
+
uint32_t n = 0;
|
19727
|
+
ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
|
19728
|
+
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
19729
|
+
|
19730
|
+
return ok;
|
19731
|
+
}
|
19732
|
+
|
19733
|
+
struct gguf_context * gguf_init_empty(void) {
|
19734
|
+
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
19735
|
+
|
19736
|
+
ctx->header.magic = GGUF_MAGIC;
|
19737
|
+
ctx->header.version = GGUF_VERSION;
|
19738
|
+
ctx->header.n_tensors = 0;
|
19739
|
+
ctx->header.n_kv = 0;
|
19740
|
+
|
19741
|
+
ctx->kv = NULL;
|
19742
|
+
ctx->infos = NULL;
|
19743
|
+
|
19744
|
+
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
|
19745
|
+
ctx->offset = 0;
|
19746
|
+
ctx->size = 0;
|
19747
|
+
|
19748
|
+
ctx->data = NULL;
|
19749
|
+
|
19750
|
+
return ctx;
|
19751
|
+
}
|
19752
|
+
|
19753
|
+
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
|
19754
|
+
FILE * file = fopen(fname, "rb");
|
19755
|
+
if (!file) {
|
19756
|
+
return NULL;
|
19757
|
+
}
|
19758
|
+
|
19759
|
+
// offset from start of file
|
19760
|
+
size_t offset = 0;
|
19761
|
+
|
19762
|
+
uint32_t magic = 0;
|
19763
|
+
|
19764
|
+
// check the magic before making allocations
|
19765
|
+
{
|
19766
|
+
gguf_fread_el(file, &magic, sizeof(magic), &offset);
|
19767
|
+
|
19768
|
+
if (magic != GGUF_MAGIC) {
|
19769
|
+
fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
|
19770
|
+
fclose(file);
|
19771
|
+
return NULL;
|
19772
|
+
}
|
19773
|
+
}
|
19774
|
+
|
19775
|
+
bool ok = true;
|
19776
|
+
|
19777
|
+
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
19778
|
+
|
19779
|
+
// read the header
|
19780
|
+
{
|
19781
|
+
ctx->header.magic = magic;
|
19782
|
+
|
19783
|
+
ctx->kv = NULL;
|
19784
|
+
ctx->infos = NULL;
|
19785
|
+
ctx->data = NULL;
|
19786
|
+
|
19787
|
+
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
19788
|
+
|
19789
|
+
if (ctx->header.version == 1) {
|
19790
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19791
|
+
uint32_t n_tensors = 0;
|
19792
|
+
uint32_t n_kv = 0;
|
19793
|
+
|
19794
|
+
ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
|
19795
|
+
ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
|
19796
|
+
|
19797
|
+
ctx->header.n_tensors = n_tensors;
|
19798
|
+
ctx->header.n_kv = n_kv;
|
19799
|
+
} else {
|
19800
|
+
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
19801
|
+
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
19802
|
+
}
|
19803
|
+
|
19804
|
+
if (!ok) {
|
19805
|
+
fprintf(stderr, "%s: failed to read header\n", __func__);
|
19806
|
+
fclose(file);
|
19807
|
+
gguf_free(ctx);
|
19808
|
+
return NULL;
|
19809
|
+
}
|
19810
|
+
}
|
19811
|
+
|
19812
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19813
|
+
bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
|
19814
|
+
if (ctx->header.version == 1) {
|
19815
|
+
gguf_fread_str = gguf_fread_str_v1;
|
19816
|
+
}
|
19817
|
+
|
19818
|
+
// read the kv pairs
|
19819
|
+
{
|
19820
|
+
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
19821
|
+
|
19822
|
+
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
19823
|
+
struct gguf_kv * kv = &ctx->kv[i];
|
19824
|
+
|
19825
|
+
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
19826
|
+
|
19827
|
+
ok = ok && gguf_fread_str(file, &kv->key, &offset);
|
19828
|
+
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
|
19829
|
+
|
19830
|
+
//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
|
19831
|
+
|
19832
|
+
switch (kv->type) {
|
19833
|
+
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
|
19834
|
+
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
|
19835
|
+
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
|
19836
|
+
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
|
19837
|
+
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
|
19838
|
+
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
|
19839
|
+
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
|
19840
|
+
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
|
19841
|
+
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
|
19842
|
+
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
|
19843
|
+
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
|
19844
|
+
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
|
19845
|
+
case GGUF_TYPE_ARRAY:
|
19846
|
+
{
|
19847
|
+
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
19848
|
+
|
19849
|
+
if (ctx->header.version == 1) {
|
19850
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19851
|
+
uint32_t n = 0;
|
19852
|
+
ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
|
19853
|
+
kv->value.arr.n = n;
|
19854
|
+
} else {
|
19855
|
+
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
19856
|
+
}
|
19857
|
+
|
19858
|
+
switch (kv->value.arr.type) {
|
19859
|
+
case GGUF_TYPE_UINT8:
|
19860
|
+
case GGUF_TYPE_INT8:
|
19861
|
+
case GGUF_TYPE_UINT16:
|
19862
|
+
case GGUF_TYPE_INT16:
|
19863
|
+
case GGUF_TYPE_UINT32:
|
19864
|
+
case GGUF_TYPE_INT32:
|
19865
|
+
case GGUF_TYPE_FLOAT32:
|
19866
|
+
case GGUF_TYPE_UINT64:
|
19867
|
+
case GGUF_TYPE_INT64:
|
19868
|
+
case GGUF_TYPE_FLOAT64:
|
19869
|
+
case GGUF_TYPE_BOOL:
|
19870
|
+
{
|
19871
|
+
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
19872
|
+
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
|
19873
|
+
} break;
|
19874
|
+
case GGUF_TYPE_STRING:
|
19875
|
+
{
|
19876
|
+
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
19877
|
+
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
19878
|
+
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
19879
|
+
}
|
19880
|
+
} break;
|
19881
|
+
case GGUF_TYPE_ARRAY:
|
19882
|
+
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
19883
|
+
};
|
19884
|
+
} break;
|
19885
|
+
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
19886
|
+
};
|
19887
|
+
|
19888
|
+
if (!ok) {
|
19889
|
+
break;
|
19890
|
+
}
|
19891
|
+
}
|
19892
|
+
|
19893
|
+
if (!ok) {
|
19894
|
+
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
|
19895
|
+
fclose(file);
|
19896
|
+
gguf_free(ctx);
|
19897
|
+
return NULL;
|
19898
|
+
}
|
19899
|
+
}
|
19900
|
+
|
19901
|
+
// read the tensor infos
|
19902
|
+
{
|
19903
|
+
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
19904
|
+
|
19905
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19906
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
19907
|
+
|
19908
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
19909
|
+
info->ne[j] = 1;
|
19910
|
+
}
|
19911
|
+
|
19912
|
+
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
19913
|
+
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
19914
|
+
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
19915
|
+
if (ctx->header.version == 1) {
|
19916
|
+
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
19917
|
+
uint32_t t = 0;
|
19918
|
+
ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
|
19919
|
+
info->ne[j] = t;
|
19920
|
+
} else {
|
19921
|
+
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
19922
|
+
}
|
19923
|
+
}
|
19924
|
+
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
19925
|
+
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
19926
|
+
|
19927
|
+
if (!ok) {
|
19928
|
+
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
19929
|
+
fclose(file);
|
19930
|
+
gguf_free(ctx);
|
19931
|
+
return NULL;
|
19932
|
+
}
|
19933
|
+
}
|
19934
|
+
}
|
19935
|
+
|
19936
|
+
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
|
19937
|
+
|
19938
|
+
int alignment_idx = gguf_find_key(ctx, "general.alignment");
|
19939
|
+
if (alignment_idx != -1) {
|
19940
|
+
ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
|
19941
|
+
}
|
19942
|
+
|
19943
|
+
// we require the data section to be aligned, so take into account any padding
|
19944
|
+
{
|
19945
|
+
const size_t offset_pad = offset % ctx->alignment;
|
19946
|
+
|
19947
|
+
if (offset_pad != 0) {
|
19948
|
+
offset += ctx->alignment - offset_pad;
|
19949
|
+
fseek(file, offset, SEEK_SET);
|
19950
|
+
}
|
19951
|
+
}
|
19952
|
+
|
19953
|
+
// store the current file offset - this is where the data section starts
|
19954
|
+
ctx->offset = offset;
|
19955
|
+
|
19956
|
+
// compute the total size of the data section, taking into account the alignment
|
19957
|
+
{
|
19958
|
+
ctx->size = 0;
|
19959
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
19960
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
19961
|
+
|
19962
|
+
const int64_t ne =
|
19963
|
+
(int64_t) info->ne[0] *
|
19964
|
+
(int64_t) info->ne[1] *
|
19965
|
+
(int64_t) info->ne[2] *
|
19966
|
+
(int64_t) info->ne[3];
|
19967
|
+
|
19968
|
+
if (ne % ggml_blck_size(info->type) != 0) {
|
19969
|
+
fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
|
19970
|
+
__func__, info->name.data, ne, ggml_blck_size(info->type));
|
19971
|
+
fclose(file);
|
19972
|
+
gguf_free(ctx);
|
19973
|
+
return NULL;
|
19974
|
+
}
|
19975
|
+
|
19976
|
+
const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
|
19977
|
+
|
19978
|
+
ctx->size += GGML_PAD(size_cur, ctx->alignment);
|
19979
|
+
}
|
19980
|
+
}
|
19981
|
+
|
19982
|
+
// load the tensor data only if requested
|
19983
|
+
if (params.ctx != NULL) {
|
19984
|
+
// if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
|
19985
|
+
// otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
|
19986
|
+
// the ggml_tensor structs to the appropriate locations in the binary blob
|
19987
|
+
|
19988
|
+
// compute the exact size needed for the new ggml_context
|
19989
|
+
const size_t mem_size =
|
19990
|
+
params.no_alloc ?
|
19991
|
+
(ctx->header.n_tensors )*ggml_tensor_overhead() :
|
19992
|
+
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
|
19993
|
+
|
19994
|
+
struct ggml_init_params pdata = {
|
19995
|
+
.mem_size = mem_size,
|
19996
|
+
.mem_buffer = NULL,
|
19997
|
+
.no_alloc = params.no_alloc,
|
19998
|
+
};
|
19999
|
+
|
20000
|
+
*params.ctx = ggml_init(pdata);
|
20001
|
+
|
20002
|
+
struct ggml_context * ctx_data = *params.ctx;
|
20003
|
+
|
20004
|
+
struct ggml_tensor * data = NULL;
|
20005
|
+
|
20006
|
+
if (params.no_alloc == false) {
|
20007
|
+
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
|
20008
|
+
|
20009
|
+
ok = ok && data != NULL;
|
20010
|
+
|
20011
|
+
// read the binary blob with the tensor data
|
20012
|
+
ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
|
20013
|
+
|
20014
|
+
if (!ok) {
|
20015
|
+
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
|
20016
|
+
fclose(file);
|
20017
|
+
ggml_free(ctx_data);
|
20018
|
+
gguf_free(ctx);
|
20019
|
+
return NULL;
|
20020
|
+
}
|
20021
|
+
|
20022
|
+
ctx->data = data->data;
|
20023
|
+
}
|
20024
|
+
|
20025
|
+
ggml_set_no_alloc(ctx_data, true);
|
20026
|
+
|
20027
|
+
// create the tensors
|
20028
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
20029
|
+
const int64_t ne[GGML_MAX_DIMS] = {
|
20030
|
+
ctx->infos[i].ne[0],
|
20031
|
+
ctx->infos[i].ne[1],
|
20032
|
+
ctx->infos[i].ne[2],
|
20033
|
+
ctx->infos[i].ne[3],
|
20034
|
+
};
|
20035
|
+
|
20036
|
+
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
|
20037
|
+
|
20038
|
+
ok = ok && cur != NULL;
|
20039
|
+
|
20040
|
+
ggml_set_name(cur, ctx->infos[i].name.data);
|
20041
|
+
|
20042
|
+
if (!ok) {
|
20043
|
+
break;
|
20044
|
+
}
|
20045
|
+
|
20046
|
+
// point the data member to the appropriate location in the binary blob using the tensor infos
|
20047
|
+
if (params.no_alloc == false) {
|
20048
|
+
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
|
20049
|
+
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
|
20050
|
+
}
|
20051
|
+
}
|
20052
|
+
|
20053
|
+
if (!ok) {
|
20054
|
+
fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
|
20055
|
+
fclose(file);
|
20056
|
+
ggml_free(ctx_data);
|
20057
|
+
gguf_free(ctx);
|
20058
|
+
return NULL;
|
20059
|
+
}
|
20060
|
+
|
20061
|
+
ggml_set_no_alloc(ctx_data, params.no_alloc);
|
20062
|
+
}
|
20063
|
+
|
20064
|
+
fclose(file);
|
20065
|
+
|
20066
|
+
return ctx;
|
20067
|
+
}
|
20068
|
+
|
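End to end, gguf_init_from_file validates the magic, branches on the header version for the v1 32-bit fields, and optionally materializes all tensors into a fresh ggml_context backed by a single I8 blob. A minimal consumer sketch (error handling elided; "model.gguf" is a placeholder path):

    #include <stdio.h>

    // Load a GGUF file and print basic info. Assumes the gguf API
    // declarations from this file are in scope.
    int main(void) {
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc =*/ false,   // read the tensor data blob as well
            /*.ctx      =*/ &ctx_data,
        };

        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (!ctx) {
            return 1;
        }

        printf("gguf version: %d\n", gguf_get_version(ctx));
        printf("kv pairs:     %d\n", gguf_get_n_kv(ctx));

        gguf_free(ctx);
        ggml_free(ctx_data);
        return 0;
    }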
20069
|
+
void gguf_free(struct gguf_context * ctx) {
|
20070
|
+
if (ctx == NULL) {
|
20071
|
+
return;
|
20072
|
+
}
|
20073
|
+
|
20074
|
+
if (ctx->kv) {
|
20075
|
+
// free string memory - not great..
|
20076
|
+
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
20077
|
+
struct gguf_kv * kv = &ctx->kv[i];
|
20078
|
+
|
20079
|
+
if (kv->key.data) {
|
20080
|
+
free(kv->key.data);
|
20081
|
+
}
|
20082
|
+
|
20083
|
+
if (kv->type == GGUF_TYPE_STRING) {
|
20084
|
+
if (kv->value.str.data) {
|
20085
|
+
free(kv->value.str.data);
|
20086
|
+
}
|
20087
|
+
}
|
20088
|
+
|
20089
|
+
if (kv->type == GGUF_TYPE_ARRAY) {
|
20090
|
+
if (kv->value.arr.data) {
|
20091
|
+
if (kv->value.arr.type == GGUF_TYPE_STRING) {
|
20092
|
+
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
20093
|
+
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
20094
|
+
if (str->data) {
|
20095
|
+
free(str->data);
|
20096
|
+
}
|
20097
|
+
}
|
20098
|
+
}
|
20099
|
+
free(kv->value.arr.data);
|
20100
|
+
}
|
20101
|
+
}
|
20102
|
+
}
|
20103
|
+
|
20104
|
+
free(ctx->kv);
|
20105
|
+
}
|
20106
|
+
|
20107
|
+
if (ctx->infos) {
|
20108
|
+
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
20109
|
+
struct gguf_tensor_info * info = &ctx->infos[i];
|
20110
|
+
|
20111
|
+
if (info->name.data) {
|
20112
|
+
free(info->name.data);
|
20113
|
+
}
|
20114
|
+
}
|
20115
|
+
|
20116
|
+
free(ctx->infos);
|
20117
|
+
}
|
20118
|
+
|
20119
|
+
GGML_ALIGNED_FREE(ctx);
|
20120
|
+
}
|
20121
|
+
|
20122
|
+
+const char * gguf_type_name(enum gguf_type type) {
+    return GGUF_TYPE_NAME[type];
+}
+
+int gguf_get_version(struct gguf_context * ctx) {
+    return ctx->header.version;
+}
+
+size_t gguf_get_alignment(struct gguf_context * ctx) {
+    return ctx->alignment;
+}
+
+size_t gguf_get_data_offset(struct gguf_context * ctx) {
+    return ctx->offset;
+}
+
+void * gguf_get_data(struct gguf_context * ctx) {
+    return ctx->data;
+}
+
+int gguf_get_n_kv(struct gguf_context * ctx) {
+    return ctx->header.n_kv;
+}
+
+int gguf_find_key(struct gguf_context * ctx, const char * key) {
+    // return -1 if key not found
+    int keyfound = -1;
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    for (int i = 0; i < n_kv; ++i) {
+        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
+            keyfound = i;
+            break;
+        }
+    }
+
+    return keyfound;
+}
+
+const char * gguf_get_key(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].key.data;
+}
+
+enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].type;
+}
+
+enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.type;
+}
+
+const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.data;
+}
+
+const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+    struct gguf_kv * kv = &ctx->kv[key_id];
+    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
+    return str->data;
+}
+
+int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.n;
+}
+
+uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint8;
+}
+
+int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int8;
+}
+
+uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint16;
+}
+
+int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int16;
+}
+
+uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint32;
+}
+
+int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int32;
+}
+
+float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.float32;
+}
+
+uint64_t gguf_get_val_u64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint64;
+}
+
+int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int64;
+}
+
+double gguf_get_val_f64(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.float64;
+}
+
+bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.bool_;
+}
+
+const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.str.data;
+}
+
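The getters above are the typed read side of the KV store: look up an index with `gguf_find_key`, check the declared type, then read with the matching accessor. A short sketch (assumes a loaded `ctx`; the key names are purely illustrative):

    // scalar / string value
    const int idx = gguf_find_key(ctx, "general.architecture"); // -1 if absent
    if (idx >= 0 && gguf_get_kv_type(ctx, idx) == GGUF_TYPE_STRING) {
        printf("arch: %s\n", gguf_get_val_str(ctx, idx));
    }

    // arrays expose element type and length separately
    const int aidx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
    if (aidx >= 0 && gguf_get_kv_type(ctx, aidx) == GGUF_TYPE_ARRAY &&
        gguf_get_arr_type(ctx, aidx) == GGUF_TYPE_STRING) {
        const int n = gguf_get_arr_n(ctx, aidx);
        for (int j = 0; j < n && j < 5; ++j) {
            printf("token %d: %s\n", j, gguf_get_arr_str(ctx, aidx, j));
        }
    }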
+int gguf_get_n_tensors(struct gguf_context * ctx) {
+    return ctx->header.n_tensors;
+}
+
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int tensorfound = -1;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensorfound = i;
+            break;
+        }
+    }
+
+    return tensorfound;
+}
+
+size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+    return ctx->infos[i].offset;
+}
+
+char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+    return ctx->infos[i].name.data;
+}
+
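The tensor getters mirror the KV ones. Note that `gguf_get_tensor_offset` is relative to the start of the data section, so a tensor's absolute file position is `gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i)`. Sketch (tensor name illustrative):

    const int ti = gguf_find_tensor(ctx, "token_embd.weight"); // -1 if not present
    if (ti >= 0) {
        const size_t pos = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, ti);
        printf("%s starts at file offset %zu\n", gguf_get_tensor_name(ctx, ti), pos);
    }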
+// returns the index
+static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        return idx;
+    }
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
+    ctx->kv[n_kv].key.n    = strlen(key);
+    ctx->kv[n_kv].key.data = strdup(key);
+    ctx->header.n_kv++;
+
+    return n_kv;
+}
+
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_UINT8;
+    ctx->kv[idx].value.uint8 = val;
+}
+
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type       = GGUF_TYPE_INT8;
+    ctx->kv[idx].value.int8 = val;
+}
+
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT16;
+    ctx->kv[idx].value.uint16 = val;
+}
+
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT16;
+    ctx->kv[idx].value.int16 = val;
+}
+
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT32;
+    ctx->kv[idx].value.uint32 = val;
+}
+
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT32;
+    ctx->kv[idx].value.int32 = val;
+}
+
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type          = GGUF_TYPE_FLOAT32;
+    ctx->kv[idx].value.float32 = val;
+}
+
+void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT64;
+    ctx->kv[idx].value.uint64 = val;
+}
+
+void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT64;
+    ctx->kv[idx].value.int64 = val;
+}
+
+void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type          = GGUF_TYPE_FLOAT64;
+    ctx->kv[idx].value.float64 = val;
+}
+
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_BOOL;
+    ctx->kv[idx].value.bool_ = val;
+}
+
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.str.n    = strlen(val);
+    ctx->kv[idx].value.str.data = strdup(val);
+}
+
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = type;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
+    memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
+}
+
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
+    for (int i = 0; i < n; i++) {
+        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
+        str->n    = strlen(data[i]);
+        str->data = strdup(data[i]);
+    }
+}
+
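Every setter funnels through `gguf_get_or_add_key`, so writing the same key twice overwrites the value in place (the previous string value, if any, is not freed, and the `realloc` result is unchecked). Sketch of populating metadata (key names illustrative):

    gguf_set_val_u32 (ctx, "example.context_length", 4096);
    gguf_set_val_f32 (ctx, "example.rope.scale", 1.0f);
    gguf_set_val_str (ctx, "example.name", "my-model");

    const int32_t blocks[2] = { 32, 32 };
    gguf_set_arr_data(ctx, "example.blocks", GGUF_TYPE_INT32, blocks, 2);

    const char * langs[2] = { "en", "ja" };
    gguf_set_arr_str (ctx, "example.languages", langs, 2);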
+// set or add KV pairs from another context
+void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
+    for (uint32_t i = 0; i < src->header.n_kv; i++) {
+        switch (src->kv[i].type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);    break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);     break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16);   break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16);    break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
+            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64);   break;
+            case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64);    break;
+            case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64);  break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
+                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
+                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
+                        }
+                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
+                        free(data);
+                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
+                        GGML_ASSERT(false && "nested arrays not supported");
+                    } else {
+                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
+                    }
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+        }
+    }
+}
+
+void gguf_add_tensor(
+             struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    const int idx = ctx->header.n_tensors;
+    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+
+    ctx->infos[idx].name.n    = strlen(tensor->name);
+    ctx->infos[idx].name.data = strdup(tensor->name);
+
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        ctx->infos[idx].ne[i] = 1;
+    }
+
+    ctx->infos[idx].n_dims = tensor->n_dims;
+    for (int i = 0; i < tensor->n_dims; i++) {
+        ctx->infos[idx].ne[i] = tensor->ne[i];
+    }
+
+    ctx->infos[idx].type   = tensor->type;
+    ctx->infos[idx].offset = 0;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
+
+    if (ctx->header.n_tensors > 0) {
+        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
+    }
+
+    ctx->header.n_tensors++;
+}
+
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
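With `gguf_add_tensor` appending an info record (offsets padded to `ctx->alignment` via GGML_PAD) and `gguf_set_tensor_data` recomputing the offsets of everything that follows, a full authoring round-trip becomes possible together with `gguf_write_to_file` defined further below. A minimal sketch (assumes `gguf_init_empty` from ggml.h; all names illustrative):

    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(ip);

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 4, 4);
    ggml_set_name(t, "example.weight");

    struct gguf_context * gctx = gguf_init_empty();
    gguf_set_val_str(gctx, "example.name", "tiny");
    gguf_add_tensor (gctx, t);

    gguf_write_to_file(gctx, "tiny.gguf", false); // false = tensor data included

    gguf_free(gctx);
    ggml_free(ctx0);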
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n, sizeof(val->n), 1, file);
+//    fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    }
+    buf->offset += sizeof(val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    }
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val, el_size);
+    }
+    buf->offset += el_size;
+}
+
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+    // write header
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+
+    // write key-value pairs
+    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        struct gguf_kv * kv = &ctx->kv[i];
+
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
+
+        switch (kv->type) {
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
+            case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
+            case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+
+                    switch (kv->value.arr.type) {
+                        case GGUF_TYPE_UINT8:
+                        case GGUF_TYPE_INT8:
+                        case GGUF_TYPE_UINT16:
+                        case GGUF_TYPE_INT16:
+                        case GGUF_TYPE_UINT32:
+                        case GGUF_TYPE_INT32:
+                        case GGUF_TYPE_FLOAT32:
+                        case GGUF_TYPE_UINT64:
+                        case GGUF_TYPE_INT64:
+                        case GGUF_TYPE_FLOAT64:
+                        case GGUF_TYPE_BOOL:
+                            {
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                            } break;
+                        case GGUF_TYPE_STRING:
+                            {
+                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                }
+                            } break;
+                        case GGUF_TYPE_ARRAY:
+                        case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                    };
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+        };
+    }
+
+    // write tensor infos
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
+
+        gguf_bwrite_str(buf, &info->name);
+        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
+        for (uint32_t j = 0; j < info->n_dims; ++j) {
+            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
+        }
+        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset     = buf->offset;
+        const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
+
+        if (offset_pad != offset) {
+            uint8_t pad = 0;
+            for (size_t i = 0; i < offset_pad - offset; ++i) {
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
+            }
+        }
+    }
+
+    if (only_meta) {
+        return;
+    }
+
+    size_t offset = 0;
+
+    // write tensor data
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
+
+        const size_t size     = info->size;
+        const size_t size_pad = GGML_PAD(size, ctx->alignment);
+
+        gguf_bwrite_el(buf, info->data, size);
+
+        if (size_pad != size) {
+            uint8_t pad = 0;
+            for (size_t j = 0; j < size_pad - size; ++j) {
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
+            }
+        }
+
+        GGML_ASSERT(offset == info->offset);
+
+        offset += size_pad;
+    }
+}
+
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
+
+    fclose(file);
+}
+
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
+
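`gguf_get_meta_size` relies on the buffered writer's measuring mode: `gguf_buf_init(0)` leaves `buf.data` NULL, so every `gguf_bwrite_*` call only advances `offset` without copying. A two-pass metadata snapshot is then straightforward (sketch, assuming a populated context `gctx`):

    const size_t meta_size = gguf_get_meta_size(gctx); // pass 1: measure only
    void * meta = malloc(meta_size);
    gguf_get_meta_data(gctx, meta);                    // pass 2: header + KVs + tensor infos
    // ... hand `meta` to a consumer that wants the metadata up front ...
    free(meta);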
+////////////////////////////////////////////////////////////////////////////////
+
+int ggml_cpu_has_avx(void) {
+#if defined(__AVX__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx2(void) {
+#if defined(__AVX2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512(void) {
+#if defined(__AVX512F__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vbmi(void) {
+#if defined(__AVX512VBMI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_avx512_vnni(void) {
+#if defined(__AVX512VNNI__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_fma(void) {
+#if defined(__FMA__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_neon(void) {
+#if defined(__ARM_NEON)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+int ggml_cpu_has_arm_fma(void) {
+#if defined(__ARM_FEATURE_FMA)
     return 1;
 #else
     return 0;
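These helpers expose compile-time SIMD capability as a runtime-queryable API, one flag per instruction set. A capability printout might look like this (sketch; the subset shown is arbitrary and includes the new `ggml_cpu_has_ssse3` from the hunk below):

    printf("AVX=%d AVX2=%d FMA=%d NEON=%d SSSE3=%d\n",
           ggml_cpu_has_avx(), ggml_cpu_has_avx2(), ggml_cpu_has_fma(),
           ggml_cpu_has_neon(), ggml_cpu_has_ssse3());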
@@ -18685,6 +20817,14 @@ int ggml_cpu_has_sse3(void) {
 #endif
 }
 
+int ggml_cpu_has_ssse3(void) {
+#if defined(__SSSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;