llama_cpp 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/examples/README.md +32 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +553 -313
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +157 -19
- data/ext/llama_cpp/src/ggml-metal.metal +149 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +736 -98
- data/ext/llama_cpp/src/ggml.h +140 -9
- data/ext/llama_cpp/src/llama.cpp +58 -31
- data/ext/llama_cpp/src/llama.h +8 -9
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -35,6 +35,12 @@
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
 #if defined(_WIN32)
 
 #include <windows.h>
@@ -106,6 +112,7 @@ typedef void* thread_ret_t;
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
+#define GGML_GELU_QUICK_FP16
 #define GGML_SILU_FP16
 
 #define GGML_SOFT_MAX_UNROLL 4
@@ -334,6 +341,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 // precomputed gelu table for f16 (128 KB)
 static ggml_fp16_t table_gelu_f16[1 << 16];
 
+// precomputed quick gelu table for f16 (128 KB)
+static ggml_fp16_t table_gelu_quick_f16[1 << 16];
+
 // precomputed silu table for f16 (128 KB)
 static ggml_fp16_t table_silu_f16[1 << 16];
 
@@ -1671,14 +1681,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
     res = GGML_F32x4_REDUCE_ONE(x[0]); \
 }
@@ -1709,14 +1722,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F16x8_MUL vmulq_f16
 #define GGML_F16x8_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F16_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
     const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
     const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1783,14 +1799,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x8_MUL _mm256_mul_ps
 #define GGML_F32x8_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                  _mm256_extractf128_ps(x[0], 1)); \
@@ -1880,14 +1899,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL vec_mul
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
     res = vec_extract(x[0], 0) + \
           vec_extract(x[0], 1) + \
@@ -1943,14 +1965,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL wasm_f32x4_mul
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
     res = wasm_f32x4_extract_lane(x[0], 0) + \
           wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2005,14 +2030,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16x4_MUL wasm_f32x4_mul
 #define GGML_F16x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F16_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
     res = wasm_f32x4_extract_lane(x[0], 0) + \
           wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2054,14 +2082,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F32x4_MUL _mm_mul_ps
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
     res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
@@ -3350,6 +3381,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 
 static const float GELU_COEF_A = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 
 inline static float ggml_gelu_f32(float x) {
@@ -3380,6 +3412,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 }
 #endif
 
+inline static float ggml_gelu_quick_f32(float x) {
+    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
+}
+
+//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+//    const uint16_t * i16 = (const uint16_t *) x;
+//    for (int i = 0; i < n; ++i) {
+//        y[i] = table_gelu_quick_f16[i16[i]];
+//    }
+//}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
+    }
+}
+#else
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]);
+    }
+}
+#endif
+
 // Sigmoid Linear Unit (SiLU) function
 inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
@@ -3610,6 +3670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "STEP",
     "RELU",
     "GELU",
+    "GELU_QUICK",
     "SILU",
     "SILU_BACK",
     "NORM",
@@ -3638,12 +3699,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ROPE_BACK",
     "ALIBI",
     "CLAMP",
-    "
-    "
+    "CONV_1D_S1_PH",
+    "CONV_1D_S2_PH",
+    "CONV_2D_SK_P0",
 
     "FLASH_ATTN",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
+    "WIN_PART",
+    "WIN_UNPART",
 
     "MAP_UNARY",
     "MAP_BINARY",
@@ -3652,7 +3716,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3678,6 +3742,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "step(x)",
     "relu(x)",
     "gelu(x)",
+    "gelu_quick(x)",
     "silu(x)",
     "silu_back(x)",
     "norm(x)",
@@ -3706,12 +3771,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rope_back(x)",
     "alibi(x)",
     "clamp(x)",
-    "
-    "
+    "conv_1d_s1_ph(x)",
+    "conv_1d_s2_ph(x)",
+    "conv_2d_sk_p0(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
+    "win_part(x)",
+    "win_unpart(x)",
 
     "f(x)",
     "f(x,y)",
@@ -3720,7 +3788,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4011,7 +4079,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     // initialize time system (required on Windows)
     ggml_time_init();
 
-    // initialize GELU, SILU and EXP F32 tables
+    // initialize GELU, Quick GELU, SILU and EXP F32 tables
     {
         const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
@@ -4021,13 +4089,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             memcpy(&ii, &ui, sizeof(ii));
             const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
             table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+            table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
             table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
             table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
         }
 
         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
-        GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
     // initialize g_state
@@ -4148,14 +4217,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
 
-void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
     return ctx->mem_buffer;
 }
 
-size_t ggml_get_mem_size(struct ggml_context * ctx) {
+size_t ggml_get_mem_size(const struct ggml_context * ctx) {
     return ctx->mem_size;
 }
 
+size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
+    size_t max_size = 0;
+
+    struct ggml_object * obj = ctx->objects_begin;
+
+    while (obj != NULL) {
+        struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+
+        const size_t size = ggml_nbytes(tensor);
+
+        if (max_size < size) {
+            max_size = size;
+        }
+
+        obj = obj->next;
+    }
+
+    return max_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
@@ -4639,9 +4728,10 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
     return tensor->name;
 }
 
-
+struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
     strncpy(tensor->name, name, sizeof(tensor->name));
     tensor->name[sizeof(tensor->name) - 1] = '\0';
+    return tensor;
 }
 
 struct ggml_tensor * ggml_view_tensor(
@@ -5420,6 +5510,40 @@ struct ggml_tensor * ggml_gelu_inplace(
     return ggml_gelu_impl(ctx, a, true);
 }
 
+// ggml_gelu_quick
+
+struct ggml_tensor * ggml_gelu_quick_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_GELU_QUICK;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_gelu_quick(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_gelu_quick_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_gelu_quick_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_gelu_quick_impl(ctx, a, true);
+}
+
 // ggml_silu
 
 struct ggml_tensor * ggml_silu_impl(
@@ -6619,7 +6743,7 @@ struct ggml_tensor * ggml_clamp(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx,
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
 
     ((float *) b->data)[0] = min;
     ((float *) b->data)[1] = max;
@@ -6634,9 +6758,9 @@ struct ggml_tensor * ggml_clamp(
     return result;
 }
 
-//
+// ggml_conv_1d_s1_ph
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_conv_1d_s1_ph(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
@@ -6653,7 +6777,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
     const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
-    result->op =
+    result->op   = GGML_OP_CONV_1D_S1_PH;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = b;
@@ -6661,9 +6785,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
     return result;
 }
 
-//
+// ggml_conv_1d_s2_ph
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_conv_1d_s2_ph(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
@@ -6680,7 +6804,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
     const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
-    result->op =
+    result->op   = GGML_OP_CONV_1D_S2_PH;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+
+    return result;
+}
+
+// ggml_conv_2d_sk_p0
+
+struct ggml_tensor * ggml_conv_2d_sk_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
+    GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op   = GGML_OP_CONV_2D_SK_P0;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = b;
@@ -6814,6 +6966,89 @@ struct ggml_tensor * ggml_flash_attn_back(
     return result;
 }
 
+// ggml_win_part
+
+struct ggml_tensor * ggml_win_part(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int w) {
+    GGML_ASSERT(a->ne[3] == 1);
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    // padding
+    const int px = (w - a->ne[1]%w)%w;
+    const int py = (w - a->ne[2]%w)%w;
+
+    const int npx = (px + a->ne[1])/w;
+    const int npy = (py + a->ne[2])/w;
+    const int np  = npx*npy;
+
+    const int64_t ne[4] = { a->ne[0], w, w, np, };
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+
+    ((int32_t *) b->data)[0] = npx;
+    ((int32_t *) b->data)[1] = npy;
+    ((int32_t *) b->data)[2] = w;
+
+    ggml_scratch_load(ctx);
+
+    result->op     = GGML_OP_WIN_PART;
+    result->grad   = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0   = a;
+    result->src1   = NULL;
+    result->opt[0] = b;
+
+    return result;
+}
+
+// ggml_win_unpart
+
+struct ggml_tensor * ggml_win_unpart(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int w0,
+        int h0,
+        int w) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+
+    ((int32_t *) b->data)[0] = w;
+
+    ggml_scratch_load(ctx);
+
+    result->op     = GGML_OP_WIN_UNPART;
+    result->grad   = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0   = a;
+    result->src1   = NULL;
+    result->opt[0] = b;
+
+    return result;
+}
 
 // ggml_map_unary
 
@@ -7892,7 +8127,7 @@ static void ggml_compute_forward_add_q_f32(
 
         void  * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
         float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
-        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1 + i2*nb2 + i3*
+        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
         assert(ne00 % 32 == 0);
 
@@ -9453,8 +9688,65 @@ static void ggml_compute_forward_gelu(
                 GGML_ASSERT(false);
             } break;
     }
+}
+
+// ggml_compute_forward_gelu_quick
+
+static void ggml_compute_forward_gelu_quick_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
 
-    //
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_quick_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_quick(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_quick_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
 }
 
 // ggml_compute_forward_silu
@@ -10852,7 +11144,7 @@ static void ggml_compute_forward_set_f32(
     const int im2 = (ne12 == 0 ? 0 : ne12-1);
     const int im3 = (ne13 == 0 ? 0 : ne13-1);
 
-    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3
+    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
 
     GGML_ASSERT(nb10 == sizeof(float));
 
@@ -11573,8 +11865,9 @@ static void ggml_compute_forward_alibi_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-
-
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_nelements(src1) == 3);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -11637,8 +11930,9 @@ static void ggml_compute_forward_alibi_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-
-
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_nelements(src1) == 3);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -11740,15 +12034,16 @@ static void ggml_compute_forward_clamp_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-
-
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(src1) == 2);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const
-    const
+    const float min = ((float *) src1->data)[0];
+    const float max = ((float *) src1->data)[1];
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -12306,9 +12601,9 @@ static void ggml_compute_forward_rope_back(
     }
 }
 
-//
+// ggml_compute_forward_conv_1d_s1_ph
 
-static void
+static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12428,7 +12723,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
     }
 }
 
-static void
+static void ggml_compute_forward_conv_1d_s1_ph_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12548,7 +12843,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
     }
 }
 
-static void
+static void ggml_compute_forward_conv_1d_s1_ph(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
@@ -12556,11 +12851,11 @@ static void ggml_compute_forward_conv_1d_1s(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-
+                ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
             } break;
         case GGML_TYPE_F32:
             {
-
+                ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -12569,9 +12864,9 @@ static void ggml_compute_forward_conv_1d_1s(
     }
 }
 
-//
+// ggml_compute_forward_conv_1d_s2_ph
 
-static void
+static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12691,7 +12986,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
     }
 }
 
-static void
+static void ggml_compute_forward_conv_1d_s2_ph_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12811,7 +13106,143 @@ static void ggml_compute_forward_conv_1d_2s_f32(
     }
 }
 
-static void
+static void ggml_compute_forward_conv_1d_s2_ph(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_conv_2d_sk_p0
+
+static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    //const int ne03 = src0->ne[3];
+
+    const int ne10 = src1->ne[0];
+    //const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    //const int ne13 = src1->ne[3];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    //const int ne3 = dst->ne[3];
+    //const int ne = ne0*ne1*ne2*ne3;
+
+    const int nb00 = src0->nb[0];
+    //const int nb01 = src0->nb[1];
+    //const int nb02 = src0->nb[2];
+    const int nb03 = src0->nb[3];
+
+    const int nb10 = src1->nb[0];
+    //const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    //const int nb13 = src1->nb[3];
+
+    //const int nb0 = dst->nb[0];
+    //const int nb1 = dst->nb[1];
+    const int nb2 = dst->nb[2];
+    //const int nb3 = dst->nb[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk0 = ne00;
+    const int nk1 = ne01;
+
+    // size of the convolution row - the kernel size unrolled across all channels
+    // round-up so it is more suitable for SIMD
+    const int ew0 = ggml_up32(nk0*nk1*ne02);
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        // TODO: fix this memset (wsize is overestimated)
+        memset(params->wdata, 0, params->wsize);
+
+        // prepare source data (src1)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i12*nb12);
+                ggml_fp16_t * dst_data = wdata;
+
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+                                    GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+    for (int i2 = ip0; i2 < ip1; i2++) {
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+
+        for (int i1 = 0; i1 < ne1; ++i1) {
+            for (int i0 = 0; i0 < ne0; ++i0) {
+                ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+                        (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+                        (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_sk_p0(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12819,11 +13250,12 @@ static void ggml_compute_forward_conv_1d_2s(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-
+                ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
             } break;
         case GGML_TYPE_F32:
             {
-
+                //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
+                GGML_ASSERT(false);
             } break;
         default:
             {
@@ -13926,6 +14358,145 @@ static void ggml_compute_forward_flash_attn_back(
     }
 }
 
+// ggml_compute_forward_win_part
+
+static void ggml_compute_forward_win_part_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
+
+    const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
+    const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
+    const int32_t w    = ((const int32_t *)(opt0->data))[2];
+
+    assert(ne00 == ne0);
+    assert(ne3  == nep0*nep1);
+
+    // TODO: optimize / multi-thread
+    for (int py = 0; py < nep1; ++py) {
+        for (int px = 0; px < nep0; ++px) {
+            const int64_t i3 = py*nep0 + px;
+            for (int64_t i2 = 0; i2 < ne2; ++i2) {
+                for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                        const int64_t i02 = py*w + i2;
+                        const int64_t i01 = px*w + i1;
+                        const int64_t i00 = i0;
+
+                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
+                        const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
+
+                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
+                            ((float *) dst->data)[i] = 0.0f;
+                        } else {
+                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_part(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_win_unpart
+
+static void ggml_compute_forward_win_unpart_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    //const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+
+    const int32_t w = ((const int32_t *)(opt0->data))[0];
+
+    // padding
+    const int px = (w - ne1%w)%w;
+    //const int py = (w - ne2%w)%w;
+
+    const int npx = (px + ne1)/w;
+    //const int npy = (py + ne2)/w;
+
+    assert(ne0 == ne00);
+
+    // TODO: optimize / multi-thread
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int ip2 = i2/w;
+                const int ip1 = i1/w;
+
+                const int64_t i02 = i2%w;
+                const int64_t i01 = i1%w;
+                const int64_t i00 = i0;
+
+                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
+                const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
+
+                ((float *) dst->data)[j] = ((float *) src0->data)[i];
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_unpart(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_map_unary
 
 static void ggml_compute_forward_map_unary_f32(
@@ -14398,6 +14969,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_gelu(params, tensor->src0, tensor);
             } break;
+        case GGML_OP_GELU_QUICK:
+            {
+                ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
+            } break;
         case GGML_OP_SILU:
             {
                 ggml_compute_forward_silu(params, tensor->src0, tensor);
@@ -14502,19 +15077,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
             } break;
-        case
+        case GGML_OP_CONV_1D_S1_PH:
+            {
+                ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
+            } break;
+        case GGML_OP_CONV_1D_S2_PH:
             {
-
+                ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
             } break;
-        case
+        case GGML_OP_CONV_2D_SK_P0:
             {
-
+                ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
             } break;
         case GGML_OP_FLASH_ATTN:
             {
-                int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
+                const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
                 GGML_ASSERT(t == 0 || t == 1);
-                bool masked = t != 0;
+                const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
             } break;
         case GGML_OP_FLASH_FF:
@@ -14528,6 +15107,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 bool masked = t != 0;
                 ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
             } break;
+        case GGML_OP_WIN_PART:
+            {
+                ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
+            } break;
+        case GGML_OP_WIN_UNPART:
+            {
+                ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
+            } break;
         case GGML_OP_MAP_UNARY:
             {
                 const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
@@ -14799,6 +15386,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_GELU_QUICK:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ALIBI:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15161,11 +15752,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                     // noop
                 }
             } break;
-        case
+        case GGML_OP_CONV_1D_S1_PH:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
+        case GGML_OP_CONV_1D_S2_PH:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case
+        case GGML_OP_CONV_2D_SK_P0:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
@@ -15334,6 +15929,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // not supported
             } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
             {
@@ -15742,6 +16339,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_MUL:
             case GGML_OP_GELU:
+            case GGML_OP_GELU_QUICK:
            case GGML_OP_SILU:
            case GGML_OP_SILU_BACK:
            case GGML_OP_NORM:
@@ -15848,8 +16446,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 {
                     node->n_tasks = 1; //TODO
                 } break;
-            case
-            case
+            case GGML_OP_CONV_1D_S1_PH:
+            case GGML_OP_CONV_1D_S2_PH:
                 {
                     node->n_tasks = n_threads;
 
@@ -15876,6 +16474,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         GGML_ASSERT(false);
                     }
 
+                    work_size = MAX(work_size, cur);
+                } break;
+            case GGML_OP_CONV_2D_SK_P0:
+                {
+                    node->n_tasks = n_threads;
+
+                    GGML_ASSERT(node->src1->ne[3] == 1);
+
+                    const int64_t ne00 = node->src0->ne[0]; // W
+                    const int64_t ne01 = node->src0->ne[1]; // H
+                    const int64_t ne02 = node->src0->ne[2]; // C
+                    const int64_t ne03 = node->src0->ne[3]; // N
+
+                    const int64_t ne10 = node->src1->ne[0]; // W
+                    const int64_t ne11 = node->src1->ne[1]; // H
+                    const int64_t ne12 = node->src1->ne[2]; // C
+
+                    const int64_t nk = ne00*ne01;
+
+                    UNUSED(ne02);
+                    UNUSED(ne03);
+                    UNUSED(nk);
+
+                    size_t cur = 0;
+
+                    if (node->src0->type == GGML_TYPE_F16 &&
+                        node->src1->type == GGML_TYPE_F32) {
+                        cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+                    } else if (node->src0->type == GGML_TYPE_F32 &&
+                               node->src1->type == GGML_TYPE_F32) {
+                        cur = sizeof(float)* (ne10*ne11*ne12);
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_FLASH_ATTN:
@@ -15937,6 +16570,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                     work_size = MAX(work_size, cur);
                 } break;
+            case GGML_OP_WIN_PART:
+            case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
                 {
@@ -16469,16 +17104,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
 
         if (!*ctx_data) {
             fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+            fclose(fin);
             return result;
         }
     }
 
     data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
 
-
-
-
-
+    {
+        const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+        if (ret != fsize) {
+            fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+            fclose(fin);
+            return result;
+        }
     }
 
     fclose(fin);
@@ -17598,7 +18237,6 @@ GGML_API void ggml_opt_init(
     ggml_set_zero(opt->lbfgs.g);
     ggml_set_zero(opt->lbfgs.gp);
     ggml_set_zero(opt->lbfgs.d);
-    ggml_set_zero(opt->lbfgs.pf);
     if (opt->lbfgs.pf) {
         ggml_set_zero(opt->lbfgs.pf);
     }