llama_cpp 0.2.1 → 0.2.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/examples/README.md +32 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +553 -313
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +157 -19
- data/ext/llama_cpp/src/ggml-metal.metal +149 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +736 -98
- data/ext/llama_cpp/src/ggml.h +140 -9
- data/ext/llama_cpp/src/llama.cpp +58 -31
- data/ext/llama_cpp/src/llama.h +8 -9
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -35,6 +35,12 @@
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
 #if defined(_WIN32)
 
 #include <windows.h>
@@ -106,6 +112,7 @@ typedef void* thread_ret_t;
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
+#define GGML_GELU_QUICK_FP16
 #define GGML_SILU_FP16
 
 #define GGML_SOFT_MAX_UNROLL 4
@@ -334,6 +341,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 // precomputed gelu table for f16 (128 KB)
 static ggml_fp16_t table_gelu_f16[1 << 16];
 
+// precomputed quick gelu table for f16 (128 KB)
+static ggml_fp16_t table_gelu_quick_f16[1 << 16];
+
 // precomputed silu table for f16 (128 KB)
 static ggml_fp16_t table_silu_f16[1 << 16];
 
@@ -1671,14 +1681,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f32(x[i], x[offset+i]); \
     } \
     res = GGML_F32x4_REDUCE_ONE(x[0]); \
 }
@@ -1709,14 +1722,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F16x8_MUL vmulq_f16
 #define GGML_F16x8_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F16_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vaddq_f16(x[i], x[offset+i]); \
     } \
     const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
     const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1783,14 +1799,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x8_MUL _mm256_mul_ps
 #define GGML_F32x8_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                  _mm256_extractf128_ps(x[0], 1)); \
@@ -1880,14 +1899,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL vec_mul
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset+i]); \
     } \
     res = vec_extract(x[0], 0) + \
           vec_extract(x[0], 1) + \
@@ -1943,14 +1965,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL wasm_f32x4_mul
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
     res = wasm_f32x4_extract_lane(x[0], 0) + \
           wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2005,14 +2030,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16x4_MUL wasm_f32x4_mul
 #define GGML_F16x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F16_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
     } \
     res = wasm_f32x4_extract_lane(x[0], 0) + \
           wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2054,14 +2082,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F32x4_MUL _mm_mul_ps
 #define GGML_F32x4_REDUCE(res, x) \
 { \
-
-
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
-
-
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm_add_ps(x[i], x[offset+i]); \
     } \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
     res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
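Note: the REDUCE macros above (NEON, AVX, POWER9, WASM and SSE variants) all switch from a fixed unrolled sum to the same log2-step pairwise reduction over the GGML_F32_ARR / GGML_F16_ARR accumulators. A minimal scalar sketch of that pattern, with an assumed accumulator count of 8 (the real count depends on the SIMD width configured in ggml.c):

```c
// Pairwise tree reduction: halve the live range each pass, then read x[0].
// Scalar stand-in for the per-backend vaddq_f32 / _mm256_add_ps calls.
#define ARR 8
static float reduce_example(float x[ARR]) {
    int offset = ARR >> 1;
    while (offset > 0) {
        for (int i = 0; i < offset; ++i) {
            x[i] += x[offset + i];
        }
        offset >>= 1;
    }
    return x[0];
}
```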
@@ -3350,6 +3381,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 
 static const float GELU_COEF_A = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
 
 inline static float ggml_gelu_f32(float x) {
@@ -3380,6 +3412,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 }
 #endif
 
+inline static float ggml_gelu_quick_f32(float x) {
+    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
+}
+
+//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+//    const uint16_t * i16 = (const uint16_t *) x;
+//    for (int i = 0; i < n; ++i) {
+//        y[i] = table_gelu_quick_f16[i16[i]];
+//    }
+//}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
+    }
+}
+#else
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]);
+    }
+}
+#endif
+
 // Sigmoid Linear Unit (SiLU) function
 inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
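Note: the quick GELU added above approximates GELU with a sigmoid, x * sigmoid(1.702 * x); that is what GELU_QUICK_COEF = -1.702 encodes in ggml_gelu_quick_f32. A standalone comparison sketch (the helper names are illustrative, not part of ggml.c):

```c
#include <math.h>
#include <stdio.h>

// tanh-approximated GELU, as in ggml_gelu_f32
static float gelu(float x) {
    return 0.5f*x*(1.0f + tanhf(0.79788456080286535587989211986876f*x*(1.0f + 0.044715f*x*x)));
}

// quick GELU: x * sigmoid(1.702 x), matching ggml_gelu_quick_f32
static float gelu_quick(float x) {
    return x*(1.0f/(1.0f + expf(-1.702f*x)));
}

int main(void) {
    for (float x = -2.0f; x <= 2.0f; x += 1.0f) {
        printf("x=% .1f  gelu=% .4f  quick=% .4f\n", x, gelu(x), gelu_quick(x));
    }
    return 0;
}
```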
@@ -3610,6 +3670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "STEP",
     "RELU",
     "GELU",
+    "GELU_QUICK",
     "SILU",
     "SILU_BACK",
     "NORM",
@@ -3638,12 +3699,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ROPE_BACK",
     "ALIBI",
     "CLAMP",
-    "
-    "
+    "CONV_1D_S1_PH",
+    "CONV_1D_S2_PH",
+    "CONV_2D_SK_P0",
 
     "FLASH_ATTN",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
+    "WIN_PART",
+    "WIN_UNPART",
 
     "MAP_UNARY",
     "MAP_BINARY",
@@ -3652,7 +3716,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3678,6 +3742,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "step(x)",
     "relu(x)",
     "gelu(x)",
+    "gelu_quick(x)",
     "silu(x)",
     "silu_back(x)",
     "norm(x)",
@@ -3706,12 +3771,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rope_back(x)",
     "alibi(x)",
     "clamp(x)",
-    "
-    "
+    "conv_1d_s1_ph(x)",
+    "conv_1d_s2_ph(x)",
+    "conv_2d_sk_p0(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
+    "win_part(x)",
+    "win_unpart(x)",
 
     "f(x)",
     "f(x,y)",
@@ -3720,7 +3788,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4011,7 +4079,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     // initialize time system (required on Windows)
     ggml_time_init();
 
-    // initialize GELU, SILU and EXP F32 tables
+    // initialize GELU, Quick GELU, SILU and EXP F32 tables
     {
         const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
@@ -4021,13 +4089,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             memcpy(&ii, &ui, sizeof(ii));
             const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
             table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+            table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
             table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
             table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
         }
 
         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
-        GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
     // initialize g_state
@@ -4148,14 +4217,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
 
-void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
     return ctx->mem_buffer;
 }
 
-size_t ggml_get_mem_size(struct ggml_context * ctx) {
+size_t ggml_get_mem_size(const struct ggml_context * ctx) {
     return ctx->mem_size;
 }
 
+size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
+    size_t max_size = 0;
+
+    struct ggml_object * obj = ctx->objects_begin;
+
+    while (obj != NULL) {
+        struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+
+        const size_t size = ggml_nbytes(tensor);
+
+        if (max_size < size) {
+            max_size = size;
+        }
+
+        obj = obj->next;
+    }
+
+    return max_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
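Note: ggml_get_max_tensor_size() walks the context's object list and reports the largest single tensor allocation. A plausible use is sizing a staging or backend buffer that any one tensor must fit through; a hedged sketch (the helper below is illustrative only):

```c
// Assumes ctx already holds the model tensors.  max_size can bound a
// single staging buffer, since no one tensor in ctx is larger than it.
size_t plan_staging_buffer(const struct ggml_context * ctx) {
    const size_t max_size = ggml_get_max_tensor_size(ctx);
    const size_t mem_size = ggml_get_mem_size(ctx);
    return max_size < mem_size ? max_size : mem_size;
}
```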
@@ -4639,9 +4728,10 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
     return tensor->name;
 }
 
-
+struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
     strncpy(tensor->name, name, sizeof(tensor->name));
     tensor->name[sizeof(tensor->name) - 1] = '\0';
+    return tensor;
 }
 
 struct ggml_tensor * ggml_view_tensor(
@@ -5420,6 +5510,40 @@ struct ggml_tensor * ggml_gelu_inplace(
     return ggml_gelu_impl(ctx, a, true);
 }
 
+// ggml_gelu_quick
+
+struct ggml_tensor * ggml_gelu_quick_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_GELU_QUICK;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_gelu_quick(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_gelu_quick_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_gelu_quick_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_gelu_quick_impl(ctx, a, true);
+}
+
 // ggml_silu
 
 struct ggml_tensor * ggml_silu_impl(
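Note: a short usage sketch of the new op inside a graph, mirroring how ggml_gelu is normally used (the surrounding context and tensor setup are illustrative, not from this diff):

```c
// Apply quick-GELU to a tensor while building a ggml graph.
struct ggml_tensor * build_quick_gelu(struct ggml_context * ctx, struct ggml_tensor * cur) {
    // out-of-place variant; ggml_gelu_quick_inplace(ctx, cur) reuses cur's buffer
    cur = ggml_gelu_quick(ctx, cur);
    ggml_set_name(cur, "ffn_gelu_quick");
    return cur;
}
```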
@@ -6619,7 +6743,7 @@ struct ggml_tensor * ggml_clamp(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx,
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
 
     ((float *) b->data)[0] = min;
     ((float *) b->data)[1] = max;
@@ -6634,9 +6758,9 @@ struct ggml_tensor * ggml_clamp(
     return result;
 }
 
-//
+// ggml_conv_1d_s1_ph
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_conv_1d_s1_ph(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
@@ -6653,7 +6777,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
     const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
-    result->op =
+    result->op   = GGML_OP_CONV_1D_S1_PH;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = b;
@@ -6661,9 +6785,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
     return result;
 }
 
-//
+// ggml_conv_1d_s2_ph
 
-struct ggml_tensor *
+struct ggml_tensor * ggml_conv_1d_s2_ph(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
@@ -6680,7 +6804,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
     const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
-    result->op =
+    result->op   = GGML_OP_CONV_1D_S2_PH;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+
+    return result;
+}
+
+// ggml_conv_2d_sk_p0
+
+struct ggml_tensor * ggml_conv_2d_sk_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
+    GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op   = GGML_OP_CONV_2D_SK_P0;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = b;
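Note: per the asserts and the ne[] computation above, ggml_conv_2d_sk_p0 is a 2D convolution whose stride equals the kernel size with no padding, so the output is the input cut into non-overlapping kernel-sized patches. A small shape check, under assumed example dimensions:

```c
// Example: kernel a = [KW, KH, C, OC], input b = [W, H, C, 1],
// stride == kernel size, padding == 0.
static void conv_2d_sk_p0_shape_example(void) {
    const int KW = 16, KH = 16, OC = 768;
    const int W = 1024, H = 1024;

    const int out_w = W / KW;  // 64   == b->ne[0]/a->ne[0]
    const int out_h = H / KH;  // 64   == b->ne[1]/a->ne[1]
    const int out_c = OC;      //      == a->ne[3]
    (void)out_w; (void)out_h; (void)out_c;
}
```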
@@ -6814,6 +6966,89 @@ struct ggml_tensor * ggml_flash_attn_back(
     return result;
 }
 
+// ggml_win_part
+
+struct ggml_tensor * ggml_win_part(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int w) {
+    GGML_ASSERT(a->ne[3] == 1);
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    // padding
+    const int px = (w - a->ne[1]%w)%w;
+    const int py = (w - a->ne[2]%w)%w;
+
+    const int npx = (px + a->ne[1])/w;
+    const int npy = (py + a->ne[2])/w;
+    const int np  = npx*npy;
+
+    const int64_t ne[4] = { a->ne[0], w, w, np, };
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+
+    ((int32_t *) b->data)[0] = npx;
+    ((int32_t *) b->data)[1] = npy;
+    ((int32_t *) b->data)[2] = w;
+
+    ggml_scratch_load(ctx);
+
+    result->op   = GGML_OP_WIN_PART;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+    result->opt[0] = b;
+
+    return result;
+}
+
+// ggml_win_unpart
+
+struct ggml_tensor * ggml_win_unpart(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int w0,
+        int h0,
+        int w) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+
+    ((int32_t *) b->data)[0] = w;
+
+    ggml_scratch_load(ctx);
+
+    result->op   = GGML_OP_WIN_UNPART;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+    result->opt[0] = b;
+
+    return result;
+}
 
 // ggml_map_unary
 
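Note: ggml_win_part / ggml_win_unpart implement window partitioning (presumably for windowed attention in ViT-style encoders): win_part pads the spatial dims up to a multiple of w, cuts the tensor into w x w windows stacked along dim 3, and win_unpart reverses it given the original width and height. A shape sketch under assumed sizes:

```c
// Window partition of a [C, W, H, 1] tensor with window size w:
// pad W and H up to multiples of w, emit [C, w, w, npx*npy] windows.
static void win_part_shape_example(void) {
    const int C = 256, W = 70, H = 70, w = 14;

    const int px  = (w - W % w) % w;   // 0  padding along ne[1]
    const int py  = (w - H % w) % w;   // 0  padding along ne[2]
    const int npx = (px + W) / w;      // 5  windows across
    const int npy = (py + H) / w;      // 5  windows down
    const int np  = npx * npy;         // 25 windows total -> ne[3]

    // ggml_win_unpart(ctx, parts, W, H, w) restores the [C, W, H, 1] layout.
    (void)C; (void)np;
}
```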
@@ -7892,7 +8127,7 @@ static void ggml_compute_forward_add_q_f32(
 
         void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
         float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
-        void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*
+        void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
         assert(ne00 % 32 == 0);
 
@@ -9453,8 +9688,65 @@ static void ggml_compute_forward_gelu(
             GGML_ASSERT(false);
         } break;
     }
+}
+
+// ggml_compute_forward_gelu_quick
+
+static void ggml_compute_forward_gelu_quick_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
 
-    //
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_quick_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_quick(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_quick_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
 }
 
 // ggml_compute_forward_silu
@@ -10852,7 +11144,7 @@ static void ggml_compute_forward_set_f32(
     const int im2 = (ne12 == 0 ? 0 : ne12-1);
     const int im3 = (ne13 == 0 ? 0 : ne13-1);
 
-    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3
+    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
 
     GGML_ASSERT(nb10 == sizeof(float));
 
@@ -11573,8 +11865,9 @@ static void ggml_compute_forward_alibi_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-
-
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_nelements(src1) == 3);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -11637,8 +11930,9 @@ static void ggml_compute_forward_alibi_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-
-
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_nelements(src1) == 3);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -11740,15 +12034,16 @@ static void ggml_compute_forward_clamp_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-
-
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(src1) == 2);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const
-    const
+    const float min = ((float *) src1->data)[0];
+    const float max = ((float *) src1->data)[1];
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -12306,9 +12601,9 @@ static void ggml_compute_forward_rope_back(
     }
 }
 
-//
+// ggml_compute_forward_conv_1d_s1_ph
 
-static void
+static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12428,7 +12723,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
     }
 }
 
-static void
+static void ggml_compute_forward_conv_1d_s1_ph_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12548,7 +12843,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
     }
 }
 
-static void
+static void ggml_compute_forward_conv_1d_s1_ph(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12556,11 +12851,11 @@ static void ggml_compute_forward_conv_1d_1s(
     switch (src0->type) {
         case GGML_TYPE_F16:
            {
-
+                ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
            } break;
         case GGML_TYPE_F32:
            {
-
+                ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
            } break;
         default:
            {
@@ -12569,9 +12864,9 @@ static void ggml_compute_forward_conv_1d_1s(
            } break;
     }
 }
 
-//
+// ggml_compute_forward_conv_1d_s2_ph
 
-static void
+static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12691,7 +12986,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
     }
 }
 
-static void
+static void ggml_compute_forward_conv_1d_s2_ph_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12811,7 +13106,143 @@ static void ggml_compute_forward_conv_1d_2s_f32(
     }
 }
 
-static void
+static void ggml_compute_forward_conv_1d_s2_ph(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_conv_2d_sk_p0
+
+static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    //const int ne03 = src0->ne[3];
+
+    const int ne10 = src1->ne[0];
+    //const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    //const int ne13 = src1->ne[3];
+
+    const int ne0  = dst->ne[0];
+    const int ne1  = dst->ne[1];
+    const int ne2  = dst->ne[2];
+    //const int ne3 = dst->ne[3];
+    //const int ne  = ne0*ne1*ne2*ne3;
+
+    const int nb00 = src0->nb[0];
+    //const int nb01 = src0->nb[1];
+    //const int nb02 = src0->nb[2];
+    const int nb03 = src0->nb[3];
+
+    const int nb10 = src1->nb[0];
+    //const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    //const int nb13 = src1->nb[3];
+
+    //const int nb0 = dst->nb[0];
+    //const int nb1 = dst->nb[1];
+    const int nb2 = dst->nb[2];
+    //const int nb3 = dst->nb[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk0 = ne00;
+    const int nk1 = ne01;
+
+    // size of the convolution row - the kernel size unrolled across all channels
+    // round-up so it is more suitable for SIMD
+    const int ew0 = ggml_up32(nk0*nk1*ne02);
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        // TODO: fix this memset (wsize is overestimated)
+        memset(params->wdata, 0, params->wsize);
+
+        // prepare source data (src1)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i12*nb12);
+                ggml_fp16_t * dst_data = wdata;
+
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+                                    GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+    for (int i2 = ip0; i2 < ip1; i2++) {
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+
+        for (int i1 = 0; i1 < ne1; ++i1) {
+            for (int i0 = 0; i0 < ne0; ++i0) {
+                ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+                        (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+                        (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_sk_p0(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12819,11 +13250,12 @@ static void ggml_compute_forward_conv_1d_2s(
     switch (src0->type) {
         case GGML_TYPE_F16:
            {
-
+                ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
            } break;
         case GGML_TYPE_F32:
            {
-
+                //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
+                GGML_ASSERT(false);
            } break;
         default:
            {
@@ -13926,6 +14358,145 @@ static void ggml_compute_forward_flash_attn_back(
     }
 }
 
+// ggml_compute_forward_win_part
+
+static void ggml_compute_forward_win_part_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
+
+    const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
+    const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
+    const int32_t w    = ((const int32_t *)(opt0->data))[2];
+
+    assert(ne00 == ne0);
+    assert(ne3  == nep0*nep1);
+
+    // TODO: optimize / multi-thread
+    for (int py = 0; py < nep1; ++py) {
+        for (int px = 0; px < nep0; ++px) {
+            const int64_t i3 = py*nep0 + px;
+            for (int64_t i2 = 0; i2 < ne2; ++i2) {
+                for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                        const int64_t i02 = py*w + i2;
+                        const int64_t i01 = px*w + i1;
+                        const int64_t i00 = i0;
+
+                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
+                        const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
+
+                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
+                            ((float *) dst->data)[i] = 0.0f;
+                        } else {
+                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_part(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_win_unpart
+
+static void ggml_compute_forward_win_unpart_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    //const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+
+    const int32_t w = ((const int32_t *)(opt0->data))[0];
+
+    // padding
+    const int px = (w - ne1%w)%w;
+    //const int py = (w - ne2%w)%w;
+
+    const int npx = (px + ne1)/w;
+    //const int npy = (py + ne2)/w;
+
+    assert(ne0 == ne00);
+
+    // TODO: optimize / multi-thread
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int ip2 = i2/w;
+                const int ip1 = i1/w;
+
+                const int64_t i02 = i2%w;
+                const int64_t i01 = i1%w;
+                const int64_t i00 = i0;
+
+                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
+                const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
+
+                ((float *) dst->data)[j] = ((float *) src0->data)[i];
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_unpart(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_map_unary
 
 static void ggml_compute_forward_map_unary_f32(
@@ -14398,6 +14969,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_gelu(params, tensor->src0, tensor);
             } break;
+        case GGML_OP_GELU_QUICK:
+            {
+                ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
+            } break;
         case GGML_OP_SILU:
             {
                 ggml_compute_forward_silu(params, tensor->src0, tensor);
@@ -14502,19 +15077,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
             } break;
-        case
+        case GGML_OP_CONV_1D_S1_PH:
+            {
+                ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
+            } break;
+        case GGML_OP_CONV_1D_S2_PH:
             {
-
+                ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
             } break;
-        case
+        case GGML_OP_CONV_2D_SK_P0:
             {
-
+                ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
             } break;
         case GGML_OP_FLASH_ATTN:
             {
-                int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
+                const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
                 GGML_ASSERT(t == 0 || t == 1);
-                bool masked = t != 0;
+                const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
             } break;
         case GGML_OP_FLASH_FF:
@@ -14528,6 +15107,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 bool masked = t != 0;
                 ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
             } break;
+        case GGML_OP_WIN_PART:
+            {
+                ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
+            } break;
+        case GGML_OP_WIN_UNPART:
+            {
+                ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
+            } break;
         case GGML_OP_MAP_UNARY:
             {
                 const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
@@ -14799,6 +15386,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                 GGML_ASSERT(false); // TODO: not implemented
            } break;
+        case GGML_OP_GELU_QUICK:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ALIBI:
            {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15161,11 +15752,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 // noop
             }
            } break;
-        case
+        case GGML_OP_CONV_1D_S1_PH:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
+        case GGML_OP_CONV_1D_S2_PH:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case
+        case GGML_OP_CONV_2D_SK_P0:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
@@ -15334,6 +15929,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                 GGML_ASSERT(false); // not supported
            } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
            {
@@ -15742,6 +16339,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
            } break;
         case GGML_OP_MUL:
         case GGML_OP_GELU:
+        case GGML_OP_GELU_QUICK:
         case GGML_OP_SILU:
         case GGML_OP_SILU_BACK:
         case GGML_OP_NORM:
@@ -15848,8 +16446,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
            {
                 node->n_tasks = 1; //TODO
            } break;
-        case
-        case
+        case GGML_OP_CONV_1D_S1_PH:
+        case GGML_OP_CONV_1D_S2_PH:
            {
                 node->n_tasks = n_threads;
 
@@ -15876,6 +16474,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     GGML_ASSERT(false);
                 }
 
+                work_size = MAX(work_size, cur);
+            } break;
+        case GGML_OP_CONV_2D_SK_P0:
+            {
+                node->n_tasks = n_threads;
+
+                GGML_ASSERT(node->src1->ne[3] == 1);
+
+                const int64_t ne00 = node->src0->ne[0]; // W
+                const int64_t ne01 = node->src0->ne[1]; // H
+                const int64_t ne02 = node->src0->ne[2]; // C
+                const int64_t ne03 = node->src0->ne[3]; // N
+
+                const int64_t ne10 = node->src1->ne[0]; // W
+                const int64_t ne11 = node->src1->ne[1]; // H
+                const int64_t ne12 = node->src1->ne[2]; // C
+
+                const int64_t nk = ne00*ne01;
+
+                UNUSED(ne02);
+                UNUSED(ne03);
+                UNUSED(nk);
+
+                size_t cur = 0;
+
+                if (node->src0->type == GGML_TYPE_F16 &&
+                    node->src1->type == GGML_TYPE_F32) {
+                    cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+                } else if (node->src0->type == GGML_TYPE_F32 &&
+                           node->src1->type == GGML_TYPE_F32) {
+                    cur = sizeof(float)* (ne10*ne11*ne12);
+                } else {
+                    GGML_ASSERT(false);
+                }
+
                 work_size = MAX(work_size, cur);
             } break;
         case GGML_OP_FLASH_ATTN:
@@ -15937,6 +16570,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                 work_size = MAX(work_size, cur);
             } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
            {
@@ -16469,16 +17104,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
 
         if (!*ctx_data) {
             fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+            fclose(fin);
             return result;
         }
     }
 
     data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
 
-
-
-
-
+    {
+        const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+        if (ret != fsize) {
+            fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+            fclose(fin);
+            return result;
+        }
     }
 
     fclose(fin);
@@ -17598,7 +18237,6 @@ GGML_API void ggml_opt_init(
     ggml_set_zero(opt->lbfgs.g);
     ggml_set_zero(opt->lbfgs.gp);
     ggml_set_zero(opt->lbfgs.d);
-    ggml_set_zero(opt->lbfgs.pf);
     if (opt->lbfgs.pf) {
         ggml_set_zero(opt->lbfgs.pf);
     }