llama_cpp 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
// Defines CLOCK_MONOTONIC on Linux
|
2
|
-
#define _GNU_SOURCE
|
1
|
+
#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
|
2
|
+
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
|
3
3
|
|
4
4
|
#include "ggml.h"
|
5
5
|
|
@@ -24,6 +24,7 @@
|
|
24
24
|
#include <stdio.h>
|
25
25
|
#include <float.h>
|
26
26
|
#include <limits.h>
|
27
|
+
#include <stdarg.h>
|
27
28
|
|
28
29
|
#ifdef GGML_USE_METAL
|
29
30
|
#include <unistd.h>
|
@@ -35,6 +36,12 @@
|
|
35
36
|
#define static_assert(cond, msg) struct global_scope_noop_trick
|
36
37
|
#endif
|
37
38
|
|
39
|
+
#if defined(_MSC_VER)
|
40
|
+
// disable "possible loss of data" to avoid hundreds of casts
|
41
|
+
// we should just be careful :)
|
42
|
+
#pragma warning(disable: 4244 4267)
|
43
|
+
#endif
|
44
|
+
|
38
45
|
#if defined(_WIN32)
|
39
46
|
|
40
47
|
#include <windows.h>
|
@@ -84,6 +91,11 @@ static int sched_yield (void) {
|
|
84
91
|
#include <stdatomic.h>
|
85
92
|
|
86
93
|
typedef void* thread_ret_t;
|
94
|
+
|
95
|
+
#include <sys/types.h>
|
96
|
+
#include <sys/stat.h>
|
97
|
+
#include <unistd.h>
|
98
|
+
|
87
99
|
#endif
|
88
100
|
|
89
101
|
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
@@ -106,11 +118,36 @@ typedef void* thread_ret_t;
|
|
106
118
|
/*#define GGML_PERF*/
|
107
119
|
#define GGML_DEBUG 0
|
108
120
|
#define GGML_GELU_FP16
|
121
|
+
#define GGML_GELU_QUICK_FP16
|
109
122
|
#define GGML_SILU_FP16
|
110
123
|
|
111
124
|
#define GGML_SOFT_MAX_UNROLL 4
|
112
125
|
#define GGML_VEC_DOT_UNROLL 2
|
113
126
|
|
127
|
+
//
|
128
|
+
// logging
|
129
|
+
//
|
130
|
+
|
131
|
+
#if (GGML_DEBUG >= 1)
|
132
|
+
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
133
|
+
#else
|
134
|
+
#define GGML_PRINT_DEBUG(...)
|
135
|
+
#endif
|
136
|
+
|
137
|
+
#if (GGML_DEBUG >= 5)
|
138
|
+
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
|
139
|
+
#else
|
140
|
+
#define GGML_PRINT_DEBUG_5(...)
|
141
|
+
#endif
|
142
|
+
|
143
|
+
#if (GGML_DEBUG >= 10)
|
144
|
+
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
|
145
|
+
#else
|
146
|
+
#define GGML_PRINT_DEBUG_10(...)
|
147
|
+
#endif
|
148
|
+
|
149
|
+
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
150
|
+
|
114
151
|
#ifdef GGML_USE_ACCELERATE
|
115
152
|
// uncomment to use vDSP for soft max computation
|
116
153
|
// note: not sure if it is actually faster
|
@@ -123,6 +160,34 @@ typedef void* thread_ret_t;
|
|
123
160
|
#define GGML_MEM_ALIGN 16
|
124
161
|
#endif
|
125
162
|
|
163
|
+
//
|
164
|
+
// logging
|
165
|
+
//
|
166
|
+
|
167
|
+
#if (GGML_DEBUG >= 1)
|
168
|
+
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
169
|
+
#else
|
170
|
+
#define GGML_PRINT_DEBUG(...)
|
171
|
+
#endif
|
172
|
+
|
173
|
+
#if (GGML_DEBUG >= 5)
|
174
|
+
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
|
175
|
+
#else
|
176
|
+
#define GGML_PRINT_DEBUG_5(...)
|
177
|
+
#endif
|
178
|
+
|
179
|
+
#if (GGML_DEBUG >= 10)
|
180
|
+
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
|
181
|
+
#else
|
182
|
+
#define GGML_PRINT_DEBUG_10(...)
|
183
|
+
#endif
|
184
|
+
|
185
|
+
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
186
|
+
|
187
|
+
//
|
188
|
+
// end of logging block
|
189
|
+
//
|
190
|
+
|
126
191
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
127
192
|
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
128
193
|
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
@@ -136,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
|
|
136
201
|
#endif
|
137
202
|
if (result != 0) {
|
138
203
|
// Handle allocation failure
|
204
|
+
const char *error_desc = "unknown allocation error";
|
205
|
+
switch (result) {
|
206
|
+
case EINVAL:
|
207
|
+
error_desc = "invalid alignment value";
|
208
|
+
break;
|
209
|
+
case ENOMEM:
|
210
|
+
error_desc = "insufficient memory";
|
211
|
+
break;
|
212
|
+
}
|
213
|
+
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
|
214
|
+
__func__, error_desc, size/(1024.0*1024.0));
|
139
215
|
return NULL;
|
140
216
|
}
|
141
217
|
return aligned_memory;
|
@@ -334,6 +410,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
334
410
|
// precomputed gelu table for f16 (128 KB)
|
335
411
|
static ggml_fp16_t table_gelu_f16[1 << 16];
|
336
412
|
|
413
|
+
// precomputed quick gelu table for f16 (128 KB)
|
414
|
+
static ggml_fp16_t table_gelu_quick_f16[1 << 16];
|
415
|
+
|
337
416
|
// precomputed silu table for f16 (128 KB)
|
338
417
|
static ggml_fp16_t table_silu_f16[1 << 16];
|
339
418
|
|
@@ -409,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
|
|
409
488
|
}
|
410
489
|
}
|
411
490
|
|
412
|
-
|
413
491
|
//
|
414
492
|
// timing
|
415
493
|
//
|
@@ -472,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
|
|
472
550
|
#define ggml_perf_cycles_per_ms() 0
|
473
551
|
#endif
|
474
552
|
|
553
|
+
|
475
554
|
//
|
476
555
|
// cache line
|
477
556
|
//
|
@@ -1671,14 +1750,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
1671
1750
|
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
1672
1751
|
#define GGML_F32x4_REDUCE(res, x) \
|
1673
1752
|
{ \
|
1674
|
-
|
1675
|
-
|
1753
|
+
int offset = GGML_F32_ARR >> 1; \
|
1754
|
+
for (int i = 0; i < offset; ++i) { \
|
1755
|
+
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
1676
1756
|
} \
|
1677
|
-
|
1678
|
-
|
1757
|
+
offset >>= 1; \
|
1758
|
+
for (int i = 0; i < offset; ++i) { \
|
1759
|
+
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
1679
1760
|
} \
|
1680
|
-
|
1681
|
-
|
1761
|
+
offset >>= 1; \
|
1762
|
+
for (int i = 0; i < offset; ++i) { \
|
1763
|
+
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
1682
1764
|
} \
|
1683
1765
|
res = GGML_F32x4_REDUCE_ONE(x[0]); \
|
1684
1766
|
}
|
@@ -1709,14 +1791,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
1709
1791
|
#define GGML_F16x8_MUL vmulq_f16
|
1710
1792
|
#define GGML_F16x8_REDUCE(res, x) \
|
1711
1793
|
{ \
|
1712
|
-
|
1713
|
-
|
1794
|
+
int offset = GGML_F16_ARR >> 1; \
|
1795
|
+
for (int i = 0; i < offset; ++i) { \
|
1796
|
+
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
1714
1797
|
} \
|
1715
|
-
|
1716
|
-
|
1798
|
+
offset >>= 1; \
|
1799
|
+
for (int i = 0; i < offset; ++i) { \
|
1800
|
+
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
1717
1801
|
} \
|
1718
|
-
|
1719
|
-
|
1802
|
+
offset >>= 1; \
|
1803
|
+
for (int i = 0; i < offset; ++i) { \
|
1804
|
+
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
1720
1805
|
} \
|
1721
1806
|
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
|
1722
1807
|
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
|
@@ -1783,14 +1868,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
1783
1868
|
#define GGML_F32x8_MUL _mm256_mul_ps
|
1784
1869
|
#define GGML_F32x8_REDUCE(res, x) \
|
1785
1870
|
{ \
|
1786
|
-
|
1787
|
-
|
1871
|
+
int offset = GGML_F32_ARR >> 1; \
|
1872
|
+
for (int i = 0; i < offset; ++i) { \
|
1873
|
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
1788
1874
|
} \
|
1789
|
-
|
1790
|
-
|
1875
|
+
offset >>= 1; \
|
1876
|
+
for (int i = 0; i < offset; ++i) { \
|
1877
|
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
1791
1878
|
} \
|
1792
|
-
|
1793
|
-
|
1879
|
+
offset >>= 1; \
|
1880
|
+
for (int i = 0; i < offset; ++i) { \
|
1881
|
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
1794
1882
|
} \
|
1795
1883
|
const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
|
1796
1884
|
_mm256_extractf128_ps(x[0], 1)); \
|
@@ -1880,14 +1968,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
1880
1968
|
#define GGML_F32x4_MUL vec_mul
|
1881
1969
|
#define GGML_F32x4_REDUCE(res, x) \
|
1882
1970
|
{ \
|
1883
|
-
|
1884
|
-
|
1971
|
+
int offset = GGML_F32_ARR >> 1; \
|
1972
|
+
for (int i = 0; i < offset; ++i) { \
|
1973
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
1885
1974
|
} \
|
1886
|
-
|
1887
|
-
|
1975
|
+
offset >>= 1; \
|
1976
|
+
for (int i = 0; i < offset; ++i) { \
|
1977
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
1888
1978
|
} \
|
1889
|
-
|
1890
|
-
|
1979
|
+
offset >>= 1; \
|
1980
|
+
for (int i = 0; i < offset; ++i) { \
|
1981
|
+
x[i] = vec_add(x[i], x[offset+i]); \
|
1891
1982
|
} \
|
1892
1983
|
res = vec_extract(x[0], 0) + \
|
1893
1984
|
vec_extract(x[0], 1) + \
|
@@ -1943,14 +2034,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
1943
2034
|
#define GGML_F32x4_MUL wasm_f32x4_mul
|
1944
2035
|
#define GGML_F32x4_REDUCE(res, x) \
|
1945
2036
|
{ \
|
1946
|
-
|
1947
|
-
|
2037
|
+
int offset = GGML_F32_ARR >> 1; \
|
2038
|
+
for (int i = 0; i < offset; ++i) { \
|
2039
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
1948
2040
|
} \
|
1949
|
-
|
1950
|
-
|
2041
|
+
offset >>= 1; \
|
2042
|
+
for (int i = 0; i < offset; ++i) { \
|
2043
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
1951
2044
|
} \
|
1952
|
-
|
1953
|
-
|
2045
|
+
offset >>= 1; \
|
2046
|
+
for (int i = 0; i < offset; ++i) { \
|
2047
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
1954
2048
|
} \
|
1955
2049
|
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
1956
2050
|
wasm_f32x4_extract_lane(x[0], 1) + \
|
@@ -2005,14 +2099,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
|
2005
2099
|
#define GGML_F16x4_MUL wasm_f32x4_mul
|
2006
2100
|
#define GGML_F16x4_REDUCE(res, x) \
|
2007
2101
|
{ \
|
2008
|
-
|
2009
|
-
|
2102
|
+
int offset = GGML_F16_ARR >> 1; \
|
2103
|
+
for (int i = 0; i < offset; ++i) { \
|
2104
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
2010
2105
|
} \
|
2011
|
-
|
2012
|
-
|
2106
|
+
offset >>= 1; \
|
2107
|
+
for (int i = 0; i < offset; ++i) { \
|
2108
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
2013
2109
|
} \
|
2014
|
-
|
2015
|
-
|
2110
|
+
offset >>= 1; \
|
2111
|
+
for (int i = 0; i < offset; ++i) { \
|
2112
|
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
2016
2113
|
} \
|
2017
2114
|
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
2018
2115
|
wasm_f32x4_extract_lane(x[0], 1) + \
|
@@ -2054,14 +2151,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
|
2054
2151
|
#define GGML_F32x4_MUL _mm_mul_ps
|
2055
2152
|
#define GGML_F32x4_REDUCE(res, x) \
|
2056
2153
|
{ \
|
2057
|
-
|
2058
|
-
|
2154
|
+
int offset = GGML_F32_ARR >> 1; \
|
2155
|
+
for (int i = 0; i < offset; ++i) { \
|
2156
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
2059
2157
|
} \
|
2060
|
-
|
2061
|
-
|
2158
|
+
offset >>= 1; \
|
2159
|
+
for (int i = 0; i < offset; ++i) { \
|
2160
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
2062
2161
|
} \
|
2063
|
-
|
2064
|
-
|
2162
|
+
offset >>= 1; \
|
2163
|
+
for (int i = 0; i < offset; ++i) { \
|
2164
|
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
2065
2165
|
} \
|
2066
2166
|
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
|
2067
2167
|
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
|
@@ -3350,6 +3450,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
|
|
3350
3450
|
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
3351
3451
|
|
3352
3452
|
static const float GELU_COEF_A = 0.044715f;
|
3453
|
+
static const float GELU_QUICK_COEF = -1.702f;
|
3353
3454
|
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
3354
3455
|
|
3355
3456
|
inline static float ggml_gelu_f32(float x) {
|
@@ -3380,6 +3481,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
|
3380
3481
|
}
|
3381
3482
|
#endif
|
3382
3483
|
|
3484
|
+
inline static float ggml_gelu_quick_f32(float x) {
|
3485
|
+
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
|
3486
|
+
}
|
3487
|
+
|
3488
|
+
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
3489
|
+
// const uint16_t * i16 = (const uint16_t *) x;
|
3490
|
+
// for (int i = 0; i < n; ++i) {
|
3491
|
+
// y[i] = table_gelu_quick_f16[i16[i]];
|
3492
|
+
// }
|
3493
|
+
//}
|
3494
|
+
|
3495
|
+
#ifdef GGML_GELU_QUICK_FP16
|
3496
|
+
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
|
3497
|
+
uint16_t t;
|
3498
|
+
for (int i = 0; i < n; ++i) {
|
3499
|
+
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
3500
|
+
memcpy(&t, &fp16, sizeof(uint16_t));
|
3501
|
+
y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
|
3502
|
+
}
|
3503
|
+
}
|
3504
|
+
#else
|
3505
|
+
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
|
3506
|
+
for (int i = 0; i < n; ++i) {
|
3507
|
+
y[i] = ggml_gelu_quick_f32(x[i]);
|
3508
|
+
}
|
3509
|
+
}
|
3510
|
+
#endif
|
3511
|
+
|
3383
3512
|
// Sigmoid Linear Unit (SiLU) function
|
3384
3513
|
inline static float ggml_silu_f32(float x) {
|
3385
3514
|
return x/(1.0f + expf(-x));
|
@@ -3469,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
|
|
3469
3598
|
*s = 1.f/(*s);
|
3470
3599
|
}
|
3471
3600
|
|
3472
|
-
//
|
3473
|
-
// logging
|
3474
|
-
//
|
3475
|
-
|
3476
|
-
#if (GGML_DEBUG >= 1)
|
3477
|
-
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
3478
|
-
#else
|
3479
|
-
#define GGML_PRINT_DEBUG(...)
|
3480
|
-
#endif
|
3481
|
-
|
3482
|
-
#if (GGML_DEBUG >= 5)
|
3483
|
-
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
|
3484
|
-
#else
|
3485
|
-
#define GGML_PRINT_DEBUG_5(...)
|
3486
|
-
#endif
|
3487
|
-
|
3488
|
-
#if (GGML_DEBUG >= 10)
|
3489
|
-
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
|
3490
|
-
#else
|
3491
|
-
#define GGML_PRINT_DEBUG_10(...)
|
3492
|
-
#endif
|
3493
|
-
|
3494
|
-
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
3495
|
-
|
3496
3601
|
//
|
3497
3602
|
// data types
|
3498
3603
|
//
|
@@ -3610,6 +3715,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3610
3715
|
"STEP",
|
3611
3716
|
"RELU",
|
3612
3717
|
"GELU",
|
3718
|
+
"GELU_QUICK",
|
3613
3719
|
"SILU",
|
3614
3720
|
"SILU_BACK",
|
3615
3721
|
"NORM",
|
@@ -3638,21 +3744,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3638
3744
|
"ROPE_BACK",
|
3639
3745
|
"ALIBI",
|
3640
3746
|
"CLAMP",
|
3641
|
-
"CONV_1D_1S",
|
3642
|
-
"CONV_1D_2S",
|
3747
|
+
"CONV_1D_S1_PH",
|
3748
|
+
"CONV_1D_S2_PH",
|
3749
|
+
"CONV_2D_SK_P0",
|
3643
3750
|
|
3644
3751
|
"FLASH_ATTN",
|
3645
3752
|
"FLASH_FF",
|
3646
3753
|
"FLASH_ATTN_BACK",
|
3754
|
+
"WIN_PART",
|
3755
|
+
"WIN_UNPART",
|
3647
3756
|
|
3648
3757
|
"MAP_UNARY",
|
3649
3758
|
"MAP_BINARY",
|
3650
3759
|
|
3760
|
+
"MAP_CUSTOM1",
|
3761
|
+
"MAP_CUSTOM2",
|
3762
|
+
"MAP_CUSTOM3",
|
3763
|
+
|
3651
3764
|
"CROSS_ENTROPY_LOSS",
|
3652
3765
|
"CROSS_ENTROPY_LOSS_BACK",
|
3653
3766
|
};
|
3654
3767
|
|
3655
|
-
static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
|
3768
|
+
static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
|
3656
3769
|
|
3657
3770
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
3658
3771
|
"none",
|
@@ -3678,6 +3791,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3678
3791
|
"step(x)",
|
3679
3792
|
"relu(x)",
|
3680
3793
|
"gelu(x)",
|
3794
|
+
"gelu_quick(x)",
|
3681
3795
|
"silu(x)",
|
3682
3796
|
"silu_back(x)",
|
3683
3797
|
"norm(x)",
|
@@ -3706,21 +3820,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3706
3820
|
"rope_back(x)",
|
3707
3821
|
"alibi(x)",
|
3708
3822
|
"clamp(x)",
|
3709
|
-
"conv_1d_1s(x)",
|
3710
|
-
"conv_1d_2s(x)",
|
3823
|
+
"conv_1d_s1_ph(x)",
|
3824
|
+
"conv_1d_s2_ph(x)",
|
3825
|
+
"conv_2d_sk_p0(x)",
|
3711
3826
|
|
3712
3827
|
"flash_attn(x)",
|
3713
3828
|
"flash_ff(x)",
|
3714
3829
|
"flash_attn_back(x)",
|
3830
|
+
"win_part(x)",
|
3831
|
+
"win_unpart(x)",
|
3715
3832
|
|
3716
3833
|
"f(x)",
|
3717
3834
|
"f(x,y)",
|
3718
3835
|
|
3836
|
+
"custom(x)",
|
3837
|
+
"custom(x,y)",
|
3838
|
+
"custom(x,y,z)",
|
3839
|
+
|
3719
3840
|
"cross_entropy_loss(x,y)",
|
3720
3841
|
"cross_entropy_loss_back(x,y)",
|
3721
3842
|
};
|
3722
3843
|
|
3723
|
-
static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
|
3844
|
+
static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
|
3724
3845
|
|
3725
3846
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
3726
3847
|
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
@@ -3751,12 +3872,31 @@ struct ggml_context_container {
|
|
3751
3872
|
struct ggml_context context;
|
3752
3873
|
};
|
3753
3874
|
|
3875
|
+
//
|
3876
|
+
// NUMA support
|
3877
|
+
//
|
3878
|
+
|
3879
|
+
#define GGML_NUMA_MAX_NODES 8
|
3880
|
+
#define GGML_NUMA_MAX_CPUS 512
|
3881
|
+
|
3882
|
+
struct ggml_numa_node {
|
3883
|
+
uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
|
3884
|
+
uint32_t n_cpus;
|
3885
|
+
};
|
3886
|
+
|
3887
|
+
struct ggml_numa_nodes {
|
3888
|
+
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
|
3889
|
+
uint32_t n_nodes;
|
3890
|
+
uint32_t total_cpus; // hardware threads on system
|
3891
|
+
};
|
3892
|
+
|
3754
3893
|
//
|
3755
3894
|
// ggml state
|
3756
3895
|
//
|
3757
3896
|
|
3758
3897
|
struct ggml_state {
|
3759
3898
|
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
|
3899
|
+
struct ggml_numa_nodes numa;
|
3760
3900
|
};
|
3761
3901
|
|
3762
3902
|
// global state
|
@@ -3781,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
|
|
3781
3921
|
atomic_fetch_sub(&g_state_barrier, 1);
|
3782
3922
|
}
|
3783
3923
|
|
3924
|
+
void ggml_numa_init(void) {
|
3925
|
+
if (g_state.numa.n_nodes > 0) {
|
3926
|
+
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
|
3927
|
+
|
3928
|
+
return;
|
3929
|
+
}
|
3930
|
+
|
3931
|
+
#ifdef __linux__
|
3932
|
+
struct stat st;
|
3933
|
+
char path[256];
|
3934
|
+
int rv;
|
3935
|
+
|
3936
|
+
// enumerate nodes
|
3937
|
+
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
|
3938
|
+
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
|
3939
|
+
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
3940
|
+
if (stat(path, &st) != 0) { break; }
|
3941
|
+
++g_state.numa.n_nodes;
|
3942
|
+
}
|
3943
|
+
|
3944
|
+
// enumerate CPUs
|
3945
|
+
while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
|
3946
|
+
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
|
3947
|
+
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
3948
|
+
if (stat(path, &st) != 0) { break; }
|
3949
|
+
++g_state.numa.total_cpus;
|
3950
|
+
}
|
3951
|
+
|
3952
|
+
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
|
3953
|
+
|
3954
|
+
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
|
3955
|
+
g_state.numa.n_nodes = 0;
|
3956
|
+
return;
|
3957
|
+
}
|
3958
|
+
|
3959
|
+
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
|
3960
|
+
struct ggml_numa_node * node = &g_state.numa.nodes[n];
|
3961
|
+
GGML_PRINT_DEBUG("CPUs on node %u:", n);
|
3962
|
+
node->n_cpus = 0;
|
3963
|
+
for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
|
3964
|
+
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
|
3965
|
+
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
|
3966
|
+
if (stat(path, &st) == 0) {
|
3967
|
+
node->cpus[node->n_cpus++] = c;
|
3968
|
+
GGML_PRINT_DEBUG(" %u", c);
|
3969
|
+
}
|
3970
|
+
}
|
3971
|
+
GGML_PRINT_DEBUG("\n");
|
3972
|
+
}
|
3973
|
+
|
3974
|
+
if (ggml_is_numa()) {
|
3975
|
+
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
|
3976
|
+
if (fptr != NULL) {
|
3977
|
+
char buf[42];
|
3978
|
+
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
|
3979
|
+
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
|
3980
|
+
}
|
3981
|
+
fclose(fptr);
|
3982
|
+
}
|
3983
|
+
}
|
3984
|
+
#else
|
3985
|
+
// TODO
|
3986
|
+
#endif
|
3987
|
+
}
|
3988
|
+
|
3989
|
+
bool ggml_is_numa(void) {
|
3990
|
+
return g_state.numa.n_nodes > 1;
|
3991
|
+
}
|
3992
|
+
|
3784
3993
|
////////////////////////////////////////////////////////////////////////////////
|
3785
3994
|
|
3786
3995
|
void ggml_print_object(const struct ggml_object * obj) {
|
@@ -4011,7 +4220,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
4011
4220
|
// initialize time system (required on Windows)
|
4012
4221
|
ggml_time_init();
|
4013
4222
|
|
4014
|
-
// initialize GELU, SILU and EXP F32 tables
|
4223
|
+
// initialize GELU, Quick GELU, SILU and EXP F32 tables
|
4015
4224
|
{
|
4016
4225
|
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
4017
4226
|
|
@@ -4021,13 +4230,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
4021
4230
|
memcpy(&ii, &ui, sizeof(ii));
|
4022
4231
|
const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
|
4023
4232
|
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
4233
|
+
table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
|
4024
4234
|
table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
|
4025
4235
|
table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
|
4026
4236
|
}
|
4027
4237
|
|
4028
4238
|
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
4029
4239
|
|
4030
|
-
GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
4240
|
+
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
4031
4241
|
}
|
4032
4242
|
|
4033
4243
|
// initialize g_state
|
@@ -4036,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
4036
4246
|
|
4037
4247
|
g_state = (struct ggml_state) {
|
4038
4248
|
/*.contexts =*/ { { 0 } },
|
4249
|
+
/*.numa =*/ {
|
4250
|
+
.n_nodes = 0,
|
4251
|
+
.total_cpus = 0,
|
4252
|
+
},
|
4039
4253
|
};
|
4040
4254
|
|
4041
4255
|
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
|
@@ -4148,14 +4362,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
|
|
4148
4362
|
ctx->no_alloc = no_alloc;
|
4149
4363
|
}
|
4150
4364
|
|
4151
|
-
void * ggml_get_mem_buffer(struct ggml_context * ctx) {
|
4365
|
+
void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
|
4152
4366
|
return ctx->mem_buffer;
|
4153
4367
|
}
|
4154
4368
|
|
4155
|
-
size_t ggml_get_mem_size(struct ggml_context * ctx) {
|
4369
|
+
size_t ggml_get_mem_size(const struct ggml_context * ctx) {
|
4156
4370
|
return ctx->mem_size;
|
4157
4371
|
}
|
4158
4372
|
|
4373
|
+
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
4374
|
+
size_t max_size = 0;
|
4375
|
+
|
4376
|
+
struct ggml_object * obj = ctx->objects_begin;
|
4377
|
+
|
4378
|
+
while (obj != NULL) {
|
4379
|
+
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
|
4380
|
+
|
4381
|
+
const size_t size = ggml_nbytes(tensor);
|
4382
|
+
|
4383
|
+
if (max_size < size) {
|
4384
|
+
max_size = size;
|
4385
|
+
}
|
4386
|
+
|
4387
|
+
obj = obj->next;
|
4388
|
+
}
|
4389
|
+
|
4390
|
+
return max_size;
|
4391
|
+
}
|
4392
|
+
|
4159
4393
|
// IMPORTANT:
|
4160
4394
|
// when creating "opt" tensors, always save and load the scratch buffer
|
4161
4395
|
// this is an error prone process, but it is necessary to support inplace
|
@@ -4639,15 +4873,25 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
|
|
4639
4873
|
return tensor->name;
|
4640
4874
|
}
|
4641
4875
|
|
4642
|
-
|
4876
|
+
struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
|
4643
4877
|
strncpy(tensor->name, name, sizeof(tensor->name));
|
4644
4878
|
tensor->name[sizeof(tensor->name) - 1] = '\0';
|
4879
|
+
return tensor;
|
4880
|
+
}
|
4881
|
+
|
4882
|
+
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
|
4883
|
+
va_list args;
|
4884
|
+
va_start(args, fmt);
|
4885
|
+
vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
|
4886
|
+
va_end(args);
|
4887
|
+
return tensor;
|
4645
4888
|
}
|
4646
4889
|
|
4647
4890
|
struct ggml_tensor * ggml_view_tensor(
|
4648
4891
|
struct ggml_context * ctx,
|
4649
4892
|
const struct ggml_tensor * src) {
|
4650
4893
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
|
4894
|
+
ggml_format_name(result, "%s (view)", src->name);
|
4651
4895
|
|
4652
4896
|
result->nb[0] = src->nb[0];
|
4653
4897
|
result->nb[1] = src->nb[1];
|
@@ -5420,6 +5664,40 @@ struct ggml_tensor * ggml_gelu_inplace(
|
|
5420
5664
|
return ggml_gelu_impl(ctx, a, true);
|
5421
5665
|
}
|
5422
5666
|
|
5667
|
+
// ggml_gelu_quick
|
5668
|
+
|
5669
|
+
struct ggml_tensor * ggml_gelu_quick_impl(
|
5670
|
+
struct ggml_context * ctx,
|
5671
|
+
struct ggml_tensor * a,
|
5672
|
+
bool inplace) {
|
5673
|
+
bool is_node = false;
|
5674
|
+
|
5675
|
+
if (!inplace && (a->grad)) {
|
5676
|
+
is_node = true;
|
5677
|
+
}
|
5678
|
+
|
5679
|
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
5680
|
+
|
5681
|
+
result->op = GGML_OP_GELU_QUICK;
|
5682
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5683
|
+
result->src0 = a;
|
5684
|
+
result->src1 = NULL;
|
5685
|
+
|
5686
|
+
return result;
|
5687
|
+
}
|
5688
|
+
|
5689
|
+
struct ggml_tensor * ggml_gelu_quick(
|
5690
|
+
struct ggml_context * ctx,
|
5691
|
+
struct ggml_tensor * a) {
|
5692
|
+
return ggml_gelu_quick_impl(ctx, a, false);
|
5693
|
+
}
|
5694
|
+
|
5695
|
+
struct ggml_tensor * ggml_gelu_quick_inplace(
|
5696
|
+
struct ggml_context * ctx,
|
5697
|
+
struct ggml_tensor * a) {
|
5698
|
+
return ggml_gelu_quick_impl(ctx, a, true);
|
5699
|
+
}
|
5700
|
+
|
5423
5701
|
// ggml_silu
|
5424
5702
|
|
5425
5703
|
struct ggml_tensor * ggml_silu_impl(
|
@@ -5775,6 +6053,11 @@ struct ggml_tensor * ggml_cpy_impl(
|
|
5775
6053
|
|
5776
6054
|
// make a view of the destination
|
5777
6055
|
struct ggml_tensor * result = ggml_view_tensor(ctx, b);
|
6056
|
+
if (strlen(b->name) > 0) {
|
6057
|
+
ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
|
6058
|
+
} else {
|
6059
|
+
ggml_format_name(result, "%s (copy)", a->name);
|
6060
|
+
}
|
5778
6061
|
|
5779
6062
|
result->op = GGML_OP_CPY;
|
5780
6063
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5811,6 +6094,7 @@ struct ggml_tensor * ggml_cont_impl(
|
|
5811
6094
|
}
|
5812
6095
|
|
5813
6096
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
6097
|
+
ggml_format_name(result, "%s (cont)", a->name);
|
5814
6098
|
|
5815
6099
|
result->op = GGML_OP_CONT;
|
5816
6100
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5854,6 +6138,7 @@ struct ggml_tensor * ggml_reshape(
|
|
5854
6138
|
}
|
5855
6139
|
|
5856
6140
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
|
6141
|
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
5857
6142
|
|
5858
6143
|
result->op = GGML_OP_RESHAPE;
|
5859
6144
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5878,6 +6163,7 @@ struct ggml_tensor * ggml_reshape_1d(
|
|
5878
6163
|
|
5879
6164
|
const int64_t ne[1] = { ne0 };
|
5880
6165
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
|
6166
|
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
5881
6167
|
|
5882
6168
|
result->op = GGML_OP_RESHAPE;
|
5883
6169
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5903,6 +6189,7 @@ struct ggml_tensor * ggml_reshape_2d(
|
|
5903
6189
|
|
5904
6190
|
const int64_t ne[2] = { ne0, ne1 };
|
5905
6191
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
|
6192
|
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
5906
6193
|
|
5907
6194
|
result->op = GGML_OP_RESHAPE;
|
5908
6195
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5929,6 +6216,7 @@ struct ggml_tensor * ggml_reshape_3d(
|
|
5929
6216
|
|
5930
6217
|
const int64_t ne[3] = { ne0, ne1, ne2 };
|
5931
6218
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
|
6219
|
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
5932
6220
|
|
5933
6221
|
result->op = GGML_OP_RESHAPE;
|
5934
6222
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5957,6 +6245,7 @@ struct ggml_tensor * ggml_reshape_4d(
|
|
5957
6245
|
|
5958
6246
|
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
|
5959
6247
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
|
6248
|
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
5960
6249
|
|
5961
6250
|
result->op = GGML_OP_RESHAPE;
|
5962
6251
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5981,10 +6270,12 @@ struct ggml_tensor * ggml_view_1d(
|
|
5981
6270
|
}
|
5982
6271
|
|
5983
6272
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
|
6273
|
+
ggml_format_name(result, "%s (view)", a->name);
|
5984
6274
|
|
5985
6275
|
ggml_scratch_save(ctx);
|
5986
6276
|
|
5987
6277
|
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6278
|
+
ggml_set_name(offs, "offset");
|
5988
6279
|
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
5989
6280
|
|
5990
6281
|
ggml_scratch_load(ctx);
|
@@ -6017,10 +6308,12 @@ struct ggml_tensor * ggml_view_2d(
|
|
6017
6308
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
6018
6309
|
|
6019
6310
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
|
6311
|
+
ggml_format_name(result, "%s (view)", a->name);
|
6020
6312
|
|
6021
6313
|
ggml_scratch_save(ctx);
|
6022
6314
|
|
6023
6315
|
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6316
|
+
ggml_set_name(offs, "offset");
|
6024
6317
|
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6025
6318
|
|
6026
6319
|
ggml_scratch_load(ctx);
|
@@ -6059,10 +6352,12 @@ struct ggml_tensor * ggml_view_3d(
|
|
6059
6352
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
6060
6353
|
|
6061
6354
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
6355
|
+
ggml_format_name(result, "%s (view)", a->name);
|
6062
6356
|
|
6063
6357
|
ggml_scratch_save(ctx);
|
6064
6358
|
|
6065
6359
|
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6360
|
+
ggml_set_name(offs, "offset");
|
6066
6361
|
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6067
6362
|
|
6068
6363
|
ggml_scratch_load(ctx);
|
@@ -6103,10 +6398,12 @@ struct ggml_tensor * ggml_view_4d(
|
|
6103
6398
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
6104
6399
|
|
6105
6400
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
|
6401
|
+
ggml_format_name(result, "%s (view)", a->name);
|
6106
6402
|
|
6107
6403
|
ggml_scratch_save(ctx);
|
6108
6404
|
|
6109
6405
|
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6406
|
+
ggml_set_name(offs, "offset");
|
6110
6407
|
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6111
6408
|
|
6112
6409
|
ggml_scratch_load(ctx);
|
@@ -6152,6 +6449,7 @@ struct ggml_tensor * ggml_permute(
|
|
6152
6449
|
}
|
6153
6450
|
|
6154
6451
|
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
6452
|
+
ggml_format_name(result, "%s (permuted)", a->name);
|
6155
6453
|
|
6156
6454
|
int ne[GGML_MAX_DIMS];
|
6157
6455
|
int nb[GGML_MAX_DIMS];
|
@@ -6211,6 +6509,7 @@ struct ggml_tensor * ggml_transpose(
|
|
6211
6509
|
}
|
6212
6510
|
|
6213
6511
|
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
6512
|
+
ggml_format_name(result, "%s (transposed)", a->name);
|
6214
6513
|
|
6215
6514
|
result->ne[0] = a->ne[1];
|
6216
6515
|
result->ne[1] = a->ne[0];
|
@@ -6479,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
|
|
6479
6778
|
int n_past,
|
6480
6779
|
int n_dims,
|
6481
6780
|
int mode,
|
6781
|
+
int n_ctx,
|
6482
6782
|
bool inplace) {
|
6483
6783
|
GGML_ASSERT(n_past >= 0);
|
6484
6784
|
bool is_node = false;
|
@@ -6491,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
|
|
6491
6791
|
|
6492
6792
|
ggml_scratch_save(ctx);
|
6493
6793
|
|
6494
|
-
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
|
6794
|
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
|
6495
6795
|
|
6496
6796
|
((int32_t *) b->data)[0] = n_past;
|
6497
6797
|
((int32_t *) b->data)[1] = n_dims;
|
6498
6798
|
((int32_t *) b->data)[2] = mode;
|
6799
|
+
((int32_t *) b->data)[3] = n_ctx;
|
6499
6800
|
|
6500
6801
|
ggml_scratch_load(ctx);
|
6501
6802
|
|
@@ -6512,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
|
|
6512
6813
|
struct ggml_tensor * a,
|
6513
6814
|
int n_past,
|
6514
6815
|
int n_dims,
|
6515
|
-
int mode
|
6516
|
-
|
6816
|
+
int mode,
|
6817
|
+
int n_ctx) {
|
6818
|
+
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
|
6517
6819
|
}
|
6518
6820
|
|
6519
6821
|
struct ggml_tensor * ggml_rope_inplace(
|
@@ -6521,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
6521
6823
|
struct ggml_tensor * a,
|
6522
6824
|
int n_past,
|
6523
6825
|
int n_dims,
|
6524
|
-
int mode
|
6525
|
-
|
6826
|
+
int mode,
|
6827
|
+
int n_ctx) {
|
6828
|
+
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
|
6526
6829
|
}
|
6527
6830
|
|
6528
6831
|
// ggml_rope_back
|
@@ -6619,7 +6922,7 @@ struct ggml_tensor * ggml_clamp(
|
|
6619
6922
|
|
6620
6923
|
ggml_scratch_save(ctx);
|
6621
6924
|
|
6622
|
-
struct ggml_tensor * b = ggml_new_tensor_1d(ctx,
|
6925
|
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
|
6623
6926
|
|
6624
6927
|
((float *) b->data)[0] = min;
|
6625
6928
|
((float *) b->data)[1] = max;
|
@@ -6634,9 +6937,9 @@ struct ggml_tensor * ggml_clamp(
|
|
6634
6937
|
return result;
|
6635
6938
|
}
|
6636
6939
|
|
6637
|
-
//
|
6940
|
+
// ggml_conv_1d_s1_ph
|
6638
6941
|
|
6639
|
-
struct ggml_tensor *
|
6942
|
+
struct ggml_tensor * ggml_conv_1d_s1_ph(
|
6640
6943
|
struct ggml_context * ctx,
|
6641
6944
|
struct ggml_tensor * a,
|
6642
6945
|
struct ggml_tensor * b) {
|
@@ -6653,7 +6956,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
|
|
6653
6956
|
const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
|
6654
6957
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6655
6958
|
|
6656
|
-
result->op =
|
6959
|
+
result->op = GGML_OP_CONV_1D_S1_PH;
|
6657
6960
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6658
6961
|
result->src0 = a;
|
6659
6962
|
result->src1 = b;
|
@@ -6661,9 +6964,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
|
|
6661
6964
|
return result;
|
6662
6965
|
}
|
6663
6966
|
|
6664
|
-
//
|
6967
|
+
// ggml_conv_1d_s2_ph
|
6665
6968
|
|
6666
|
-
struct ggml_tensor *
|
6969
|
+
struct ggml_tensor * ggml_conv_1d_s2_ph(
|
6667
6970
|
struct ggml_context * ctx,
|
6668
6971
|
struct ggml_tensor * a,
|
6669
6972
|
struct ggml_tensor * b) {
|
@@ -6680,7 +6983,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
|
|
6680
6983
|
const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
|
6681
6984
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6682
6985
|
|
6683
|
-
result->op =
|
6986
|
+
result->op = GGML_OP_CONV_1D_S2_PH;
|
6987
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6988
|
+
result->src0 = a;
|
6989
|
+
result->src1 = b;
|
6990
|
+
|
6991
|
+
return result;
|
6992
|
+
}
|
6993
|
+
|
6994
|
+
// ggml_conv_2d_sk_p0
|
6995
|
+
|
6996
|
+
struct ggml_tensor * ggml_conv_2d_sk_p0(
|
6997
|
+
struct ggml_context * ctx,
|
6998
|
+
struct ggml_tensor * a,
|
6999
|
+
struct ggml_tensor * b) {
|
7000
|
+
GGML_ASSERT(b->ne[3] == 1);
|
7001
|
+
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
7002
|
+
GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
|
7003
|
+
GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
|
7004
|
+
bool is_node = false;
|
7005
|
+
|
7006
|
+
if (a->grad || b->grad) {
|
7007
|
+
GGML_ASSERT(false); // TODO: implement backward
|
7008
|
+
is_node = true;
|
7009
|
+
}
|
7010
|
+
|
7011
|
+
const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
|
7012
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
7013
|
+
|
7014
|
+
result->op = GGML_OP_CONV_2D_SK_P0;
|
6684
7015
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6685
7016
|
result->src0 = a;
|
6686
7017
|
result->src1 = b;
|
@@ -6814,45 +7145,133 @@ struct ggml_tensor * ggml_flash_attn_back(
|
|
6814
7145
|
return result;
|
6815
7146
|
}
|
6816
7147
|
|
7148
|
+
// ggml_win_part
|
6817
7149
|
|
6818
|
-
|
7150
|
+
struct ggml_tensor * ggml_win_part(
|
7151
|
+
struct ggml_context * ctx,
|
7152
|
+
struct ggml_tensor * a,
|
7153
|
+
int w) {
|
7154
|
+
GGML_ASSERT(a->ne[3] == 1);
|
7155
|
+
GGML_ASSERT(a->type == GGML_TYPE_F32);
|
6819
7156
|
|
6820
|
-
struct ggml_tensor * ggml_map_unary_impl_f32(
|
6821
|
-
struct ggml_context * ctx,
|
6822
|
-
struct ggml_tensor * a,
|
6823
|
-
const ggml_unary_op_f32_t fun,
|
6824
|
-
bool inplace) {
|
6825
7157
|
bool is_node = false;
|
6826
7158
|
|
6827
|
-
if (
|
7159
|
+
if (a->grad) {
|
7160
|
+
GGML_ASSERT(false); // TODO: implement backward
|
6828
7161
|
is_node = true;
|
6829
7162
|
}
|
6830
7163
|
|
6831
|
-
|
6832
|
-
|
6833
|
-
|
7164
|
+
// padding
|
7165
|
+
const int px = (w - a->ne[1]%w)%w;
|
7166
|
+
const int py = (w - a->ne[2]%w)%w;
|
6834
7167
|
|
6835
|
-
|
7168
|
+
const int npx = (px + a->ne[1])/w;
|
7169
|
+
const int npy = (py + a->ne[2])/w;
|
7170
|
+
const int np = npx*npy;
|
7171
|
+
|
7172
|
+
const int64_t ne[4] = { a->ne[0], w, w, np, };
|
7173
|
+
|
7174
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
7175
|
+
|
7176
|
+
ggml_scratch_save(ctx);
|
7177
|
+
|
7178
|
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
|
7179
|
+
|
7180
|
+
((int32_t *) b->data)[0] = npx;
|
7181
|
+
((int32_t *) b->data)[1] = npy;
|
7182
|
+
((int32_t *) b->data)[2] = w;
|
7183
|
+
|
7184
|
+
ggml_scratch_load(ctx);
|
7185
|
+
|
7186
|
+
result->op = GGML_OP_WIN_PART;
|
6836
7187
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6837
7188
|
result->src0 = a;
|
6838
|
-
result->
|
7189
|
+
result->src1 = NULL;
|
7190
|
+
result->opt[0] = b;
|
6839
7191
|
|
6840
7192
|
return result;
|
6841
7193
|
}
|
6842
7194
|
|
6843
|
-
|
6844
|
-
struct ggml_context * ctx,
|
6845
|
-
struct ggml_tensor * a,
|
6846
|
-
const ggml_unary_op_f32_t fun) {
|
6847
|
-
return ggml_map_unary_impl_f32(ctx, a, fun, false);
|
6848
|
-
}
|
7195
|
+
// ggml_win_unpart
|
6849
7196
|
|
6850
|
-
struct ggml_tensor *
|
6851
|
-
struct ggml_context
|
6852
|
-
struct ggml_tensor
|
6853
|
-
|
6854
|
-
|
6855
|
-
|
7197
|
+
struct ggml_tensor * ggml_win_unpart(
|
7198
|
+
struct ggml_context * ctx,
|
7199
|
+
struct ggml_tensor * a,
|
7200
|
+
int w0,
|
7201
|
+
int h0,
|
7202
|
+
int w) {
|
7203
|
+
GGML_ASSERT(a->type == GGML_TYPE_F32);
|
7204
|
+
|
7205
|
+
bool is_node = false;
|
7206
|
+
|
7207
|
+
if (a->grad) {
|
7208
|
+
GGML_ASSERT(false); // TODO: implement backward
|
7209
|
+
is_node = true;
|
7210
|
+
}
|
7211
|
+
|
7212
|
+
const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
|
7213
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
7214
|
+
|
7215
|
+
ggml_scratch_save(ctx);
|
7216
|
+
|
7217
|
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
|
7218
|
+
|
7219
|
+
((int32_t *) b->data)[0] = w;
|
7220
|
+
|
7221
|
+
ggml_scratch_load(ctx);
|
7222
|
+
|
7223
|
+
result->op = GGML_OP_WIN_UNPART;
|
7224
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7225
|
+
result->src0 = a;
|
7226
|
+
result->src1 = NULL;
|
7227
|
+
result->opt[0] = b;
|
7228
|
+
|
7229
|
+
return result;
|
7230
|
+
}
|
7231
|
+
|
7232
|
+
// ggml_map_unary
|
7233
|
+
|
7234
|
+
struct ggml_tensor * ggml_map_unary_impl_f32(
|
7235
|
+
struct ggml_context * ctx,
|
7236
|
+
struct ggml_tensor * a,
|
7237
|
+
const ggml_unary_op_f32_t fun,
|
7238
|
+
bool inplace) {
|
7239
|
+
bool is_node = false;
|
7240
|
+
|
7241
|
+
if (!inplace && a->grad) {
|
7242
|
+
is_node = true;
|
7243
|
+
}
|
7244
|
+
|
7245
|
+
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
7246
|
+
|
7247
|
+
ggml_scratch_save(ctx);
|
7248
|
+
|
7249
|
+
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
7250
|
+
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
7251
|
+
|
7252
|
+
ggml_scratch_load(ctx);
|
7253
|
+
|
7254
|
+
result->op = GGML_OP_MAP_UNARY;
|
7255
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7256
|
+
result->src0 = a;
|
7257
|
+
result->opt[0] = addr_tensor;
|
7258
|
+
|
7259
|
+
return result;
|
7260
|
+
}
|
7261
|
+
|
7262
|
+
struct ggml_tensor * ggml_map_unary_f32(
|
7263
|
+
struct ggml_context * ctx,
|
7264
|
+
struct ggml_tensor * a,
|
7265
|
+
const ggml_unary_op_f32_t fun) {
|
7266
|
+
return ggml_map_unary_impl_f32(ctx, a, fun, false);
|
7267
|
+
}
|
7268
|
+
|
7269
|
+
struct ggml_tensor * ggml_map_unary_inplace_f32(
|
7270
|
+
struct ggml_context * ctx,
|
7271
|
+
struct ggml_tensor * a,
|
7272
|
+
const ggml_unary_op_f32_t fun) {
|
7273
|
+
return ggml_map_unary_impl_f32(ctx, a, fun, true);
|
7274
|
+
}
|
6856
7275
|
|
6857
7276
|
// ggml_map_binary
|
6858
7277
|
|
@@ -6870,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
|
|
6870
7289
|
is_node = true;
|
6871
7290
|
}
|
6872
7291
|
|
7292
|
+
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
7293
|
+
|
7294
|
+
ggml_scratch_save(ctx);
|
7295
|
+
|
6873
7296
|
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
6874
7297
|
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
6875
|
-
|
7298
|
+
|
7299
|
+
ggml_scratch_load(ctx);
|
6876
7300
|
|
6877
7301
|
result->op = GGML_OP_MAP_BINARY;
|
6878
7302
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -6899,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
|
|
6899
7323
|
return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
|
6900
7324
|
}
|
6901
7325
|
|
7326
|
+
// ggml_map_custom1
|
7327
|
+
|
7328
|
+
struct ggml_tensor * ggml_map_custom1_impl_f32(
|
7329
|
+
struct ggml_context * ctx,
|
7330
|
+
struct ggml_tensor * a,
|
7331
|
+
const ggml_custom1_op_f32_t fun,
|
7332
|
+
bool inplace) {
|
7333
|
+
bool is_node = false;
|
7334
|
+
|
7335
|
+
if (!inplace && a->grad) {
|
7336
|
+
is_node = true;
|
7337
|
+
}
|
7338
|
+
|
7339
|
+
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
7340
|
+
|
7341
|
+
ggml_scratch_save(ctx);
|
7342
|
+
|
7343
|
+
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
7344
|
+
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
7345
|
+
|
7346
|
+
ggml_scratch_load(ctx);
|
7347
|
+
|
7348
|
+
result->op = GGML_OP_MAP_CUSTOM1;
|
7349
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7350
|
+
result->src0 = a;
|
7351
|
+
result->opt[0] = addr_tensor;
|
7352
|
+
|
7353
|
+
return result;
|
7354
|
+
}
|
7355
|
+
|
7356
|
+
struct ggml_tensor * ggml_map_custom1_f32(
|
7357
|
+
struct ggml_context * ctx,
|
7358
|
+
struct ggml_tensor * a,
|
7359
|
+
const ggml_custom1_op_f32_t fun) {
|
7360
|
+
return ggml_map_custom1_impl_f32(ctx, a, fun, false);
|
7361
|
+
}
|
7362
|
+
|
7363
|
+
struct ggml_tensor * ggml_map_custom1_inplace_f32(
|
7364
|
+
struct ggml_context * ctx,
|
7365
|
+
struct ggml_tensor * a,
|
7366
|
+
const ggml_custom1_op_f32_t fun) {
|
7367
|
+
return ggml_map_custom1_impl_f32(ctx, a, fun, true);
|
7368
|
+
}
|
7369
|
+
|
7370
|
+
// ggml_map_custom2
|
7371
|
+
|
7372
|
+
struct ggml_tensor * ggml_map_custom2_impl_f32(
|
7373
|
+
struct ggml_context * ctx,
|
7374
|
+
struct ggml_tensor * a,
|
7375
|
+
struct ggml_tensor * b,
|
7376
|
+
const ggml_custom2_op_f32_t fun,
|
7377
|
+
bool inplace) {
|
7378
|
+
bool is_node = false;
|
7379
|
+
|
7380
|
+
if (!inplace && (a->grad || b->grad)) {
|
7381
|
+
is_node = true;
|
7382
|
+
}
|
7383
|
+
|
7384
|
+
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
7385
|
+
|
7386
|
+
ggml_scratch_save(ctx);
|
7387
|
+
|
7388
|
+
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
7389
|
+
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
7390
|
+
|
7391
|
+
ggml_scratch_load(ctx);
|
7392
|
+
|
7393
|
+
result->op = GGML_OP_MAP_CUSTOM2;
|
7394
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7395
|
+
result->src0 = a;
|
7396
|
+
result->src1 = b;
|
7397
|
+
result->opt[0] = addr_tensor;
|
7398
|
+
|
7399
|
+
return result;
|
7400
|
+
}
|
7401
|
+
|
7402
|
+
struct ggml_tensor * ggml_map_custom2_f32(
|
7403
|
+
struct ggml_context * ctx,
|
7404
|
+
struct ggml_tensor * a,
|
7405
|
+
struct ggml_tensor * b,
|
7406
|
+
const ggml_custom2_op_f32_t fun) {
|
7407
|
+
return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
|
7408
|
+
}
|
7409
|
+
|
7410
|
+
struct ggml_tensor * ggml_map_custom2_inplace_f32(
|
7411
|
+
struct ggml_context * ctx,
|
7412
|
+
struct ggml_tensor * a,
|
7413
|
+
struct ggml_tensor * b,
|
7414
|
+
const ggml_custom2_op_f32_t fun) {
|
7415
|
+
return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
|
7416
|
+
}
|
7417
|
+
|
7418
|
+
// ggml_map_custom3
|
7419
|
+
|
7420
|
+
struct ggml_tensor * ggml_map_custom3_impl_f32(
|
7421
|
+
struct ggml_context * ctx,
|
7422
|
+
struct ggml_tensor * a,
|
7423
|
+
struct ggml_tensor * b,
|
7424
|
+
struct ggml_tensor * c,
|
7425
|
+
const ggml_custom3_op_f32_t fun,
|
7426
|
+
bool inplace) {
|
7427
|
+
bool is_node = false;
|
7428
|
+
|
7429
|
+
if (!inplace && (a->grad || b->grad || c->grad)) {
|
7430
|
+
is_node = true;
|
7431
|
+
}
|
7432
|
+
|
7433
|
+
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
7434
|
+
|
7435
|
+
ggml_scratch_save(ctx);
|
7436
|
+
|
7437
|
+
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
7438
|
+
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
7439
|
+
|
7440
|
+
ggml_scratch_load(ctx);
|
7441
|
+
|
7442
|
+
result->op = GGML_OP_MAP_CUSTOM3;
|
7443
|
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7444
|
+
result->src0 = a;
|
7445
|
+
result->src1 = b;
|
7446
|
+
result->opt[0] = addr_tensor;
|
7447
|
+
result->opt[1] = c;
|
7448
|
+
|
7449
|
+
return result;
|
7450
|
+
}
|
7451
|
+
|
7452
|
+
struct ggml_tensor * ggml_map_custom3_f32(
|
7453
|
+
struct ggml_context * ctx,
|
7454
|
+
struct ggml_tensor * a,
|
7455
|
+
struct ggml_tensor * b,
|
7456
|
+
struct ggml_tensor * c,
|
7457
|
+
const ggml_custom3_op_f32_t fun) {
|
7458
|
+
return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
|
7459
|
+
}
|
7460
|
+
|
7461
|
+
struct ggml_tensor * ggml_map_custom3_inplace_f32(
|
7462
|
+
struct ggml_context * ctx,
|
7463
|
+
struct ggml_tensor * a,
|
7464
|
+
struct ggml_tensor * b,
|
7465
|
+
struct ggml_tensor * c,
|
7466
|
+
const ggml_custom3_op_f32_t fun) {
|
7467
|
+
return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
|
7468
|
+
}
|
7469
|
+
|
6902
7470
|
// ggml_cross_entropy_loss
|
6903
7471
|
|
6904
7472
|
struct ggml_tensor * ggml_cross_entropy_loss(
|
@@ -7892,7 +8460,7 @@ static void ggml_compute_forward_add_q_f32(
|
|
7892
8460
|
|
7893
8461
|
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
|
7894
8462
|
float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
|
7895
|
-
void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*
|
8463
|
+
void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
|
7896
8464
|
|
7897
8465
|
assert(ne00 % 32 == 0);
|
7898
8466
|
|
@@ -9453,8 +10021,65 @@ static void ggml_compute_forward_gelu(
|
|
9453
10021
|
GGML_ASSERT(false);
|
9454
10022
|
} break;
|
9455
10023
|
}
|
10024
|
+
}
|
10025
|
+
|
10026
|
+
// ggml_compute_forward_gelu_quick
|
10027
|
+
|
10028
|
+
static void ggml_compute_forward_gelu_quick_f32(
|
10029
|
+
const struct ggml_compute_params * params,
|
10030
|
+
const struct ggml_tensor * src0,
|
10031
|
+
struct ggml_tensor * dst) {
|
10032
|
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
10033
|
+
GGML_ASSERT(ggml_is_contiguous(dst));
|
10034
|
+
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
10035
|
+
|
10036
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
10037
|
+
return;
|
10038
|
+
}
|
10039
|
+
|
10040
|
+
const int ith = params->ith;
|
10041
|
+
const int nth = params->nth;
|
10042
|
+
|
10043
|
+
const int nc = src0->ne[0];
|
10044
|
+
const int nr = ggml_nrows(src0);
|
10045
|
+
|
10046
|
+
// rows per thread
|
10047
|
+
const int dr = (nr + nth - 1)/nth;
|
10048
|
+
|
10049
|
+
// row range for this thread
|
10050
|
+
const int ir0 = dr*ith;
|
10051
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
10052
|
+
|
10053
|
+
for (int i1 = ir0; i1 < ir1; i1++) {
|
10054
|
+
ggml_vec_gelu_quick_f32(nc,
|
10055
|
+
(float *) ((char *) dst->data + i1*( dst->nb[1])),
|
10056
|
+
(float *) ((char *) src0->data + i1*(src0->nb[1])));
|
10057
|
+
|
10058
|
+
#ifndef NDEBUG
|
10059
|
+
for (int k = 0; k < nc; k++) {
|
10060
|
+
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
10061
|
+
UNUSED(x);
|
10062
|
+
assert(!isnan(x));
|
10063
|
+
assert(!isinf(x));
|
10064
|
+
}
|
10065
|
+
#endif
|
10066
|
+
}
|
10067
|
+
}
|
9456
10068
|
|
9457
|
-
|
10069
|
+
static void ggml_compute_forward_gelu_quick(
|
10070
|
+
const struct ggml_compute_params * params,
|
10071
|
+
const struct ggml_tensor * src0,
|
10072
|
+
struct ggml_tensor * dst) {
|
10073
|
+
switch (src0->type) {
|
10074
|
+
case GGML_TYPE_F32:
|
10075
|
+
{
|
10076
|
+
ggml_compute_forward_gelu_quick_f32(params, src0, dst);
|
10077
|
+
} break;
|
10078
|
+
default:
|
10079
|
+
{
|
10080
|
+
GGML_ASSERT(false);
|
10081
|
+
} break;
|
10082
|
+
}
|
9458
10083
|
}
|
9459
10084
|
|
9460
10085
|
// ggml_compute_forward_silu
|
@@ -10852,7 +11477,7 @@ static void ggml_compute_forward_set_f32(
|
|
10852
11477
|
const int im2 = (ne12 == 0 ? 0 : ne12-1);
|
10853
11478
|
const int im3 = (ne13 == 0 ? 0 : ne13-1);
|
10854
11479
|
|
10855
|
-
GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3
|
11480
|
+
GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
|
10856
11481
|
|
10857
11482
|
GGML_ASSERT(nb10 == sizeof(float));
|
10858
11483
|
|
@@ -11573,8 +12198,9 @@ static void ggml_compute_forward_alibi_f32(
|
|
11573
12198
|
const struct ggml_tensor * src1,
|
11574
12199
|
struct ggml_tensor * dst) {
|
11575
12200
|
assert(params->ith == 0);
|
11576
|
-
|
11577
|
-
|
12201
|
+
|
12202
|
+
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
12203
|
+
GGML_ASSERT(ggml_nelements(src1) == 3);
|
11578
12204
|
|
11579
12205
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11580
12206
|
return;
|
@@ -11637,8 +12263,9 @@ static void ggml_compute_forward_alibi_f16(
|
|
11637
12263
|
const struct ggml_tensor * src1,
|
11638
12264
|
struct ggml_tensor * dst) {
|
11639
12265
|
assert(params->ith == 0);
|
11640
|
-
|
11641
|
-
|
12266
|
+
|
12267
|
+
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
12268
|
+
GGML_ASSERT(ggml_nelements(src1) == 3);
|
11642
12269
|
|
11643
12270
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11644
12271
|
return;
|
@@ -11740,15 +12367,16 @@ static void ggml_compute_forward_clamp_f32(
|
|
11740
12367
|
const struct ggml_tensor * src1,
|
11741
12368
|
struct ggml_tensor * dst) {
|
11742
12369
|
assert(params->ith == 0);
|
11743
|
-
|
11744
|
-
|
12370
|
+
|
12371
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
12372
|
+
GGML_ASSERT(ggml_nelements(src1) == 2);
|
11745
12373
|
|
11746
12374
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11747
12375
|
return;
|
11748
12376
|
}
|
11749
12377
|
|
11750
|
-
const
|
11751
|
-
const
|
12378
|
+
const float min = ((float *) src1->data)[0];
|
12379
|
+
const float max = ((float *) src1->data)[1];
|
11752
12380
|
|
11753
12381
|
const int ith = params->ith;
|
11754
12382
|
const int nth = params->nth;
|
@@ -11816,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11816
12444
|
const struct ggml_tensor * src1,
|
11817
12445
|
struct ggml_tensor * dst) {
|
11818
12446
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
11819
|
-
GGML_ASSERT(ggml_nelements(src1) ==
|
12447
|
+
GGML_ASSERT(ggml_nelements(src1) == 4);
|
11820
12448
|
|
11821
12449
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11822
12450
|
return;
|
@@ -11825,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11825
12453
|
const int n_past = ((int32_t *) src1->data)[0];
|
11826
12454
|
const int n_dims = ((int32_t *) src1->data)[1];
|
11827
12455
|
const int mode = ((int32_t *) src1->data)[2];
|
12456
|
+
const int n_ctx = ((int32_t *) src1->data)[3];
|
11828
12457
|
|
11829
12458
|
assert(n_past >= 0);
|
11830
12459
|
|
@@ -11869,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11869
12498
|
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
11870
12499
|
|
11871
12500
|
const bool is_neox = mode & 2;
|
12501
|
+
const bool is_glm = mode & 4;
|
11872
12502
|
|
11873
12503
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
11874
12504
|
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
@@ -11879,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
|
|
11879
12509
|
|
11880
12510
|
float theta = (float)p;
|
11881
12511
|
|
11882
|
-
if (
|
12512
|
+
if (is_glm) {
|
12513
|
+
theta = MIN(p, n_ctx - 2);
|
12514
|
+
float block_theta = MAX(p - (n_ctx - 2), 0);
|
12515
|
+
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
12516
|
+
const float cos_theta = cosf(theta);
|
12517
|
+
const float sin_theta = sinf(theta);
|
12518
|
+
const float cos_block_theta = cosf(block_theta);
|
12519
|
+
const float sin_block_theta = sinf(block_theta);
|
12520
|
+
|
12521
|
+
theta *= theta_scale;
|
12522
|
+
block_theta *= theta_scale;
|
12523
|
+
|
12524
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
12525
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
12526
|
+
|
12527
|
+
const float x0 = src[0];
|
12528
|
+
const float x1 = src[n_dims/2];
|
12529
|
+
const float x2 = src[n_dims];
|
12530
|
+
const float x3 = src[n_dims/2*3];
|
12531
|
+
|
12532
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
12533
|
+
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
12534
|
+
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
|
12535
|
+
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
|
12536
|
+
}
|
12537
|
+
} else if (!is_neox) {
|
11883
12538
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
11884
12539
|
const float cos_theta = cosf(theta);
|
11885
12540
|
const float sin_theta = sinf(theta);
|
@@ -11929,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
|
|
11929
12584
|
const struct ggml_tensor * src1,
|
11930
12585
|
struct ggml_tensor * dst) {
|
11931
12586
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
11932
|
-
GGML_ASSERT(ggml_nelements(src1) ==
|
12587
|
+
GGML_ASSERT(ggml_nelements(src1) == 4);
|
11933
12588
|
|
11934
12589
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11935
12590
|
return;
|
@@ -11938,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
|
|
11938
12593
|
const int n_past = ((int32_t *) src1->data)[0];
|
11939
12594
|
const int n_dims = ((int32_t *) src1->data)[1];
|
11940
12595
|
const int mode = ((int32_t *) src1->data)[2];
|
12596
|
+
const int n_ctx = ((int32_t *) src1->data)[3];
|
11941
12597
|
|
11942
12598
|
assert(n_past >= 0);
|
11943
12599
|
|
@@ -11982,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
|
|
11982
12638
|
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
11983
12639
|
|
11984
12640
|
const bool is_neox = mode & 2;
|
12641
|
+
const bool is_glm = mode & 4;
|
11985
12642
|
|
11986
12643
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
11987
12644
|
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
@@ -11992,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
|
|
11992
12649
|
|
11993
12650
|
float theta = (float)p;
|
11994
12651
|
|
11995
|
-
if (
|
12652
|
+
if (is_glm) {
|
12653
|
+
theta = MIN(p, n_ctx - 2);
|
12654
|
+
float block_theta = MAX(p - (n_ctx - 2), 0);
|
12655
|
+
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
12656
|
+
const float cos_theta = cosf(theta);
|
12657
|
+
const float sin_theta = sinf(theta);
|
12658
|
+
const float cos_block_theta = cosf(block_theta);
|
12659
|
+
const float sin_block_theta = sinf(block_theta);
|
12660
|
+
|
12661
|
+
theta *= theta_scale;
|
12662
|
+
block_theta *= theta_scale;
|
12663
|
+
|
12664
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
12665
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
12666
|
+
|
12667
|
+
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
12668
|
+
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
12669
|
+
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
|
12670
|
+
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
|
12671
|
+
|
12672
|
+
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
12673
|
+
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
12674
|
+
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
|
12675
|
+
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
|
12676
|
+
}
|
12677
|
+
} if (!is_neox) {
|
11996
12678
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
11997
12679
|
const float cos_theta = cosf(theta);
|
11998
12680
|
const float sin_theta = sinf(theta);
|
@@ -12306,9 +12988,9 @@ static void ggml_compute_forward_rope_back(
|
|
12306
12988
|
}
|
12307
12989
|
}
|
12308
12990
|
|
12309
|
-
//
|
12991
|
+
// ggml_compute_forward_conv_1d_s1_ph
|
12310
12992
|
|
12311
|
-
static void
|
12993
|
+
static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
|
12312
12994
|
const struct ggml_compute_params * params,
|
12313
12995
|
const struct ggml_tensor * src0,
|
12314
12996
|
const struct ggml_tensor * src1,
|
@@ -12428,7 +13110,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
12428
13110
|
}
|
12429
13111
|
}
|
12430
13112
|
|
12431
|
-
static void
|
13113
|
+
static void ggml_compute_forward_conv_1d_s1_ph_f32(
|
12432
13114
|
const struct ggml_compute_params * params,
|
12433
13115
|
const struct ggml_tensor * src0,
|
12434
13116
|
const struct ggml_tensor * src1,
|
@@ -12548,7 +13230,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
12548
13230
|
}
|
12549
13231
|
}
|
12550
13232
|
|
12551
|
-
static void
|
13233
|
+
static void ggml_compute_forward_conv_1d_s1_ph(
|
12552
13234
|
const struct ggml_compute_params * params,
|
12553
13235
|
const struct ggml_tensor * src0,
|
12554
13236
|
const struct ggml_tensor * src1,
|
@@ -12556,11 +13238,11 @@ static void ggml_compute_forward_conv_1d_1s(
|
|
12556
13238
|
switch (src0->type) {
|
12557
13239
|
case GGML_TYPE_F16:
|
12558
13240
|
{
|
12559
|
-
|
13241
|
+
ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
|
12560
13242
|
} break;
|
12561
13243
|
case GGML_TYPE_F32:
|
12562
13244
|
{
|
12563
|
-
|
13245
|
+
ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
|
12564
13246
|
} break;
|
12565
13247
|
default:
|
12566
13248
|
{
|
@@ -12569,9 +13251,9 @@ static void ggml_compute_forward_conv_1d_1s(
|
|
12569
13251
|
}
|
12570
13252
|
}
|
12571
13253
|
|
12572
|
-
//
|
13254
|
+
// ggml_compute_forward_conv_1d_s2_ph
|
12573
13255
|
|
12574
|
-
static void
|
13256
|
+
static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
|
12575
13257
|
const struct ggml_compute_params * params,
|
12576
13258
|
const struct ggml_tensor * src0,
|
12577
13259
|
const struct ggml_tensor * src1,
|
@@ -12691,7 +13373,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
12691
13373
|
}
|
12692
13374
|
}
|
12693
13375
|
|
12694
|
-
static void
|
13376
|
+
static void ggml_compute_forward_conv_1d_s2_ph_f32(
|
12695
13377
|
const struct ggml_compute_params * params,
|
12696
13378
|
const struct ggml_tensor * src0,
|
12697
13379
|
const struct ggml_tensor * src1,
|
@@ -12811,7 +13493,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
12811
13493
|
}
|
12812
13494
|
}
|
12813
13495
|
|
12814
|
-
static void
|
13496
|
+
static void ggml_compute_forward_conv_1d_s2_ph(
|
12815
13497
|
const struct ggml_compute_params * params,
|
12816
13498
|
const struct ggml_tensor * src0,
|
12817
13499
|
const struct ggml_tensor * src1,
|
@@ -12819,11 +13501,11 @@ static void ggml_compute_forward_conv_1d_2s(
|
|
12819
13501
|
switch (src0->type) {
|
12820
13502
|
case GGML_TYPE_F16:
|
12821
13503
|
{
|
12822
|
-
|
13504
|
+
ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
|
12823
13505
|
} break;
|
12824
13506
|
case GGML_TYPE_F32:
|
12825
13507
|
{
|
12826
|
-
|
13508
|
+
ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
|
12827
13509
|
} break;
|
12828
13510
|
default:
|
12829
13511
|
{
|
@@ -12832,18 +13514,154 @@ static void ggml_compute_forward_conv_1d_2s(
|
|
12832
13514
|
}
|
12833
13515
|
}
|
12834
13516
|
|
12835
|
-
//
|
13517
|
+
// ggml_compute_forward_conv_2d_sk_p0
|
12836
13518
|
|
12837
|
-
static void
|
13519
|
+
static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
12838
13520
|
const struct ggml_compute_params * params,
|
12839
|
-
const struct ggml_tensor *
|
12840
|
-
const struct ggml_tensor *
|
12841
|
-
|
12842
|
-
|
12843
|
-
|
12844
|
-
|
12845
|
-
|
12846
|
-
|
13521
|
+
const struct ggml_tensor * src0,
|
13522
|
+
const struct ggml_tensor * src1,
|
13523
|
+
struct ggml_tensor * dst) {
|
13524
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
13525
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
13526
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
13527
|
+
|
13528
|
+
int64_t t0 = ggml_perf_time_us();
|
13529
|
+
UNUSED(t0);
|
13530
|
+
|
13531
|
+
const int ne00 = src0->ne[0];
|
13532
|
+
const int ne01 = src0->ne[1];
|
13533
|
+
const int ne02 = src0->ne[2];
|
13534
|
+
//const int ne03 = src0->ne[3];
|
13535
|
+
|
13536
|
+
const int ne10 = src1->ne[0];
|
13537
|
+
//const int ne11 = src1->ne[1];
|
13538
|
+
const int ne12 = src1->ne[2];
|
13539
|
+
//const int ne13 = src1->ne[3];
|
13540
|
+
|
13541
|
+
const int ne0 = dst->ne[0];
|
13542
|
+
const int ne1 = dst->ne[1];
|
13543
|
+
const int ne2 = dst->ne[2];
|
13544
|
+
//const int ne3 = dst->ne[3];
|
13545
|
+
//const int ne = ne0*ne1*ne2*ne3;
|
13546
|
+
|
13547
|
+
const int nb00 = src0->nb[0];
|
13548
|
+
//const int nb01 = src0->nb[1];
|
13549
|
+
//const int nb02 = src0->nb[2];
|
13550
|
+
const int nb03 = src0->nb[3];
|
13551
|
+
|
13552
|
+
const int nb10 = src1->nb[0];
|
13553
|
+
//const int nb11 = src1->nb[1];
|
13554
|
+
const int nb12 = src1->nb[2];
|
13555
|
+
//const int nb13 = src1->nb[3];
|
13556
|
+
|
13557
|
+
//const int nb0 = dst->nb[0];
|
13558
|
+
//const int nb1 = dst->nb[1];
|
13559
|
+
const int nb2 = dst->nb[2];
|
13560
|
+
//const int nb3 = dst->nb[3];
|
13561
|
+
|
13562
|
+
const int ith = params->ith;
|
13563
|
+
const int nth = params->nth;
|
13564
|
+
|
13565
|
+
const int nk0 = ne00;
|
13566
|
+
const int nk1 = ne01;
|
13567
|
+
|
13568
|
+
// size of the convolution row - the kernel size unrolled across all channels
|
13569
|
+
const int ew0 = nk0*nk1*ne02;
|
13570
|
+
|
13571
|
+
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
13572
|
+
GGML_ASSERT(nb10 == sizeof(float));
|
13573
|
+
|
13574
|
+
if (params->type == GGML_TASK_INIT) {
|
13575
|
+
// TODO: fix this memset (wsize is overestimated)
|
13576
|
+
memset(params->wdata, 0, params->wsize);
|
13577
|
+
|
13578
|
+
// prepare source data (src1)
|
13579
|
+
{
|
13580
|
+
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
13581
|
+
|
13582
|
+
for (int i12 = 0; i12 < ne12; i12++) {
|
13583
|
+
const float * const src = (float *)((char *) src1->data + i12*nb12);
|
13584
|
+
ggml_fp16_t * dst_data = wdata;
|
13585
|
+
|
13586
|
+
for (int i1 = 0; i1 < ne1; i1++) {
|
13587
|
+
for (int i0 = 0; i0 < ne0; i0++) {
|
13588
|
+
for (int ik1 = 0; ik1 < nk1; ik1++) {
|
13589
|
+
for (int ik0 = 0; ik0 < nk0; ik0++) {
|
13590
|
+
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
|
13591
|
+
GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
|
13592
|
+
}
|
13593
|
+
}
|
13594
|
+
}
|
13595
|
+
}
|
13596
|
+
}
|
13597
|
+
}
|
13598
|
+
|
13599
|
+
return;
|
13600
|
+
}
|
13601
|
+
|
13602
|
+
if (params->type == GGML_TASK_FINALIZE) {
|
13603
|
+
return;
|
13604
|
+
}
|
13605
|
+
|
13606
|
+
// total patches in dst
|
13607
|
+
const int np = ne2;
|
13608
|
+
|
13609
|
+
// patches per thread
|
13610
|
+
const int dp = (np + nth - 1)/nth;
|
13611
|
+
|
13612
|
+
// patch range for this thread
|
13613
|
+
const int ip0 = dp*ith;
|
13614
|
+
const int ip1 = MIN(ip0 + dp, np);
|
13615
|
+
|
13616
|
+
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
13617
|
+
|
13618
|
+
for (int i2 = ip0; i2 < ip1; i2++) {
|
13619
|
+
float * dst_data = (float *)((char *) dst->data + i2*nb2);
|
13620
|
+
|
13621
|
+
for (int i1 = 0; i1 < ne1; ++i1) {
|
13622
|
+
for (int i0 = 0; i0 < ne0; ++i0) {
|
13623
|
+
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
|
13624
|
+
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
|
13625
|
+
(ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
|
13626
|
+
}
|
13627
|
+
}
|
13628
|
+
}
|
13629
|
+
}
|
13630
|
+
|
13631
|
+
static void ggml_compute_forward_conv_2d_sk_p0(
|
13632
|
+
const struct ggml_compute_params * params,
|
13633
|
+
const struct ggml_tensor * src0,
|
13634
|
+
const struct ggml_tensor * src1,
|
13635
|
+
struct ggml_tensor * dst) {
|
13636
|
+
switch (src0->type) {
|
13637
|
+
case GGML_TYPE_F16:
|
13638
|
+
{
|
13639
|
+
ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
|
13640
|
+
} break;
|
13641
|
+
case GGML_TYPE_F32:
|
13642
|
+
{
|
13643
|
+
//ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
|
13644
|
+
GGML_ASSERT(false);
|
13645
|
+
} break;
|
13646
|
+
default:
|
13647
|
+
{
|
13648
|
+
GGML_ASSERT(false);
|
13649
|
+
} break;
|
13650
|
+
}
|
13651
|
+
}
|
13652
|
+
|
13653
|
+
// ggml_compute_forward_flash_attn
|
13654
|
+
|
13655
|
+
static void ggml_compute_forward_flash_attn_f32(
|
13656
|
+
const struct ggml_compute_params * params,
|
13657
|
+
const struct ggml_tensor * q,
|
13658
|
+
const struct ggml_tensor * k,
|
13659
|
+
const struct ggml_tensor * v,
|
13660
|
+
const bool masked,
|
13661
|
+
struct ggml_tensor * dst) {
|
13662
|
+
int64_t t0 = ggml_perf_time_us();
|
13663
|
+
UNUSED(t0);
|
13664
|
+
|
12847
13665
|
const int64_t neq0 = q->ne[0];
|
12848
13666
|
const int64_t neq1 = q->ne[1];
|
12849
13667
|
const int64_t neq2 = q->ne[2];
|
@@ -13926,6 +14744,145 @@ static void ggml_compute_forward_flash_attn_back(
|
|
13926
14744
|
}
|
13927
14745
|
}
|
13928
14746
|
|
14747
|
+
// ggml_compute_forward_win_part
|
14748
|
+
|
14749
|
+
static void ggml_compute_forward_win_part_f32(
|
14750
|
+
const struct ggml_compute_params * params,
|
14751
|
+
const struct ggml_tensor * src0,
|
14752
|
+
const struct ggml_tensor * opt0,
|
14753
|
+
struct ggml_tensor * dst) {
|
14754
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14755
|
+
return;
|
14756
|
+
}
|
14757
|
+
|
14758
|
+
const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
|
14759
|
+
const int64_t ne01 = src0->ne[1];
|
14760
|
+
const int64_t ne02 = src0->ne[2];
|
14761
|
+
const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
|
14762
|
+
|
14763
|
+
const int64_t ne0 = dst->ne[0];
|
14764
|
+
const int64_t ne1 = dst->ne[1];
|
14765
|
+
const int64_t ne2 = dst->ne[2];
|
14766
|
+
const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
|
14767
|
+
|
14768
|
+
const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
|
14769
|
+
const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
|
14770
|
+
const int32_t w = ((const int32_t *)(opt0->data))[2];
|
14771
|
+
|
14772
|
+
assert(ne00 == ne0);
|
14773
|
+
assert(ne3 == nep0*nep1);
|
14774
|
+
|
14775
|
+
// TODO: optimize / multi-thread
|
14776
|
+
for (int py = 0; py < nep1; ++py) {
|
14777
|
+
for (int px = 0; px < nep0; ++px) {
|
14778
|
+
const int64_t i3 = py*nep0 + px;
|
14779
|
+
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
14780
|
+
for (int64_t i1 = 0; i1 < ne1; ++i1) {
|
14781
|
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
14782
|
+
const int64_t i02 = py*w + i2;
|
14783
|
+
const int64_t i01 = px*w + i1;
|
14784
|
+
const int64_t i00 = i0;
|
14785
|
+
|
14786
|
+
const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
|
14787
|
+
const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
|
14788
|
+
|
14789
|
+
if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
|
14790
|
+
((float *) dst->data)[i] = 0.0f;
|
14791
|
+
} else {
|
14792
|
+
((float *) dst->data)[i] = ((float *) src0->data)[j];
|
14793
|
+
}
|
14794
|
+
}
|
14795
|
+
}
|
14796
|
+
}
|
14797
|
+
}
|
14798
|
+
}
|
14799
|
+
}
|
14800
|
+
|
14801
|
+
static void ggml_compute_forward_win_part(
|
14802
|
+
const struct ggml_compute_params * params,
|
14803
|
+
const struct ggml_tensor * src0,
|
14804
|
+
const struct ggml_tensor * opt0,
|
14805
|
+
struct ggml_tensor * dst) {
|
14806
|
+
switch (src0->type) {
|
14807
|
+
case GGML_TYPE_F32:
|
14808
|
+
{
|
14809
|
+
ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
|
14810
|
+
} break;
|
14811
|
+
default:
|
14812
|
+
{
|
14813
|
+
GGML_ASSERT(false);
|
14814
|
+
} break;
|
14815
|
+
}
|
14816
|
+
}
|
14817
|
+
|
14818
|
+
// ggml_compute_forward_win_unpart
|
14819
|
+
|
14820
|
+
static void ggml_compute_forward_win_unpart_f32(
|
14821
|
+
const struct ggml_compute_params * params,
|
14822
|
+
const struct ggml_tensor * src0,
|
14823
|
+
const struct ggml_tensor * opt0,
|
14824
|
+
struct ggml_tensor * dst) {
|
14825
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14826
|
+
return;
|
14827
|
+
}
|
14828
|
+
|
14829
|
+
const int64_t ne00 = src0->ne[0];
|
14830
|
+
const int64_t ne01 = src0->ne[1];
|
14831
|
+
const int64_t ne02 = src0->ne[2];
|
14832
|
+
//const int64_t ne03 = src0->ne[3];
|
14833
|
+
|
14834
|
+
const int64_t ne0 = dst->ne[0];
|
14835
|
+
const int64_t ne1 = dst->ne[1];
|
14836
|
+
const int64_t ne2 = dst->ne[2];
|
14837
|
+
|
14838
|
+
const int32_t w = ((const int32_t *)(opt0->data))[0];
|
14839
|
+
|
14840
|
+
// padding
|
14841
|
+
const int px = (w - ne1%w)%w;
|
14842
|
+
//const int py = (w - ne2%w)%w;
|
14843
|
+
|
14844
|
+
const int npx = (px + ne1)/w;
|
14845
|
+
//const int npy = (py + ne2)/w;
|
14846
|
+
|
14847
|
+
assert(ne0 == ne00);
|
14848
|
+
|
14849
|
+
// TODO: optimize / multi-thread
|
14850
|
+
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
14851
|
+
for (int64_t i1 = 0; i1 < ne1; ++i1) {
|
14852
|
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
14853
|
+
const int ip2 = i2/w;
|
14854
|
+
const int ip1 = i1/w;
|
14855
|
+
|
14856
|
+
const int64_t i02 = i2%w;
|
14857
|
+
const int64_t i01 = i1%w;
|
14858
|
+
const int64_t i00 = i0;
|
14859
|
+
|
14860
|
+
const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
|
14861
|
+
const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
|
14862
|
+
|
14863
|
+
((float *) dst->data)[j] = ((float *) src0->data)[i];
|
14864
|
+
}
|
14865
|
+
}
|
14866
|
+
}
|
14867
|
+
}
|
14868
|
+
|
14869
|
+
static void ggml_compute_forward_win_unpart(
|
14870
|
+
const struct ggml_compute_params * params,
|
14871
|
+
const struct ggml_tensor * src0,
|
14872
|
+
const struct ggml_tensor * opt0,
|
14873
|
+
struct ggml_tensor * dst) {
|
14874
|
+
switch (src0->type) {
|
14875
|
+
case GGML_TYPE_F32:
|
14876
|
+
{
|
14877
|
+
ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
|
14878
|
+
} break;
|
14879
|
+
default:
|
14880
|
+
{
|
14881
|
+
GGML_ASSERT(false);
|
14882
|
+
} break;
|
14883
|
+
}
|
14884
|
+
}
|
14885
|
+
|
13929
14886
|
// ggml_compute_forward_map_unary
|
13930
14887
|
|
13931
14888
|
static void ggml_compute_forward_map_unary_f32(
|
@@ -14019,6 +14976,114 @@ static void ggml_compute_forward_map_binary(
|
|
14019
14976
|
}
|
14020
14977
|
}
|
14021
14978
|
|
14979
|
+
// ggml_compute_forward_map_custom1
|
14980
|
+
|
14981
|
+
static void ggml_compute_forward_map_custom1_f32(
|
14982
|
+
const struct ggml_compute_params * params,
|
14983
|
+
const struct ggml_tensor * a,
|
14984
|
+
struct ggml_tensor * dst,
|
14985
|
+
const ggml_custom1_op_f32_t fun) {
|
14986
|
+
assert(params->ith == 0);
|
14987
|
+
|
14988
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14989
|
+
return;
|
14990
|
+
}
|
14991
|
+
|
14992
|
+
fun(dst, a);
|
14993
|
+
}
|
14994
|
+
|
14995
|
+
|
14996
|
+
static void ggml_compute_forward_map_custom1(
|
14997
|
+
const struct ggml_compute_params * params,
|
14998
|
+
const struct ggml_tensor * a,
|
14999
|
+
struct ggml_tensor * dst,
|
15000
|
+
const ggml_custom1_op_f32_t fun) {
|
15001
|
+
switch (a->type) {
|
15002
|
+
case GGML_TYPE_F32:
|
15003
|
+
{
|
15004
|
+
ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
|
15005
|
+
} break;
|
15006
|
+
default:
|
15007
|
+
{
|
15008
|
+
GGML_ASSERT(false);
|
15009
|
+
} break;
|
15010
|
+
}
|
15011
|
+
}
|
15012
|
+
|
15013
|
+
// ggml_compute_forward_map_custom2
|
15014
|
+
|
15015
|
+
static void ggml_compute_forward_map_custom2_f32(
|
15016
|
+
const struct ggml_compute_params * params,
|
15017
|
+
const struct ggml_tensor * a,
|
15018
|
+
const struct ggml_tensor * b,
|
15019
|
+
struct ggml_tensor * dst,
|
15020
|
+
const ggml_custom2_op_f32_t fun) {
|
15021
|
+
assert(params->ith == 0);
|
15022
|
+
|
15023
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
15024
|
+
return;
|
15025
|
+
}
|
15026
|
+
|
15027
|
+
fun(dst, a, b);
|
15028
|
+
}
|
15029
|
+
|
15030
|
+
|
15031
|
+
static void ggml_compute_forward_map_custom2(
|
15032
|
+
const struct ggml_compute_params * params,
|
15033
|
+
const struct ggml_tensor * a,
|
15034
|
+
const struct ggml_tensor * b,
|
15035
|
+
struct ggml_tensor * dst,
|
15036
|
+
const ggml_custom2_op_f32_t fun) {
|
15037
|
+
switch (a->type) {
|
15038
|
+
case GGML_TYPE_F32:
|
15039
|
+
{
|
15040
|
+
ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
|
15041
|
+
} break;
|
15042
|
+
default:
|
15043
|
+
{
|
15044
|
+
GGML_ASSERT(false);
|
15045
|
+
} break;
|
15046
|
+
}
|
15047
|
+
}
|
15048
|
+
|
15049
|
+
// ggml_compute_forward_map_custom3
|
15050
|
+
|
15051
|
+
static void ggml_compute_forward_map_custom3_f32(
|
15052
|
+
const struct ggml_compute_params * params,
|
15053
|
+
const struct ggml_tensor * a,
|
15054
|
+
const struct ggml_tensor * b,
|
15055
|
+
const struct ggml_tensor * c,
|
15056
|
+
struct ggml_tensor * dst,
|
15057
|
+
const ggml_custom3_op_f32_t fun) {
|
15058
|
+
assert(params->ith == 0);
|
15059
|
+
|
15060
|
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
15061
|
+
return;
|
15062
|
+
}
|
15063
|
+
|
15064
|
+
fun(dst, a, b, c);
|
15065
|
+
}
|
15066
|
+
|
15067
|
+
|
15068
|
+
static void ggml_compute_forward_map_custom3(
|
15069
|
+
const struct ggml_compute_params * params,
|
15070
|
+
const struct ggml_tensor * a,
|
15071
|
+
const struct ggml_tensor * b,
|
15072
|
+
const struct ggml_tensor * c,
|
15073
|
+
struct ggml_tensor * dst,
|
15074
|
+
const ggml_custom3_op_f32_t fun) {
|
15075
|
+
switch (a->type) {
|
15076
|
+
case GGML_TYPE_F32:
|
15077
|
+
{
|
15078
|
+
ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
|
15079
|
+
} break;
|
15080
|
+
default:
|
15081
|
+
{
|
15082
|
+
GGML_ASSERT(false);
|
15083
|
+
} break;
|
15084
|
+
}
|
15085
|
+
}
|
15086
|
+
|
14022
15087
|
// ggml_compute_forward_cross_entropy_loss
|
14023
15088
|
|
14024
15089
|
static void ggml_compute_forward_cross_entropy_loss_f32(
|
@@ -14309,7 +15374,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14309
15374
|
if (skip_cpu) {
|
14310
15375
|
return;
|
14311
15376
|
}
|
14312
|
-
GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
|
15377
|
+
GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
|
14313
15378
|
GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
|
14314
15379
|
#endif // GGML_USE_CUBLAS
|
14315
15380
|
|
@@ -14398,6 +15463,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14398
15463
|
{
|
14399
15464
|
ggml_compute_forward_gelu(params, tensor->src0, tensor);
|
14400
15465
|
} break;
|
15466
|
+
case GGML_OP_GELU_QUICK:
|
15467
|
+
{
|
15468
|
+
ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
|
15469
|
+
} break;
|
14401
15470
|
case GGML_OP_SILU:
|
14402
15471
|
{
|
14403
15472
|
ggml_compute_forward_silu(params, tensor->src0, tensor);
|
@@ -14502,19 +15571,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14502
15571
|
{
|
14503
15572
|
ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
|
14504
15573
|
} break;
|
14505
|
-
case
|
15574
|
+
case GGML_OP_CONV_1D_S1_PH:
|
15575
|
+
{
|
15576
|
+
ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
|
15577
|
+
} break;
|
15578
|
+
case GGML_OP_CONV_1D_S2_PH:
|
14506
15579
|
{
|
14507
|
-
|
15580
|
+
ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
|
14508
15581
|
} break;
|
14509
|
-
case
|
15582
|
+
case GGML_OP_CONV_2D_SK_P0:
|
14510
15583
|
{
|
14511
|
-
|
15584
|
+
ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
|
14512
15585
|
} break;
|
14513
15586
|
case GGML_OP_FLASH_ATTN:
|
14514
15587
|
{
|
14515
|
-
int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
|
15588
|
+
const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
|
14516
15589
|
GGML_ASSERT(t == 0 || t == 1);
|
14517
|
-
bool masked = t != 0;
|
15590
|
+
const bool masked = t != 0;
|
14518
15591
|
ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
|
14519
15592
|
} break;
|
14520
15593
|
case GGML_OP_FLASH_FF:
|
@@ -14528,6 +15601,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14528
15601
|
bool masked = t != 0;
|
14529
15602
|
ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
|
14530
15603
|
} break;
|
15604
|
+
case GGML_OP_WIN_PART:
|
15605
|
+
{
|
15606
|
+
ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
|
15607
|
+
} break;
|
15608
|
+
case GGML_OP_WIN_UNPART:
|
15609
|
+
{
|
15610
|
+
ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
|
15611
|
+
} break;
|
14531
15612
|
case GGML_OP_MAP_UNARY:
|
14532
15613
|
{
|
14533
15614
|
const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
|
@@ -14540,6 +15621,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14540
15621
|
ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
|
14541
15622
|
}
|
14542
15623
|
break;
|
15624
|
+
case GGML_OP_MAP_CUSTOM1:
|
15625
|
+
{
|
15626
|
+
const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
|
15627
|
+
ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
|
15628
|
+
}
|
15629
|
+
break;
|
15630
|
+
case GGML_OP_MAP_CUSTOM2:
|
15631
|
+
{
|
15632
|
+
const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
|
15633
|
+
ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
|
15634
|
+
}
|
15635
|
+
break;
|
15636
|
+
case GGML_OP_MAP_CUSTOM3:
|
15637
|
+
{
|
15638
|
+
const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
|
15639
|
+
ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
|
15640
|
+
}
|
15641
|
+
break;
|
14543
15642
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
14544
15643
|
{
|
14545
15644
|
ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
|
@@ -14799,6 +15898,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
14799
15898
|
{
|
14800
15899
|
GGML_ASSERT(false); // TODO: not implemented
|
14801
15900
|
} break;
|
15901
|
+
case GGML_OP_GELU_QUICK:
|
15902
|
+
{
|
15903
|
+
GGML_ASSERT(false); // TODO: not implemented
|
15904
|
+
} break;
|
14802
15905
|
case GGML_OP_ALIBI:
|
14803
15906
|
{
|
14804
15907
|
GGML_ASSERT(false); // TODO: not implemented
|
@@ -15144,28 +16247,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15144
16247
|
{
|
15145
16248
|
if (src0->grad) {
|
15146
16249
|
assert(src1->type == GGML_TYPE_I32);
|
15147
|
-
assert(ggml_nelements(src1) ==
|
16250
|
+
assert(ggml_nelements(src1) == 4);
|
15148
16251
|
const int n_past = ((int32_t *) src1->data)[0];
|
15149
16252
|
const int n_dims = ((int32_t *) src1->data)[1];
|
15150
16253
|
const int mode = ((int32_t *) src1->data)[2];
|
16254
|
+
const int n_ctx = ((int32_t *) src1->data)[3];
|
15151
16255
|
src0->grad = ggml_add_impl(ctx,
|
15152
16256
|
src0->grad,
|
15153
16257
|
ggml_rope(ctx,
|
15154
16258
|
tensor->grad,
|
15155
16259
|
n_past,
|
15156
16260
|
n_dims,
|
15157
|
-
mode
|
16261
|
+
mode,
|
16262
|
+
n_ctx),
|
15158
16263
|
inplace);
|
15159
16264
|
}
|
15160
16265
|
if (src1->grad) {
|
15161
16266
|
// noop
|
15162
16267
|
}
|
15163
16268
|
} break;
|
15164
|
-
case
|
16269
|
+
case GGML_OP_CONV_1D_S1_PH:
|
15165
16270
|
{
|
15166
16271
|
GGML_ASSERT(false); // TODO: not implemented
|
15167
16272
|
} break;
|
15168
|
-
case
|
16273
|
+
case GGML_OP_CONV_1D_S2_PH:
|
16274
|
+
{
|
16275
|
+
GGML_ASSERT(false); // TODO: not implemented
|
16276
|
+
} break;
|
16277
|
+
case GGML_OP_CONV_2D_SK_P0:
|
15169
16278
|
{
|
15170
16279
|
GGML_ASSERT(false); // TODO: not implemented
|
15171
16280
|
} break;
|
@@ -15334,8 +16443,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15334
16443
|
{
|
15335
16444
|
GGML_ASSERT(false); // not supported
|
15336
16445
|
} break;
|
16446
|
+
case GGML_OP_WIN_PART:
|
16447
|
+
case GGML_OP_WIN_UNPART:
|
15337
16448
|
case GGML_OP_MAP_UNARY:
|
15338
16449
|
case GGML_OP_MAP_BINARY:
|
16450
|
+
case GGML_OP_MAP_CUSTOM1:
|
16451
|
+
case GGML_OP_MAP_CUSTOM2:
|
16452
|
+
case GGML_OP_MAP_CUSTOM3:
|
15339
16453
|
{
|
15340
16454
|
GGML_ASSERT(false); // not supported
|
15341
16455
|
} break;
|
@@ -15407,7 +16521,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|
15407
16521
|
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
|
15408
16522
|
|
15409
16523
|
if (strlen(node->name) == 0) {
|
15410
|
-
|
16524
|
+
ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
|
15411
16525
|
}
|
15412
16526
|
|
15413
16527
|
cgraph->leafs[cgraph->n_leafs] = node;
|
@@ -15416,7 +16530,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|
15416
16530
|
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
|
15417
16531
|
|
15418
16532
|
if (strlen(node->name) == 0) {
|
15419
|
-
|
16533
|
+
ggml_format_name(node, "node_%d", cgraph->n_nodes);
|
15420
16534
|
}
|
15421
16535
|
|
15422
16536
|
cgraph->nodes[cgraph->n_nodes] = node;
|
@@ -15570,68 +16684,173 @@ typedef pthread_t ggml_thread_t;
|
|
15570
16684
|
|
15571
16685
|
#endif
|
15572
16686
|
|
16687
|
+
// Android's libc implementation "bionic" does not support setting affinity
|
16688
|
+
#if defined(__linux__) && !defined(__BIONIC__)
|
16689
|
+
void set_numa_thread_affinity(int thread_n, int n_threads) {
|
16690
|
+
if (!ggml_is_numa()) {
|
16691
|
+
return;
|
16692
|
+
}
|
16693
|
+
|
16694
|
+
// run thread on node_num thread_n / (threads per node)
|
16695
|
+
const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
|
16696
|
+
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
|
16697
|
+
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
16698
|
+
|
16699
|
+
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
16700
|
+
CPU_ZERO_S(setsize, cpus);
|
16701
|
+
for (size_t i = 0; i < node->n_cpus; ++i) {
|
16702
|
+
CPU_SET_S(node->cpus[i], setsize, cpus);
|
16703
|
+
}
|
16704
|
+
|
16705
|
+
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
16706
|
+
if (rv) {
|
16707
|
+
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
|
16708
|
+
strerror(rv));
|
16709
|
+
}
|
16710
|
+
|
16711
|
+
CPU_FREE(cpus);
|
16712
|
+
}
|
16713
|
+
|
16714
|
+
void clear_numa_thread_affinity(void) {
|
16715
|
+
if (!ggml_is_numa()) {
|
16716
|
+
return;
|
16717
|
+
}
|
16718
|
+
|
16719
|
+
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
16720
|
+
|
16721
|
+
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
16722
|
+
CPU_ZERO_S(setsize, cpus);
|
16723
|
+
for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
|
16724
|
+
CPU_SET_S(i, setsize, cpus);
|
16725
|
+
}
|
16726
|
+
|
16727
|
+
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
|
16728
|
+
if (rv) {
|
16729
|
+
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
|
16730
|
+
strerror(rv));
|
16731
|
+
}
|
16732
|
+
|
16733
|
+
CPU_FREE(cpus);
|
16734
|
+
}
|
16735
|
+
#else
|
16736
|
+
// TODO: Windows etc.
|
16737
|
+
// (the linux implementation may also work on BSD, someone should test)
|
16738
|
+
void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
|
16739
|
+
void clear_numa_thread_affinity(void) {}
|
16740
|
+
#endif
|
16741
|
+
|
15573
16742
|
struct ggml_compute_state_shared {
|
15574
|
-
|
16743
|
+
struct ggml_cgraph * cgraph;
|
16744
|
+
|
16745
|
+
int64_t perf_node_start_cycles;
|
16746
|
+
int64_t perf_node_start_time_us;
|
15575
16747
|
|
15576
16748
|
int n_threads;
|
15577
16749
|
|
15578
16750
|
// synchronization primitives
|
15579
|
-
atomic_int
|
15580
|
-
|
15581
|
-
atomic_bool stop; // stop all threads
|
16751
|
+
atomic_int n_active; // num active threads
|
16752
|
+
atomic_int node_n; // active graph node
|
15582
16753
|
};
|
15583
16754
|
|
15584
16755
|
struct ggml_compute_state {
|
15585
16756
|
ggml_thread_t thrd;
|
15586
|
-
|
15587
|
-
struct ggml_compute_params params;
|
15588
|
-
struct ggml_tensor * node;
|
15589
|
-
|
16757
|
+
int ith;
|
15590
16758
|
struct ggml_compute_state_shared * shared;
|
15591
16759
|
};
|
15592
16760
|
|
16761
|
+
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
16762
|
+
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
|
16763
|
+
int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
|
16764
|
+
|
16765
|
+
node->perf_runs++;
|
16766
|
+
node->perf_cycles += cycles_cur;
|
16767
|
+
node->perf_time_us += time_us_cur;
|
16768
|
+
}
|
16769
|
+
|
15593
16770
|
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
15594
16771
|
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
16772
|
+
struct ggml_cgraph * cgraph = state->shared->cgraph;
|
15595
16773
|
|
15596
16774
|
const int n_threads = state->shared->n_threads;
|
16775
|
+
set_numa_thread_affinity(state->ith, n_threads);
|
16776
|
+
|
16777
|
+
int node_n = -1;
|
15597
16778
|
|
15598
16779
|
while (true) {
|
15599
|
-
if (
|
15600
|
-
|
15601
|
-
|
15602
|
-
|
15603
|
-
|
15604
|
-
|
15605
|
-
|
15606
|
-
|
15607
|
-
|
16780
|
+
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
16781
|
+
// all other threads are finished and spinning
|
16782
|
+
// do finalize and init here so we don't have synchronize again
|
16783
|
+
struct ggml_compute_params params = {
|
16784
|
+
/*.type =*/ GGML_TASK_FINALIZE,
|
16785
|
+
/*.ith =*/ 0,
|
16786
|
+
/*.nth =*/ 0,
|
16787
|
+
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
|
16788
|
+
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
|
16789
|
+
};
|
16790
|
+
|
16791
|
+
if (node_n != -1) {
|
16792
|
+
/* FINALIZE */
|
16793
|
+
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
|
16794
|
+
params.nth = node->n_tasks;
|
16795
|
+
ggml_compute_forward(&params, node);
|
16796
|
+
ggml_graph_compute_perf_stats_node(node, state->shared);
|
15608
16797
|
}
|
15609
|
-
}
|
15610
16798
|
|
15611
|
-
|
16799
|
+
// distribute new work or execute it direct if 1T
|
16800
|
+
while (++node_n < cgraph->n_nodes) {
|
16801
|
+
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
16802
|
+
|
16803
|
+
struct ggml_tensor * node = cgraph->nodes[node_n];
|
16804
|
+
|
16805
|
+
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
16806
|
+
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
16807
|
+
|
16808
|
+
/* INIT */
|
16809
|
+
params.type = GGML_TASK_INIT;
|
16810
|
+
params.nth = node->n_tasks;
|
16811
|
+
ggml_compute_forward(&params, node);
|
16812
|
+
|
16813
|
+
if (node->n_tasks == 1) {
|
16814
|
+
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
16815
|
+
// they do something more efficient than spinning (?)
|
16816
|
+
params.type = GGML_TASK_COMPUTE;
|
16817
|
+
ggml_compute_forward(&params, node);
|
15612
16818
|
|
15613
|
-
|
15614
|
-
|
15615
|
-
|
15616
|
-
|
16819
|
+
params.type = GGML_TASK_FINALIZE;
|
16820
|
+
ggml_compute_forward(&params, node);
|
16821
|
+
ggml_graph_compute_perf_stats_node(node, state->shared);
|
16822
|
+
} else {
|
16823
|
+
break;
|
16824
|
+
}
|
15617
16825
|
}
|
15618
|
-
|
15619
|
-
|
16826
|
+
|
16827
|
+
atomic_store(&state->shared->n_active, n_threads);
|
16828
|
+
atomic_store(&state->shared->node_n, node_n);
|
16829
|
+
} else {
|
16830
|
+
// wait for other threads to finish
|
16831
|
+
const int last = node_n;
|
16832
|
+
do {
|
16833
|
+
sched_yield();
|
16834
|
+
node_n = atomic_load(&state->shared->node_n);
|
16835
|
+
} while (node_n == last);
|
15620
16836
|
}
|
15621
16837
|
|
15622
16838
|
// check if we should stop
|
15623
|
-
if (
|
15624
|
-
break;
|
15625
|
-
}
|
16839
|
+
if (node_n >= cgraph->n_nodes) break;
|
15626
16840
|
|
15627
|
-
|
15628
|
-
|
15629
|
-
ggml_compute_forward(&state->params, state->node);
|
15630
|
-
}
|
16841
|
+
/* COMPUTE */
|
16842
|
+
struct ggml_tensor * node = cgraph->nodes[node_n];
|
15631
16843
|
|
15632
|
-
|
15633
|
-
|
15634
|
-
|
16844
|
+
struct ggml_compute_params params = {
|
16845
|
+
/*.type =*/ GGML_TASK_COMPUTE,
|
16846
|
+
/*.ith =*/ state->ith,
|
16847
|
+
/*.nth =*/ node->n_tasks,
|
16848
|
+
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
|
16849
|
+
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
|
16850
|
+
};
|
16851
|
+
|
16852
|
+
if (state->ith < node->n_tasks) {
|
16853
|
+
ggml_compute_forward(&params, node);
|
15635
16854
|
}
|
15636
16855
|
}
|
15637
16856
|
|
@@ -15642,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15642
16861
|
const int n_threads = cgraph->n_threads;
|
15643
16862
|
|
15644
16863
|
struct ggml_compute_state_shared state_shared = {
|
15645
|
-
/*.
|
15646
|
-
/*.
|
15647
|
-
/*.
|
15648
|
-
/*.
|
15649
|
-
/*.
|
16864
|
+
/*.cgraph =*/ cgraph,
|
16865
|
+
/*.perf_node_start_cycles =*/ 0,
|
16866
|
+
/*.perf_node_start_time_us =*/ 0,
|
16867
|
+
/*.n_threads =*/ n_threads,
|
16868
|
+
/*.n_active =*/ n_threads,
|
16869
|
+
/*.node_n =*/ -1,
|
15650
16870
|
};
|
15651
|
-
struct ggml_compute_state * workers =
|
15652
|
-
|
15653
|
-
// create thread pool
|
15654
|
-
if (n_threads > 1) {
|
15655
|
-
ggml_lock_init(&state_shared.spin);
|
15656
|
-
|
15657
|
-
atomic_store(&state_shared.has_work, true);
|
15658
|
-
|
15659
|
-
for (int j = 0; j < n_threads - 1; j++) {
|
15660
|
-
workers[j] = (struct ggml_compute_state) {
|
15661
|
-
.thrd = 0,
|
15662
|
-
.params = {
|
15663
|
-
.type = GGML_TASK_COMPUTE,
|
15664
|
-
.ith = j + 1,
|
15665
|
-
.nth = n_threads,
|
15666
|
-
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
|
15667
|
-
.wdata = cgraph->work ? cgraph->work->data : NULL,
|
15668
|
-
},
|
15669
|
-
.node = NULL,
|
15670
|
-
.shared = &state_shared,
|
15671
|
-
};
|
15672
|
-
|
15673
|
-
int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
15674
|
-
GGML_ASSERT(rc == 0);
|
15675
|
-
UNUSED(rc);
|
15676
|
-
}
|
15677
|
-
}
|
16871
|
+
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
15678
16872
|
|
15679
16873
|
// initialize tasks + work buffer
|
15680
16874
|
{
|
@@ -15742,6 +16936,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15742
16936
|
} break;
|
15743
16937
|
case GGML_OP_MUL:
|
15744
16938
|
case GGML_OP_GELU:
|
16939
|
+
case GGML_OP_GELU_QUICK:
|
15745
16940
|
case GGML_OP_SILU:
|
15746
16941
|
case GGML_OP_SILU_BACK:
|
15747
16942
|
case GGML_OP_NORM:
|
@@ -15817,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15817
17012
|
} break;
|
15818
17013
|
case GGML_OP_SCALE:
|
15819
17014
|
{
|
15820
|
-
node->n_tasks =
|
17015
|
+
node->n_tasks = 1;
|
15821
17016
|
} break;
|
15822
17017
|
case GGML_OP_SET:
|
15823
17018
|
case GGML_OP_CONT:
|
@@ -15848,8 +17043,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15848
17043
|
{
|
15849
17044
|
node->n_tasks = 1; //TODO
|
15850
17045
|
} break;
|
15851
|
-
case
|
15852
|
-
case
|
17046
|
+
case GGML_OP_CONV_1D_S1_PH:
|
17047
|
+
case GGML_OP_CONV_1D_S2_PH:
|
15853
17048
|
{
|
15854
17049
|
node->n_tasks = n_threads;
|
15855
17050
|
|
@@ -15876,6 +17071,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15876
17071
|
GGML_ASSERT(false);
|
15877
17072
|
}
|
15878
17073
|
|
17074
|
+
work_size = MAX(work_size, cur);
|
17075
|
+
} break;
|
17076
|
+
case GGML_OP_CONV_2D_SK_P0:
|
17077
|
+
{
|
17078
|
+
node->n_tasks = n_threads;
|
17079
|
+
|
17080
|
+
GGML_ASSERT(node->src1->ne[3] == 1);
|
17081
|
+
|
17082
|
+
const int64_t ne00 = node->src0->ne[0]; // W
|
17083
|
+
const int64_t ne01 = node->src0->ne[1]; // H
|
17084
|
+
const int64_t ne02 = node->src0->ne[2]; // C
|
17085
|
+
const int64_t ne03 = node->src0->ne[3]; // N
|
17086
|
+
|
17087
|
+
const int64_t ne10 = node->src1->ne[0]; // W
|
17088
|
+
const int64_t ne11 = node->src1->ne[1]; // H
|
17089
|
+
const int64_t ne12 = node->src1->ne[2]; // C
|
17090
|
+
|
17091
|
+
const int64_t nk = ne00*ne01;
|
17092
|
+
|
17093
|
+
UNUSED(ne02);
|
17094
|
+
UNUSED(ne03);
|
17095
|
+
UNUSED(nk);
|
17096
|
+
|
17097
|
+
size_t cur = 0;
|
17098
|
+
|
17099
|
+
if (node->src0->type == GGML_TYPE_F16 &&
|
17100
|
+
node->src1->type == GGML_TYPE_F32) {
|
17101
|
+
cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
|
17102
|
+
} else if (node->src0->type == GGML_TYPE_F32 &&
|
17103
|
+
node->src1->type == GGML_TYPE_F32) {
|
17104
|
+
cur = sizeof(float)* (ne10*ne11*ne12);
|
17105
|
+
} else {
|
17106
|
+
GGML_ASSERT(false);
|
17107
|
+
}
|
17108
|
+
|
15879
17109
|
work_size = MAX(work_size, cur);
|
15880
17110
|
} break;
|
15881
17111
|
case GGML_OP_FLASH_ATTN:
|
@@ -15937,8 +17167,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15937
17167
|
|
15938
17168
|
work_size = MAX(work_size, cur);
|
15939
17169
|
} break;
|
17170
|
+
case GGML_OP_WIN_PART:
|
17171
|
+
case GGML_OP_WIN_UNPART:
|
15940
17172
|
case GGML_OP_MAP_UNARY:
|
15941
17173
|
case GGML_OP_MAP_BINARY:
|
17174
|
+
case GGML_OP_MAP_CUSTOM1:
|
17175
|
+
case GGML_OP_MAP_CUSTOM2:
|
17176
|
+
case GGML_OP_MAP_CUSTOM3:
|
15942
17177
|
{
|
15943
17178
|
node->n_tasks = 1;
|
15944
17179
|
} break;
|
@@ -15981,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15981
17216
|
}
|
15982
17217
|
}
|
15983
17218
|
|
15984
|
-
|
15985
|
-
|
15986
|
-
|
15987
|
-
|
15988
|
-
|
15989
|
-
|
15990
|
-
|
15991
|
-
|
15992
|
-
// TODO: this could be used to avoid unnecessary computations, but it needs to be improved
|
15993
|
-
//if (node->grad == NULL && node->perf_runs > 0) {
|
15994
|
-
// continue;
|
15995
|
-
//}
|
15996
|
-
|
15997
|
-
const int64_t perf_node_start_cycles = ggml_perf_cycles();
|
15998
|
-
const int64_t perf_node_start_time_us = ggml_perf_time_us();
|
15999
|
-
|
16000
|
-
// INIT
|
16001
|
-
struct ggml_compute_params params = {
|
16002
|
-
/*.type =*/ GGML_TASK_INIT,
|
16003
|
-
/*.ith =*/ 0,
|
16004
|
-
/*.nth =*/ node->n_tasks,
|
16005
|
-
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
|
16006
|
-
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
|
16007
|
-
};
|
16008
|
-
|
16009
|
-
ggml_compute_forward(&params, node);
|
16010
|
-
|
16011
|
-
// COMPUTE
|
16012
|
-
if (node->n_tasks > 1) {
|
16013
|
-
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
|
16014
|
-
atomic_store(&state_shared.has_work, false);
|
16015
|
-
}
|
16016
|
-
|
16017
|
-
while (atomic_load(&state_shared.has_work)) {
|
16018
|
-
ggml_lock_lock (&state_shared.spin);
|
16019
|
-
ggml_lock_unlock(&state_shared.spin);
|
16020
|
-
}
|
16021
|
-
|
16022
|
-
// launch thread pool
|
16023
|
-
for (int j = 0; j < n_threads - 1; j++) {
|
16024
|
-
workers[j].params = (struct ggml_compute_params) {
|
16025
|
-
.type = GGML_TASK_COMPUTE,
|
16026
|
-
.ith = j + 1,
|
16027
|
-
.nth = node->n_tasks,
|
16028
|
-
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
|
16029
|
-
.wdata = cgraph->work ? cgraph->work->data : NULL,
|
16030
|
-
};
|
16031
|
-
workers[j].node = node;
|
16032
|
-
}
|
16033
|
-
|
16034
|
-
atomic_fetch_sub(&state_shared.n_ready, 1);
|
16035
|
-
|
16036
|
-
while (atomic_load(&state_shared.n_ready) > 0) {
|
16037
|
-
ggml_lock_lock (&state_shared.spin);
|
16038
|
-
ggml_lock_unlock(&state_shared.spin);
|
16039
|
-
}
|
16040
|
-
|
16041
|
-
atomic_store(&state_shared.has_work, true);
|
16042
|
-
}
|
16043
|
-
|
16044
|
-
params.type = GGML_TASK_COMPUTE;
|
16045
|
-
ggml_compute_forward(&params, node);
|
16046
|
-
|
16047
|
-
// wait for thread pool
|
16048
|
-
if (node->n_tasks > 1) {
|
16049
|
-
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
|
16050
|
-
atomic_store(&state_shared.has_work, false);
|
16051
|
-
}
|
16052
|
-
|
16053
|
-
while (atomic_load(&state_shared.has_work)) {
|
16054
|
-
ggml_lock_lock (&state_shared.spin);
|
16055
|
-
ggml_lock_unlock(&state_shared.spin);
|
16056
|
-
}
|
16057
|
-
|
16058
|
-
atomic_fetch_sub(&state_shared.n_ready, 1);
|
16059
|
-
|
16060
|
-
while (atomic_load(&state_shared.n_ready) != 0) {
|
16061
|
-
ggml_lock_lock (&state_shared.spin);
|
16062
|
-
ggml_lock_unlock(&state_shared.spin);
|
16063
|
-
}
|
16064
|
-
}
|
16065
|
-
|
16066
|
-
// FINALIZE
|
16067
|
-
if (node->n_tasks > 1) {
|
16068
|
-
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
|
16069
|
-
atomic_store(&state_shared.has_work, false);
|
16070
|
-
}
|
16071
|
-
|
16072
|
-
while (atomic_load(&state_shared.has_work)) {
|
16073
|
-
ggml_lock_lock (&state_shared.spin);
|
16074
|
-
ggml_lock_unlock(&state_shared.spin);
|
16075
|
-
}
|
16076
|
-
|
16077
|
-
// launch thread pool
|
16078
|
-
for (int j = 0; j < n_threads - 1; j++) {
|
16079
|
-
workers[j].params = (struct ggml_compute_params) {
|
16080
|
-
.type = GGML_TASK_FINALIZE,
|
16081
|
-
.ith = j + 1,
|
16082
|
-
.nth = node->n_tasks,
|
16083
|
-
.wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
|
16084
|
-
.wdata = cgraph->work ? cgraph->work->data : NULL,
|
16085
|
-
};
|
16086
|
-
workers[j].node = node;
|
16087
|
-
}
|
16088
|
-
|
16089
|
-
atomic_fetch_sub(&state_shared.n_ready, 1);
|
16090
|
-
|
16091
|
-
while (atomic_load(&state_shared.n_ready) > 0) {
|
16092
|
-
ggml_lock_lock (&state_shared.spin);
|
16093
|
-
ggml_lock_unlock(&state_shared.spin);
|
16094
|
-
}
|
17219
|
+
// create thread pool
|
17220
|
+
if (n_threads > 1) {
|
17221
|
+
for (int j = 1; j < n_threads; ++j) {
|
17222
|
+
workers[j] = (struct ggml_compute_state) {
|
17223
|
+
.thrd = 0,
|
17224
|
+
.ith = j,
|
17225
|
+
.shared = &state_shared,
|
17226
|
+
};
|
16095
17227
|
|
16096
|
-
|
17228
|
+
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
17229
|
+
GGML_ASSERT(rc == 0);
|
16097
17230
|
}
|
17231
|
+
}
|
17232
|
+
workers[0].ith = 0;
|
17233
|
+
workers[0].shared = &state_shared;
|
16098
17234
|
|
16099
|
-
|
16100
|
-
|
16101
|
-
|
16102
|
-
// wait for thread pool
|
16103
|
-
if (node->n_tasks > 1) {
|
16104
|
-
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
|
16105
|
-
atomic_store(&state_shared.has_work, false);
|
16106
|
-
}
|
16107
|
-
|
16108
|
-
while (atomic_load(&state_shared.has_work)) {
|
16109
|
-
ggml_lock_lock (&state_shared.spin);
|
16110
|
-
ggml_lock_unlock(&state_shared.spin);
|
16111
|
-
}
|
16112
|
-
|
16113
|
-
atomic_fetch_sub(&state_shared.n_ready, 1);
|
17235
|
+
const int64_t perf_start_cycles = ggml_perf_cycles();
|
17236
|
+
const int64_t perf_start_time_us = ggml_perf_time_us();
|
16114
17237
|
|
16115
|
-
|
16116
|
-
|
16117
|
-
ggml_lock_unlock(&state_shared.spin);
|
16118
|
-
}
|
16119
|
-
}
|
17238
|
+
// this is a work thread too
|
17239
|
+
ggml_graph_compute_thread(&workers[0]);
|
16120
17240
|
|
16121
|
-
|
16122
|
-
|
16123
|
-
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
|
16124
|
-
int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
|
16125
|
-
|
16126
|
-
node->perf_runs++;
|
16127
|
-
node->perf_cycles += perf_cycles_cur;
|
16128
|
-
node->perf_time_us += perf_time_us_cur;
|
16129
|
-
}
|
16130
|
-
}
|
17241
|
+
// don't leave affinity set on the main thread
|
17242
|
+
clear_numa_thread_affinity();
|
16131
17243
|
|
16132
17244
|
// join thread pool
|
16133
17245
|
if (n_threads > 1) {
|
16134
|
-
|
16135
|
-
|
16136
|
-
|
16137
|
-
for (int j = 0; j < n_threads - 1; j++) {
|
16138
|
-
int rc = ggml_thread_join(workers[j].thrd, NULL);
|
17246
|
+
for (int j = 1; j < n_threads; j++) {
|
17247
|
+
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
16139
17248
|
GGML_ASSERT(rc == 0);
|
16140
|
-
UNUSED(rc);
|
16141
17249
|
}
|
16142
|
-
|
16143
|
-
ggml_lock_destroy(&state_shared.spin);
|
16144
17250
|
}
|
16145
17251
|
|
16146
17252
|
// performance stats (graph)
|
@@ -16469,16 +17575,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
16469
17575
|
|
16470
17576
|
if (!*ctx_data) {
|
16471
17577
|
fprintf(stderr, "%s: failed to create ggml context\n", __func__);
|
17578
|
+
fclose(fin);
|
16472
17579
|
return result;
|
16473
17580
|
}
|
16474
17581
|
}
|
16475
17582
|
|
16476
17583
|
data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
|
16477
17584
|
|
16478
|
-
|
16479
|
-
|
16480
|
-
|
16481
|
-
|
17585
|
+
{
|
17586
|
+
const size_t ret = fread(data->data, sizeof(char), fsize, fin);
|
17587
|
+
if (ret != fsize) {
|
17588
|
+
fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
|
17589
|
+
fclose(fin);
|
17590
|
+
return result;
|
17591
|
+
}
|
16482
17592
|
}
|
16483
17593
|
|
16484
17594
|
fclose(fin);
|
@@ -16758,6 +17868,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
|
|
16758
17868
|
return NULL;
|
16759
17869
|
}
|
16760
17870
|
|
17871
|
+
static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
17872
|
+
struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
|
17873
|
+
struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
|
17874
|
+
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
|
17875
|
+
gparent0 ? (void *) gparent0 : (void *) parent,
|
17876
|
+
gparent0 ? "g" : "x",
|
17877
|
+
gparent ? (void *) gparent : (void *) node,
|
17878
|
+
gparent ? "g" : "x",
|
17879
|
+
gparent ? "empty" : "vee",
|
17880
|
+
gparent ? "dashed" : "solid",
|
17881
|
+
label);
|
17882
|
+
}
|
17883
|
+
|
17884
|
+
static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
|
17885
|
+
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
|
17886
|
+
(void *) parent, "x",
|
17887
|
+
(void *) node, "x",
|
17888
|
+
label);
|
17889
|
+
}
|
17890
|
+
|
16761
17891
|
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
|
16762
17892
|
char color[16];
|
16763
17893
|
|
@@ -16793,7 +17923,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16793
17923
|
(void *) node, color);
|
16794
17924
|
|
16795
17925
|
if (strlen(node->name) > 0) {
|
16796
|
-
fprintf(fp, "%s |", node->name);
|
17926
|
+
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
|
17927
|
+
} else {
|
17928
|
+
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
16797
17929
|
}
|
16798
17930
|
|
16799
17931
|
if (node->n_dims == 2) {
|
@@ -16802,7 +17934,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16802
17934
|
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
|
16803
17935
|
}
|
16804
17936
|
|
16805
|
-
|
16806
17937
|
if (node->grad) {
|
16807
17938
|
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
|
16808
17939
|
} else {
|
@@ -16821,18 +17952,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16821
17952
|
(void *) node, color);
|
16822
17953
|
|
16823
17954
|
if (strlen(node->name) > 0) {
|
16824
|
-
|
17955
|
+
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
|
17956
|
+
} else {
|
17957
|
+
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
16825
17958
|
}
|
16826
|
-
|
16827
|
-
|
16828
|
-
|
16829
|
-
|
16830
|
-
|
16831
|
-
|
17959
|
+
|
17960
|
+
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
17961
|
+
if (ggml_nelements(node) < 5) {
|
17962
|
+
fprintf(fp, " | (");
|
17963
|
+
for (int j = 0; j < ggml_nelements(node); j++) {
|
17964
|
+
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
17965
|
+
fprintf(fp, "%d", ggml_get_i32_1d(node, j));
|
17966
|
+
}
|
17967
|
+
else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
|
17968
|
+
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
|
17969
|
+
}
|
17970
|
+
else {
|
17971
|
+
fprintf(fp, "#");
|
17972
|
+
}
|
17973
|
+
if (j < ggml_nelements(node) - 1) {
|
17974
|
+
fprintf(fp, ", ");
|
17975
|
+
}
|
16832
17976
|
}
|
16833
|
-
|
16834
|
-
else {
|
16835
|
-
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
17977
|
+
fprintf(fp, ")");
|
16836
17978
|
}
|
16837
17979
|
fprintf(fp, "\"; ]\n");
|
16838
17980
|
}
|
@@ -16840,30 +17982,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16840
17982
|
for (int i = 0; i < gb->n_nodes; i++) {
|
16841
17983
|
struct ggml_tensor * node = gb->nodes[i];
|
16842
17984
|
|
16843
|
-
struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
|
16844
|
-
|
16845
17985
|
if (node->src0) {
|
16846
|
-
|
16847
|
-
|
16848
|
-
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
|
16849
|
-
parent0 ? (void *) parent0 : (void *) node->src0,
|
16850
|
-
parent0 ? "g" : "x",
|
16851
|
-
parent ? (void *) parent : (void *) node,
|
16852
|
-
parent ? "g" : "x",
|
16853
|
-
parent ? "empty" : "vee",
|
16854
|
-
parent ? "dashed" : "solid");
|
17986
|
+
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
|
16855
17987
|
}
|
16856
17988
|
|
16857
17989
|
if (node->src1) {
|
16858
|
-
|
16859
|
-
|
16860
|
-
|
16861
|
-
|
16862
|
-
|
16863
|
-
|
16864
|
-
|
16865
|
-
|
16866
|
-
|
17990
|
+
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
|
17991
|
+
}
|
17992
|
+
|
17993
|
+
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
17994
|
+
if (node->opt[j]) {
|
17995
|
+
char label[16];
|
17996
|
+
snprintf(label, sizeof(label), "opt %d", j);
|
17997
|
+
ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
|
17998
|
+
}
|
16867
17999
|
}
|
16868
18000
|
}
|
16869
18001
|
|
@@ -16871,15 +18003,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16871
18003
|
struct ggml_tensor * node = gb->leafs[i];
|
16872
18004
|
|
16873
18005
|
if (node->src0) {
|
16874
|
-
|
16875
|
-
(void *) node->src0, "x",
|
16876
|
-
(void *) node, "x");
|
18006
|
+
ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
|
16877
18007
|
}
|
16878
18008
|
|
16879
18009
|
if (node->src1) {
|
16880
|
-
|
16881
|
-
|
16882
|
-
|
18010
|
+
ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
|
18011
|
+
}
|
18012
|
+
|
18013
|
+
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
18014
|
+
if (node->opt[j]) {
|
18015
|
+
char label[16];
|
18016
|
+
snprintf(label, sizeof(label), "opt %d", j);
|
18017
|
+
ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
|
18018
|
+
}
|
16883
18019
|
}
|
16884
18020
|
}
|
16885
18021
|
|
@@ -17598,7 +18734,6 @@ GGML_API void ggml_opt_init(
|
|
17598
18734
|
ggml_set_zero(opt->lbfgs.g);
|
17599
18735
|
ggml_set_zero(opt->lbfgs.gp);
|
17600
18736
|
ggml_set_zero(opt->lbfgs.d);
|
17601
|
-
ggml_set_zero(opt->lbfgs.pf);
|
17602
18737
|
if (opt->lbfgs.pf) {
|
17603
18738
|
ggml_set_zero(opt->lbfgs.pf);
|
17604
18739
|
}
|