llama_cpp 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _GNU_SOURCE
+#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
 #include "ggml.h"
 
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <float.h>
 #include <limits.h>
+#include <stdarg.h>
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -35,6 +36,12 @@
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
 #if defined(_WIN32)
 
 #include <windows.h>
@@ -84,6 +91,11 @@ static int sched_yield (void) {
 #include <stdatomic.h>
 
 typedef void* thread_ret_t;
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -106,11 +118,36 @@ typedef void* thread_ret_t;
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
+#define GGML_GELU_QUICK_FP16
 #define GGML_SILU_FP16
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2
 
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
@@ -123,6 +160,34 @@ typedef void* thread_ret_t;
 #define GGML_MEM_ALIGN 16
 #endif
 
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+//
+// end of logging block
+//
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@@ -136,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #endif
     if (result != 0) {
         // Handle allocation failure
+        const char *error_desc = "unknown allocation error";
+        switch (result) {
+            case EINVAL:
+                error_desc = "invalid alignment value";
+                break;
+            case ENOMEM:
+                error_desc = "insufficient memory";
+                break;
+        }
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
+            __func__, error_desc, size/(1024.0*1024.0));
        return NULL;
     }
     return aligned_memory;
@@ -334,6 +410,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 // precomputed gelu table for f16 (128 KB)
 static ggml_fp16_t table_gelu_f16[1 << 16];
 
+// precomputed quick gelu table for f16 (128 KB)
+static ggml_fp16_t table_gelu_quick_f16[1 << 16];
+
 // precomputed silu table for f16 (128 KB)
 static ggml_fp16_t table_silu_f16[1 << 16];
 
@@ -409,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
     }
 }
 
-
 //
 // timing
 //
@@ -472,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
 #define ggml_perf_cycles_per_ms() 0
 #endif
 
+
 //
 // cache line
 //
@@ -1671,14 +1750,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]);  \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]);  \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]);  \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f32(x[i], x[offset+i]);   \
     }                                          \
     res = GGML_F32x4_REDUCE_ONE(x[0]);         \
 }
@@ -1709,14 +1791,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F16x8_MUL vmulq_f16
 #define GGML_F16x8_REDUCE(res, x)              \
 {                                              \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
-        x[2*i] = vaddq_f16(x[2*i], x[2*i+1]);  \
+    int offset = GGML_F16_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f16(x[i], x[offset+i]);   \
     }                                          \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
-        x[4*i] = vaddq_f16(x[4*i], x[4*i+2]);  \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f16(x[i], x[offset+i]);   \
    }                                           \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
-        x[8*i] = vaddq_f16(x[8*i], x[8*i+4]);  \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vaddq_f16(x[i], x[offset+i]);   \
     }                                          \
     const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
     const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1783,14 +1868,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
 #define GGML_F32x8_MUL _mm256_mul_ps
 #define GGML_F32x8_REDUCE(res, x)                 \
 {                                                 \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {    \
-        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1;               \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);  \
     }                                             \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {    \
-        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);  \
     }                                             \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {    \
-        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);  \
     }                                             \
     const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                  _mm256_extractf128_ps(x[0], 1)); \
@@ -1880,14 +1968,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL vec_mul
 #define GGML_F32x4_REDUCE(res, x)             \
 {                                             \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {\
-        x[2*i] = vec_add(x[2*i], x[2*i+1]);   \
+    int offset = GGML_F32_ARR >> 1;           \
+    for (int i = 0; i < offset; ++i) {        \
+        x[i] = vec_add(x[i], x[offset+i]);    \
     }                                         \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {\
-        x[4*i] = vec_add(x[4*i], x[4*i+2]);   \
+    offset >>= 1;                             \
+    for (int i = 0; i < offset; ++i) {        \
+        x[i] = vec_add(x[i], x[offset+i]);    \
     }                                         \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {\
-        x[8*i] = vec_add(x[8*i], x[8*i+4]);   \
+    offset >>= 1;                             \
+    for (int i = 0; i < offset; ++i) {        \
+        x[i] = vec_add(x[i], x[offset+i]);    \
     }                                         \
     res = vec_extract(x[0], 0) +              \
           vec_extract(x[0], 1) +              \
@@ -1943,14 +2034,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F32x4_MUL wasm_f32x4_mul
 #define GGML_F32x4_REDUCE(res, x)                  \
 {                                                  \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
     res = wasm_f32x4_extract_lane(x[0], 0) +       \
           wasm_f32x4_extract_lane(x[0], 1) +       \
@@ -2005,14 +2099,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16x4_MUL wasm_f32x4_mul
 #define GGML_F16x4_REDUCE(res, x)                  \
 {                                                  \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) {     \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+    int offset = GGML_F16_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                               \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) {     \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) {     \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
     }                                              \
     res = wasm_f32x4_extract_lane(x[0], 0) +       \
           wasm_f32x4_extract_lane(x[0], 1) +       \
@@ -2054,14 +2151,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F32x4_MUL _mm_mul_ps
 #define GGML_F32x4_REDUCE(res, x)                 \
 {                                                 \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {    \
-        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]);    \
+    int offset = GGML_F32_ARR >> 1;               \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);     \
     }                                             \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {    \
-        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]);    \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);     \
     }                                             \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {    \
-        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]);    \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);     \
     }                                             \
     const __m128 t0 = _mm_hadd_ps(x[0], x[0]);    \
     res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));     \
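Note: all seven GGML_F*_REDUCE rewrites above replace fixed-stride pairwise sums with the same offset-halving tree reduction, so the macro only ever indexes inside the register array regardless of the unroll width. A scalar sketch of the pattern in plain C (names are illustrative, not part of the diff):

    // pairwise tree reduction over a power-of-two-sized array,
    // mirroring the loop structure of the rewritten macros
    static float reduce_sum(float x[], int n) {
        for (int offset = n >> 1; offset > 0; offset >>= 1) {
            for (int i = 0; i < offset; ++i) {
                x[i] += x[offset + i];
            }
        }
        return x[0];
    }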
@@ -3350,6 +3450,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 
 static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
 static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
 
 inline static float ggml_gelu_f32(float x) {
@@ -3380,6 +3481,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
 }
 #endif
 
+inline static float ggml_gelu_quick_f32(float x) {
+    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
+}
+
+//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+//    const uint16_t * i16 = (const uint16_t *) x;
+//    for (int i = 0; i < n; ++i) {
+//        y[i] = table_gelu_quick_f16[i16[i]];
+//    }
+//}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
+    }
+}
+#else
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]);
+    }
+}
+#endif
+
 // Sigmoid Linear Unit (SiLU) function
 inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
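Note: ggml_gelu_quick_f32() above is the sigmoid approximation of GELU, x * sigmoid(1.702 * x); GELU_QUICK_COEF is stored negated so that expf(GELU_QUICK_COEF*x) computes e^(-1.702x) directly. A minimal standalone sketch of the same formula (plain C, independent of ggml; the function name is illustrative):

    #include <math.h>

    // quick GELU as defined above: x * sigmoid(1.702 * x)
    static float quick_gelu_ref(float x) {
        return x * (1.0f / (1.0f + expf(-1.702f * x)));
    }

The GGML_GELU_QUICK_FP16 path instead indexes the 65536-entry table_gelu_quick_f16 with the half-precision bit pattern of x, trading a little accuracy for speed, exactly like the existing GELU/SILU tables.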
@@ -3469,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
     *s = 1.f/(*s);
 }
 
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
 //
 // data types
 //
@@ -3610,6 +3715,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "STEP",
     "RELU",
     "GELU",
+    "GELU_QUICK",
     "SILU",
     "SILU_BACK",
     "NORM",
@@ -3638,21 +3744,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "ROPE_BACK",
     "ALIBI",
     "CLAMP",
-    "CONV_1D_1S",
-    "CONV_1D_2S",
+    "CONV_1D_S1_PH",
+    "CONV_1D_S2_PH",
+    "CONV_2D_SK_P0",
 
     "FLASH_ATTN",
     "FLASH_FF",
     "FLASH_ATTN_BACK",
+    "WIN_PART",
+    "WIN_UNPART",
 
     "MAP_UNARY",
     "MAP_BINARY",
 
+    "MAP_CUSTOM1",
+    "MAP_CUSTOM2",
+    "MAP_CUSTOM3",
+
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3678,6 +3791,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "step(x)",
     "relu(x)",
     "gelu(x)",
+    "gelu_quick(x)",
     "silu(x)",
     "silu_back(x)",
     "norm(x)",
@@ -3706,21 +3820,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rope_back(x)",
     "alibi(x)",
     "clamp(x)",
-    "conv_1d_1s(x)",
-    "conv_1d_2s(x)",
+    "conv_1d_s1_ph(x)",
+    "conv_1d_s2_ph(x)",
+    "conv_2d_sk_p0(x)",
 
     "flash_attn(x)",
     "flash_ff(x)",
     "flash_attn_back(x)",
+    "win_part(x)",
+    "win_unpart(x)",
 
     "f(x)",
     "f(x,y)",
 
+    "custom(x)",
+    "custom(x,y)",
+    "custom(x,y,z)",
+
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3751,12 +3872,31 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
+//
+// NUMA support
+//
+
+#define GGML_NUMA_MAX_NODES 8
+#define GGML_NUMA_MAX_CPUS 512
+
+struct ggml_numa_node {
+    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
+    uint32_t n_cpus;
+};
+
+struct ggml_numa_nodes {
+    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
+    uint32_t n_nodes;
+    uint32_t total_cpus; // hardware threads on system
+};
+
 //
 // ggml state
 //
 
 struct ggml_state {
     struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
+    struct ggml_numa_nodes numa;
 };
 
 // global state
@@ -3781,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
+#ifdef __linux__
+    struct stat st;
+    char path[256];
+    int rv;
+
+    // enumerate nodes
+    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.n_nodes;
+    }
+
+    // enumerate CPUs
+    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
+        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
+        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+        if (stat(path, &st) != 0) { break; }
+        ++g_state.numa.total_cpus;
+    }
+
+    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+        g_state.numa.n_nodes = 0;
+        return;
+    }
+
+    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
+        struct ggml_numa_node * node = &g_state.numa.nodes[n];
+        GGML_PRINT_DEBUG("CPUs on node %u:", n);
+        node->n_cpus = 0;
+        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
+            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
+            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
+            if (stat(path, &st) == 0) {
+                node->cpus[node->n_cpus++] = c;
+                GGML_PRINT_DEBUG(" %u", c);
+            }
+        }
+        GGML_PRINT_DEBUG("\n");
+    }
+
+    if (ggml_is_numa()) {
+        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
+        if (fptr != NULL) {
+            char buf[42];
+            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
+                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
+            }
+            fclose(fptr);
+        }
+    }
+#else
+    // TODO
+#endif
+}
+
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_print_object(const struct ggml_object * obj) {
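Note: ggml_numa_init() detects the topology purely by stat()-ing sysfs paths, so there is no libnuma dependency, and ggml_is_numa() simply reports whether more than one node was found. A standalone sketch of the same probe (assumes a Linux sysfs layout; the function name and hard-coded limit are illustrative):

    #include <stdio.h>
    #include <sys/stat.h>

    // count /sys/devices/system/node/node0..N, the same way ggml_numa_init() does
    static unsigned count_numa_nodes(void) {
        char path[256];
        struct stat st;
        unsigned n = 0;
        while (n < 8) { // analogue of GGML_NUMA_MAX_NODES
            snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", n);
            if (stat(path, &st) != 0) break;
            ++n;
        }
        return n;
    }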
@@ -4011,7 +4220,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     // initialize time system (required on Windows)
     ggml_time_init();
 
-    // initialize GELU, SILU and EXP F32 tables
+    // initialize GELU, Quick GELU, SILU and EXP F32 tables
     {
         const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
@@ -4021,13 +4230,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             memcpy(&ii, &ui, sizeof(ii));
             const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
             table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+            table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
             table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
             table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
         }
 
         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
 
-        GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
     // initialize g_state
@@ -4036,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
         g_state = (struct ggml_state) {
             /*.contexts =*/ { { 0 } },
+            /*.numa =*/ {
+                .n_nodes = 0,
+                .total_cpus = 0,
+            },
         };
 
         for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -4148,14 +4362,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
 
-void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
     return ctx->mem_buffer;
 }
 
-size_t ggml_get_mem_size(struct ggml_context * ctx) {
+size_t ggml_get_mem_size(const struct ggml_context * ctx) {
     return ctx->mem_size;
 }
 
+size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
+    size_t max_size = 0;
+
+    struct ggml_object * obj = ctx->objects_begin;
+
+    while (obj != NULL) {
+        struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+
+        const size_t size = ggml_nbytes(tensor);
+
+        if (max_size < size) {
+            max_size = size;
+        }
+
+        obj = obj->next;
+    }
+
+    return max_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
@@ -4639,15 +4873,25 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
     return tensor->name;
 }
 
-void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
+struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
     strncpy(tensor->name, name, sizeof(tensor->name));
     tensor->name[sizeof(tensor->name) - 1] = '\0';
+    return tensor;
+}
+
+struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
+    va_end(args);
+    return tensor;
 }
 
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    ggml_format_name(result, "%s (view)", src->name);
 
     result->nb[0] = src->nb[0];
     result->nb[1] = src->nb[1];
@@ -5420,6 +5664,40 @@ struct ggml_tensor * ggml_gelu_inplace(
     return ggml_gelu_impl(ctx, a, true);
 }
 
+// ggml_gelu_quick
+
+struct ggml_tensor * ggml_gelu_quick_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_GELU_QUICK;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_gelu_quick(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_gelu_quick_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_gelu_quick_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_gelu_quick_impl(ctx, a, true);
+}
+
 // ggml_silu
 
 struct ggml_tensor * ggml_silu_impl(
@@ -5775,6 +6053,11 @@ struct ggml_tensor * ggml_cpy_impl(
 
     // make a view of the destination
     struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    if (strlen(b->name) > 0) {
+        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
+    } else {
+        ggml_format_name(result, "%s (copy)", a->name);
+    }
 
     result->op   = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5811,6 +6094,7 @@ struct ggml_tensor * ggml_cont_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_format_name(result, "%s (cont)", a->name);
 
     result->op   = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5854,6 +6138,7 @@ struct ggml_tensor * ggml_reshape(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5878,6 +6163,7 @@ struct ggml_tensor * ggml_reshape_1d(
 
     const int64_t ne[1] = { ne0 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5903,6 +6189,7 @@ struct ggml_tensor * ggml_reshape_2d(
 
     const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5929,6 +6216,7 @@ struct ggml_tensor * ggml_reshape_3d(
 
     const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5957,6 +6245,7 @@ struct ggml_tensor * ggml_reshape_4d(
 
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5981,10 +6270,12 @@ struct ggml_tensor * ggml_view_1d(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6017,10 +6308,12 @@ struct ggml_tensor * ggml_view_2d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6059,10 +6352,12 @@ struct ggml_tensor * ggml_view_3d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6103,10 +6398,12 @@ struct ggml_tensor * ggml_view_4d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6152,6 +6449,7 @@ struct ggml_tensor * ggml_permute(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (permuted)", a->name);
 
     int ne[GGML_MAX_DIMS];
     int nb[GGML_MAX_DIMS];
@@ -6211,6 +6509,7 @@ struct ggml_tensor * ggml_transpose(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (transposed)", a->name);
 
     result->ne[0] = a->ne[1];
     result->ne[1] = a->ne[0];
@@ -6479,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
         int n_past,
         int n_dims,
         int mode,
+        int n_ctx,
         bool inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6491,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_dims;
     ((int32_t *) b->data)[2] = mode;
+    ((int32_t *) b->data)[3] = n_ctx;
 
     ggml_scratch_load(ctx);
 
@@ -6512,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
         struct ggml_tensor * a,
         int n_past,
         int n_dims,
-        int mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
+        int mode,
+        int n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6521,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
         struct ggml_tensor * a,
         int n_past,
         int n_dims,
-        int mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
+        int mode,
+        int n_ctx) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
 }
 
 // ggml_rope_back
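Note: ggml_rope() and ggml_rope_inplace() now take the context length as a fourth integer, carried to the compute kernels in slot [3] of the parameter tensor; mode bit 4 (read as is_glm in the kernels further down) selects the GLM-style rotation that consumes it. A hedged sketch of an updated call site (tensor and variable names are illustrative, not from this diff):

    // 0.2.1-era ggml:  cur = ggml_rope_inplace(ctx0, cur, n_past, n_rot, 0);
    // 0.3.0-era ggml adds the trailing n_ctx argument:
    cur = ggml_rope_inplace(ctx0, cur, n_past, n_rot, 0, n_ctx);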
@@ -6619,7 +6922,7 @@ struct ggml_tensor * ggml_clamp(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
 
     ((float *) b->data)[0] = min;
     ((float *) b->data)[1] = max;
@@ -6634,9 +6937,9 @@ struct ggml_tensor * ggml_clamp(
     return result;
 }
 
-// ggml_conv_1d_1s
+// ggml_conv_1d_s1_ph
 
-struct ggml_tensor * ggml_conv_1d_1s(
+struct ggml_tensor * ggml_conv_1d_s1_ph(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
@@ -6653,7 +6956,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
     const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
-    result->op   = GGML_OP_CONV_1D_1S;
+    result->op   = GGML_OP_CONV_1D_S1_PH;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = b;
@@ -6661,9 +6964,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
     return result;
 }
 
-// ggml_conv_1d_2s
+// ggml_conv_1d_s2_ph
 
-struct ggml_tensor * ggml_conv_1d_2s(
+struct ggml_tensor * ggml_conv_1d_s2_ph(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
@@ -6680,7 +6983,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
     const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
-    result->op   = GGML_OP_CONV_1D_2S;
+    result->op   = GGML_OP_CONV_1D_S2_PH;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+
+    return result;
+}
+
+// ggml_conv_2d_sk_p0
+
+struct ggml_tensor * ggml_conv_2d_sk_p0(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
+    GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    result->op   = GGML_OP_CONV_2D_SK_P0;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = b;
@@ -6814,45 +7145,133 @@ struct ggml_tensor * ggml_flash_attn_back(
     return result;
 }
 
-// ggml_map_unary
+// ggml_win_part
 
-struct ggml_tensor * ggml_map_unary_impl_f32(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        const ggml_unary_op_f32_t fun,
-        bool inplace) {
+struct ggml_tensor * ggml_win_part(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int w) {
+    GGML_ASSERT(a->ne[3] == 1);
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
     bool is_node = false;
 
-    if (!inplace && a->grad) {
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
         is_node = true;
     }
 
-    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
-    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    // padding
+    const int px = (w - a->ne[1]%w)%w;
+    const int py = (w - a->ne[2]%w)%w;
 
-    result->op = GGML_OP_MAP_UNARY;
+    const int npx = (px + a->ne[1])/w;
+    const int npy = (py + a->ne[2])/w;
+    const int np  = npx*npy;
+
+    const int64_t ne[4] = { a->ne[0], w, w, np, };
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+
+    ((int32_t *) b->data)[0] = npx;
+    ((int32_t *) b->data)[1] = npy;
+    ((int32_t *) b->data)[2] = w;
+
+    ggml_scratch_load(ctx);
+
+    result->op   = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
-    result->opt[0] = addr_tensor;
+    result->src1 = NULL;
+    result->opt[0] = b;
 
     return result;
 }
 
-struct ggml_tensor * ggml_map_unary_f32(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        const ggml_unary_op_f32_t fun) {
-    return ggml_map_unary_impl_f32(ctx, a, fun, false);
-}
+// ggml_win_unpart
 
-struct ggml_tensor * ggml_map_unary_inplace_f32(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        const ggml_unary_op_f32_t fun) {
-    return ggml_map_unary_impl_f32(ctx, a, fun, true);
-}
+struct ggml_tensor * ggml_win_unpart(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int w0,
+        int h0,
+        int w) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+
+    ((int32_t *) b->data)[0] = w;
+
+    ggml_scratch_load(ctx);
+
+    result->op   = GGML_OP_WIN_UNPART;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+    result->opt[0] = b;
+
+    return result;
+}
+
+// ggml_map_unary
+
+struct ggml_tensor * ggml_map_unary_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_UNARY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_unary_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, true);
+}
 
 // ggml_map_binary
 
@@ -6870,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_BINARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6899,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
+// ggml_map_custom1
+
+struct ggml_tensor * ggml_map_custom1_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_tensor * ggml_map_custom2_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_tensor * ggml_map_custom3_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+    result->opt[1] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
+}
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
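Note: the new ggml_map_custom1/2/3 ops let a caller splice an arbitrary f32 kernel into the graph; the function pointer travels to the compute pass through the I32 tensor stored in opt[0]. A hedged usage sketch (the callback name and contents are illustrative; it assumes the ggml_custom1_op_f32_t signature of (dst, src) declared in ggml.h):

    // hypothetical element-wise callback: dst = 2*src over contiguous f32 data
    static void scale_by_two(struct ggml_tensor * dst, const struct ggml_tensor * src) {
        const float * s = (const float *) src->data;
        float       * d = (float *)       dst->data;
        for (int64_t i = 0; i < ggml_nelements(dst); ++i) {
            d[i] = 2.0f*s[i];
        }
    }

    // during graph construction:
    struct ggml_tensor * y = ggml_map_custom1_f32(ctx, x, scale_by_two);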
@@ -7892,7 +8460,7 @@ static void ggml_compute_forward_add_q_f32(
 
         void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
         float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
-        void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
+        void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
 
         assert(ne00 % 32 == 0);
 
@@ -9453,8 +10021,65 @@ static void ggml_compute_forward_gelu(
             GGML_ASSERT(false);
         } break;
     }
+}
+
+// ggml_compute_forward_gelu_quick
+
+static void ggml_compute_forward_gelu_quick_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_quick_f32(nc,
+                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
 
-
+static void ggml_compute_forward_gelu_quick(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_quick_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
 }
 
 // ggml_compute_forward_silu
@@ -10852,7 +11477,7 @@ static void ggml_compute_forward_set_f32(
     const int im2 = (ne12 == 0 ? 0 : ne12-1);
     const int im3 = (ne13 == 0 ? 0 : ne13-1);
 
-    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst));
+    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
 
     GGML_ASSERT(nb10 == sizeof(float));
 
@@ -11573,8 +12198,9 @@ static void ggml_compute_forward_alibi_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_nelements(src1) == 3);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -11637,8 +12263,9 @@ static void ggml_compute_forward_alibi_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_nelements(src1) == 3);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -11740,15 +12367,16 @@ static void ggml_compute_forward_clamp_f32(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     assert(params->ith == 0);
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 2);
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(src1) == 2);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int min = ((float *) src1->data)[0];
-    const int max = ((float *) src1->data)[1];
+    const float min = ((float *) src1->data)[0];
+    const float max = ((float *) src1->data)[1];
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -11816,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11816
12444
|
const struct ggml_tensor * src1,
|
11817
12445
|
struct ggml_tensor * dst) {
|
11818
12446
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
11819
|
-
GGML_ASSERT(ggml_nelements(src1) ==
|
12447
|
+
GGML_ASSERT(ggml_nelements(src1) == 4);
|
11820
12448
|
|
11821
12449
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11822
12450
|
return;
|
@@ -11825,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11825
12453
|
const int n_past = ((int32_t *) src1->data)[0];
|
11826
12454
|
const int n_dims = ((int32_t *) src1->data)[1];
|
11827
12455
|
const int mode = ((int32_t *) src1->data)[2];
|
12456
|
+
const int n_ctx = ((int32_t *) src1->data)[3];
|
11828
12457
|
|
11829
12458
|
assert(n_past >= 0);
|
11830
12459
|
|
@@ -11869,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
|
|
11869
12498
|
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
11870
12499
|
|
11871
12500
|
const bool is_neox = mode & 2;
|
12501
|
+
const bool is_glm = mode & 4;
|
11872
12502
|
|
11873
12503
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
11874
12504
|
for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
|
@@ -11879,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
|
|
11879
12509
|
|
11880
12510
|
float theta = (float)p;
|
11881
12511
|
|
11882
|
-
if (
|
12512
|
+
if (is_glm) {
|
12513
|
+
theta = MIN(p, n_ctx - 2);
|
12514
|
+
float block_theta = MAX(p - (n_ctx - 2), 0);
|
12515
|
+
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
12516
|
+
const float cos_theta = cosf(theta);
|
12517
|
+
const float sin_theta = sinf(theta);
|
12518
|
+
const float cos_block_theta = cosf(block_theta);
|
12519
|
+
const float sin_block_theta = sinf(block_theta);
|
12520
|
+
|
12521
|
+
theta *= theta_scale;
|
12522
|
+
block_theta *= theta_scale;
|
12523
|
+
|
12524
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
12525
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
12526
|
+
|
12527
|
+
const float x0 = src[0];
|
12528
|
+
const float x1 = src[n_dims/2];
|
12529
|
+
const float x2 = src[n_dims];
|
12530
|
+
const float x3 = src[n_dims/2*3];
|
12531
|
+
|
12532
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
12533
|
+
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
12534
|
+
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
|
12535
|
+
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
|
12536
|
+
}
|
12537
|
+
} else if (!is_neox) {
|
11883
12538
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
11884
12539
|
const float cos_theta = cosf(theta);
|
11885
12540
|
const float sin_theta = sinf(theta);
|
@@ -11929,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) ==
+    GGML_ASSERT(ggml_nelements(src1) == 4);

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -11938,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
     const int n_past = ((int32_t *) src1->data)[0];
     const int n_dims = ((int32_t *) src1->data)[1];
     const int mode = ((int32_t *) src1->data)[2];
+    const int n_ctx = ((int32_t *) src1->data)[3];

     assert(n_past >= 0);

@@ -11982,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
     const float theta_scale = powf(10000.0, -2.0f/n_dims);

     const bool is_neox = mode & 2;
+    const bool is_glm = mode & 4;

     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -11992,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(

             float theta = (float)p;

-            if (
+            if (is_glm) {
+                theta = MIN(p, n_ctx - 2);
+                float block_theta = MAX(p - (n_ctx - 2), 0);
+                for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
+                    const float cos_theta = cosf(theta);
+                    const float sin_theta = sinf(theta);
+                    const float cos_block_theta = cosf(block_theta);
+                    const float sin_block_theta = sinf(block_theta);
+
+                    theta *= theta_scale;
+                    block_theta *= theta_scale;
+
+                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                    ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                    const float x0 = GGML_FP16_TO_FP32(src[0]);
+                    const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
+                    const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
+                    const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
+
+                    dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                    dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
+                    dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
+                }
+            } if (!is_neox) {
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
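Both rope kernels now read four int32 parameters from src1 and treat mode as a bit field: bit 0 offsets the row loop by n_past, mode & 2 selects the neox layout, and the new mode & 4 selects the GLM branch above. A hedged caller-side sketch, based on the six-argument ggml_rope call that appears in the backward pass later in this diff; the wrapper below is not package code:

#include "ggml.h"

// Hedged sketch: request GLM-style RoPE on tensor `a`.
static struct ggml_tensor * rope_glm(struct ggml_context * ctx, struct ggml_tensor * a,
                                     int n_past, int n_dims, int n_ctx) {
    const int mode = 4; // mode & 4 selects the is_glm branch added above
    return ggml_rope(ctx, a, n_past, n_dims, mode, n_ctx);
}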
@@ -12306,9 +12988,9 @@ static void ggml_compute_forward_rope_back(
     }
 }

-//
+// ggml_compute_forward_conv_1d_s1_ph

-static void
+static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12428,7 +13110,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
     }
 }

-static void
+static void ggml_compute_forward_conv_1d_s1_ph_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12548,7 +13230,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
     }
 }

-static void
+static void ggml_compute_forward_conv_1d_s1_ph(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12556,11 +13238,11 @@ static void ggml_compute_forward_conv_1d_1s(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-
+                ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
             } break;
         case GGML_TYPE_F32:
             {
-
+                ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -12569,9 +13251,9 @@ static void ggml_compute_forward_conv_1d_1s(
     }
 }

-//
+// ggml_compute_forward_conv_1d_s2_ph

-static void
+static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12691,7 +13373,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
     }
 }

-static void
+static void ggml_compute_forward_conv_1d_s2_ph_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12811,7 +13493,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
     }
 }

-static void
+static void ggml_compute_forward_conv_1d_s2_ph(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
@@ -12819,11 +13501,11 @@ static void ggml_compute_forward_conv_1d_2s(
     switch (src0->type) {
         case GGML_TYPE_F16:
             {
-
+                ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
             } break;
         case GGML_TYPE_F32:
             {
-
+                ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
             } break;
         default:
             {
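The renames give the 1-D convolution kernels self-describing suffixes: s1/s2 appear to encode the stride and ph half padding (same output length as input). A hedged sketch of the matching public entry points, assuming the header mirrors these internal renames and keeps the (kernel, data) argument order used by the kernels above:

#include "ggml.h"

// Hedged sketch: public names assumed to follow the internal renames.
static void conv_examples(struct ggml_context * ctx, struct ggml_tensor * k, struct ggml_tensor * x) {
    struct ggml_tensor * y1 = ggml_conv_1d_s1_ph(ctx, k, x); // stride 1, half padding
    struct ggml_tensor * y2 = ggml_conv_1d_s2_ph(ctx, k, x); // stride 2, half padding
    (void) y1; (void) y2;
}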
@@ -12832,18 +13514,154 @@ static void ggml_compute_forward_conv_1d_2s(
     }
 }

-//
+// ggml_compute_forward_conv_2d_sk_p0

-static void
+static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
         const struct ggml_compute_params * params,
-        const struct ggml_tensor *
-        const struct ggml_tensor *
-
-
-
-
-
-
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    //const int ne03 = src0->ne[3];
+
+    const int ne10 = src1->ne[0];
+    //const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    //const int ne13 = src1->ne[3];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    //const int ne3 = dst->ne[3];
+    //const int ne = ne0*ne1*ne2*ne3;
+
+    const int nb00 = src0->nb[0];
+    //const int nb01 = src0->nb[1];
+    //const int nb02 = src0->nb[2];
+    const int nb03 = src0->nb[3];
+
+    const int nb10 = src1->nb[0];
+    //const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    //const int nb13 = src1->nb[3];
+
+    //const int nb0 = dst->nb[0];
+    //const int nb1 = dst->nb[1];
+    const int nb2 = dst->nb[2];
+    //const int nb3 = dst->nb[3];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nk0 = ne00;
+    const int nk1 = ne01;
+
+    // size of the convolution row - the kernel size unrolled across all channels
+    const int ew0 = nk0*nk1*ne02;
+
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    if (params->type == GGML_TASK_INIT) {
+        // TODO: fix this memset (wsize is overestimated)
+        memset(params->wdata, 0, params->wsize);
+
+        // prepare source data (src1)
+        {
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+            for (int i12 = 0; i12 < ne12; i12++) {
+                const float * const src = (float *)((char *) src1->data + i12*nb12);
+                ggml_fp16_t * dst_data = wdata;
+
+                for (int i1 = 0; i1 < ne1; i1++) {
+                    for (int i0 = 0; i0 < ne0; i0++) {
+                        for (int ik1 = 0; ik1 < nk1; ik1++) {
+                            for (int ik0 = 0; ik0 < nk0; ik0++) {
+                                dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+                                    GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // total patches in dst
+    const int np = ne2;
+
+    // patches per thread
+    const int dp = (np + nth - 1)/nth;
+
+    // patch range for this thread
+    const int ip0 = dp*ith;
+    const int ip1 = MIN(ip0 + dp, np);
+
+    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+    for (int i2 = ip0; i2 < ip1; i2++) {
+        float * dst_data = (float *)((char *) dst->data + i2*nb2);
+
+        for (int i1 = 0; i1 < ne1; ++i1) {
+            for (int i0 = 0; i0 < ne0; ++i0) {
+                ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+                        (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+                        (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_conv_2d_sk_p0(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
+                GGML_ASSERT(false);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_flash_attn
+
+static void ggml_compute_forward_flash_attn_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const bool masked,
+        struct ggml_tensor * dst) {
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
     const int64_t neq0 = q->ne[0];
     const int64_t neq1 = q->ne[1];
     const int64_t neq2 = q->ne[2];
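In the new 2-D kernel, "sk" reads as stride equal to the kernel size and "p0" as zero padding, so each output element is a single dot product over one kernel-sized patch unrolled across all input channels (ew0 = nk0*nk1*ne02). A self-contained sketch of that bookkeeping; the 224x224x3 input and 16x16 kernel are illustrative assumptions, not values from the diff:

#include <stdio.h>

int main(void) {
    const int ne10 = 224, ne11 = 224, ne12 = 3; // src1: W, H, C (assumed)
    const int nk0  = 16,  nk1  = 16;            // kernel W, H (assumed)
    const int ne0  = ne10/nk0;                  // output W: 14 patches
    const int ne1  = ne11/nk1;                  // output H: 14 patches
    const int ew0  = nk0*nk1*ne12;              // unrolled row: 768 values
    printf("out %dx%d, row length %d\n", ne0, ne1, ew0);
    return 0;
}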
@@ -13926,6 +14744,145 @@ static void ggml_compute_forward_flash_attn_back(
     }
 }

+// ggml_compute_forward_win_part
+
+static void ggml_compute_forward_win_part_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
+
+    const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
+    const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
+    const int32_t w = ((const int32_t *)(opt0->data))[2];
+
+    assert(ne00 == ne0);
+    assert(ne3 == nep0*nep1);
+
+    // TODO: optimize / multi-thread
+    for (int py = 0; py < nep1; ++py) {
+        for (int px = 0; px < nep0; ++px) {
+            const int64_t i3 = py*nep0 + px;
+            for (int64_t i2 = 0; i2 < ne2; ++i2) {
+                for (int64_t i1 = 0; i1 < ne1; ++i1) {
+                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                        const int64_t i02 = py*w + i2;
+                        const int64_t i01 = px*w + i1;
+                        const int64_t i00 = i0;
+
+                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
+                        const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
+
+                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
+                            ((float *) dst->data)[i] = 0.0f;
+                        } else {
+                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_part(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_win_unpart
+
+static void ggml_compute_forward_win_unpart_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    //const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+
+    const int32_t w = ((const int32_t *)(opt0->data))[0];
+
+    // padding
+    const int px = (w - ne1%w)%w;
+    //const int py = (w - ne2%w)%w;
+
+    const int npx = (px + ne1)/w;
+    //const int npy = (py + ne2)/w;
+
+    assert(ne0 == ne00);
+
+    // TODO: optimize / multi-thread
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = 0; i1 < ne1; ++i1) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int ip2 = i2/w;
+                const int ip1 = i1/w;
+
+                const int64_t i02 = i2%w;
+                const int64_t i01 = i1%w;
+                const int64_t i00 = i0;
+
+                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
+                const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
+
+                ((float *) dst->data)[j] = ((float *) src0->data)[i];
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_win_unpart(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * opt0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_map_unary

 static void ggml_compute_forward_map_unary_f32(
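The padding arithmetic in win_part rounds each spatial axis up to a multiple of the window size w and zero-fills positions past the input; win_unpart inverts the indexing. A tiny self-contained check of that arithmetic (the concrete numbers are illustrative, not from the diff):

#include <stdio.h>

int main(void) {
    const int ne1 = 13, w = 7;         // assumed axis length and window size
    const int px  = (w - ne1 % w) % w; // padding on this axis -> 1
    const int npx = (px + ne1) / w;    // windows along this axis -> 2
    printf("pad %d, windows %d\n", px, npx);
    return 0;
}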
@@ -14019,6 +14976,114 @@ static void ggml_compute_forward_map_binary(
     }
 }

+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a);
+}
+
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b);
+}
+
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b, c);
+}
+
+
+static void ggml_compute_forward_map_custom3(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_cross_entropy_loss

 static void ggml_compute_forward_cross_entropy_loss_f32(
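The map_custom ops hand the whole destination and input tensors to a user callback on a single thread (note the assert on params->ith). A hedged usage sketch: the ggml_custom1_op_f32_t type and the fun(dst, a) calling convention are confirmed by this diff, while the ggml_map_custom1 wrapper name is assumed from the op name:

#include "ggml.h"

// Callback matching ggml_custom1_op_f32_t: fills dst elementwise from a.
static void double_op(struct ggml_tensor * dst, const struct ggml_tensor * a) {
    const float * x = (const float *) a->data;
    float       * y = (float *) dst->data;
    for (int64_t i = 0; i < ggml_nelements(dst); ++i) {
        y[i] = 2.0f*x[i];
    }
}

// During graph construction (wrapper name assumed, not shown in this diff):
//     struct ggml_tensor * out = ggml_map_custom1(ctx, inp, double_op);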
@@ -14309,7 +15374,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     if (skip_cpu) {
         return;
     }
-    GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS

@@ -14398,6 +15463,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_gelu(params, tensor->src0, tensor);
             } break;
+        case GGML_OP_GELU_QUICK:
+            {
+                ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
+            } break;
         case GGML_OP_SILU:
             {
                 ggml_compute_forward_silu(params, tensor->src0, tensor);
@@ -14502,19 +15571,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
             } break;
-        case
+        case GGML_OP_CONV_1D_S1_PH:
+            {
+                ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
+            } break;
+        case GGML_OP_CONV_1D_S2_PH:
             {
-
+                ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
             } break;
-        case
+        case GGML_OP_CONV_2D_SK_P0:
             {
-
+                ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
             } break;
         case GGML_OP_FLASH_ATTN:
             {
-                int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
+                const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
                 GGML_ASSERT(t == 0 || t == 1);
-                bool masked = t != 0;
+                const bool masked = t != 0;
                 ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
             } break;
         case GGML_OP_FLASH_FF:
@@ -14528,6 +15601,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 bool masked = t != 0;
                 ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
             } break;
+        case GGML_OP_WIN_PART:
+            {
+                ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
+            } break;
+        case GGML_OP_WIN_UNPART:
+            {
+                ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
+            } break;
         case GGML_OP_MAP_UNARY:
             {
                 const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
@@ -14540,6 +15621,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
             }
             break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
+            }
+            break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
                 ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -14799,6 +15898,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_GELU_QUICK:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_ALIBI:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -15144,28 +16247,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 if (src0->grad) {
                     assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) ==
+                    assert(ggml_nelements(src1) == 4);
                     const int n_past = ((int32_t *) src1->data)[0];
                     const int n_dims = ((int32_t *) src1->data)[1];
                     const int mode = ((int32_t *) src1->data)[2];
+                    const int n_ctx = ((int32_t *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
                             ggml_rope(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
-                                mode
+                                mode,
+                                n_ctx),
                             inplace);
                 }
                 if (src1->grad) {
                     // noop
                 }
             } break;
-        case
+        case GGML_OP_CONV_1D_S1_PH:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case
+        case GGML_OP_CONV_1D_S2_PH:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
+        case GGML_OP_CONV_2D_SK_P0:
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
@@ -15334,8 +16443,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // not supported
             } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM3:
             {
                 GGML_ASSERT(false); // not supported
             } break;
@@ -15407,7 +16521,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
     GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);

     if (strlen(node->name) == 0) {
-
+        ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
     }

     cgraph->leafs[cgraph->n_leafs] = node;
@@ -15416,7 +16530,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
     GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);

     if (strlen(node->name) == 0) {
-
+        ggml_format_name(node, "node_%d", cgraph->n_nodes);
     }

     cgraph->nodes[cgraph->n_nodes] = node;
@@ -15570,68 +16684,173 @@ typedef pthread_t ggml_thread_t;

 #endif

+// Android's libc implementation "bionic" does not support setting affinity
+#if defined(__linux__) && !defined(__BIONIC__)
+void set_numa_thread_affinity(int thread_n, int n_threads) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    // run thread on node_num thread_n / (threads per node)
+    const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
+    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (size_t i = 0; i < node->n_cpus; ++i) {
+        CPU_SET_S(node->cpus[i], setsize, cpus);
+    }
+
+    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
+                strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+
+void clear_numa_thread_affinity(void) {
+    if (!ggml_is_numa()) {
+        return;
+    }
+
+    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
+
+    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
+    CPU_ZERO_S(setsize, cpus);
+    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
+        CPU_SET_S(i, setsize, cpus);
+    }
+
+    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    if (rv) {
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
+                strerror(rv));
+    }
+
+    CPU_FREE(cpus);
+}
+#else
+// TODO: Windows etc.
+// (the linux implementation may also work on BSD, someone should test)
+void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+void clear_numa_thread_affinity(void) {}
+#endif
+
 struct ggml_compute_state_shared {
-
+    struct ggml_cgraph * cgraph;
+
+    int64_t perf_node_start_cycles;
+    int64_t perf_node_start_time_us;

     int n_threads;

     // synchronization primitives
-    atomic_int
-
-    atomic_bool stop; // stop all threads
+    atomic_int n_active; // num active threads
+    atomic_int node_n;   // active graph node
 };

 struct ggml_compute_state {
     ggml_thread_t thrd;
-
-    struct ggml_compute_params params;
-    struct ggml_tensor * node;
-
+    int ith;
     struct ggml_compute_state_shared * shared;
 };

+static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
+    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
+    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
+
+    node->perf_runs++;
+    node->perf_cycles  += cycles_cur;
+    node->perf_time_us += time_us_cur;
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_cgraph * cgraph = state->shared->cgraph;

     const int n_threads = state->shared->n_threads;
+    set_numa_thread_affinity(state->ith, n_threads);
+
+    int node_n = -1;

     while (true) {
-        if (
-
-
-
-
-
-
-
-
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            // all other threads are finished and spinning
+            // do finalize and init here so we don't have synchronize again
+            struct ggml_compute_params params = {
+                /*.type  =*/ GGML_TASK_FINALIZE,
+                /*.ith   =*/ 0,
+                /*.nth   =*/ 0,
+                /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+                /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            };
+
+            if (node_n != -1) {
+                /* FINALIZE */
+                struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
+                params.nth = node->n_tasks;
+                ggml_compute_forward(&params, node);
+                ggml_graph_compute_perf_stats_node(node, state->shared);
             }
-        }

-
+            // distribute new work or execute it direct if 1T
+            while (++node_n < cgraph->n_nodes) {
+                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
+
+                struct ggml_tensor * node = cgraph->nodes[node_n];
+
+                state->shared->perf_node_start_cycles  = ggml_perf_cycles();
+                state->shared->perf_node_start_time_us = ggml_perf_time_us();
+
+                /* INIT */
+                params.type = GGML_TASK_INIT;
+                params.nth  = node->n_tasks;
+                ggml_compute_forward(&params, node);
+
+                if (node->n_tasks == 1) {
+                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
+                    // they do something more efficient than spinning (?)
+                    params.type = GGML_TASK_COMPUTE;
+                    ggml_compute_forward(&params, node);

-
-
-
-
+                    params.type = GGML_TASK_FINALIZE;
+                    ggml_compute_forward(&params, node);
+                    ggml_graph_compute_perf_stats_node(node, state->shared);
+                } else {
+                    break;
+                }
             }
-
-
+
+            atomic_store(&state->shared->n_active, n_threads);
+            atomic_store(&state->shared->node_n,   node_n);
+        } else {
+            // wait for other threads to finish
+            const int last = node_n;
+            do {
+                sched_yield();
+                node_n = atomic_load(&state->shared->node_n);
+            } while (node_n == last);
         }

         // check if we should stop
-        if (
-            break;
-        }
+        if (node_n >= cgraph->n_nodes) break;

-
-
-        ggml_compute_forward(&state->params, state->node);
-    }
+        /* COMPUTE */
+        struct ggml_tensor * node = cgraph->nodes[node_n];

-
-
-
+        struct ggml_compute_params params = {
+            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.ith   =*/ state->ith,
+            /*.nth   =*/ node->n_tasks,
+            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+        };
+
+        if (state->ith < node->n_tasks) {
+            ggml_compute_forward(&params, node);
         }
     }

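The new scheduler replaces the old has_work/n_ready handshake with two atomics: n_active counts threads still busy on the current node, and node_n publishes the node being worked on. The last thread to decrement n_active becomes the coordinator: it finalizes the finished node, inits the next one (running single-task nodes inline), then releases the others. A self-contained toy of just that handoff, not package code (compile with cc -std=c11 -pthread):

#include <stdatomic.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define N_THREADS 4
#define N_NODES   3

static atomic_int n_active = N_THREADS; // threads still working this round
static atomic_int node_n   = -1;        // currently published node

static void * worker(void * arg) {
    const int ith = (int)(long) arg;
    int local = -1;
    while (1) {
        if (atomic_fetch_sub(&n_active, 1) == 1) {
            // last thread to finish: publish the next node for everyone
            int next = local + 1;
            atomic_store(&n_active, N_THREADS); // reset before releasing
            atomic_store(&node_n, next);
            local = next;
        } else {
            // spin until the coordinator moves node_n forward
            while (atomic_load(&node_n) == local) sched_yield();
            local = atomic_load(&node_n);
        }
        if (local >= N_NODES) break;
        printf("thread %d computes node %d\n", ith, local);
    }
    return NULL;
}

int main(void) {
    pthread_t t[N_THREADS];
    for (long j = 0; j < N_THREADS; ++j) pthread_create(&t[j], NULL, worker, (void *) j);
    for (int j = 0; j < N_THREADS; ++j) pthread_join(t[j], NULL);
    return 0;
}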
@@ -15642,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     const int n_threads = cgraph->n_threads;

     struct ggml_compute_state_shared state_shared = {
-        /*.
-        /*.
-        /*.
-        /*.
-        /*.
+        /*.cgraph                  =*/ cgraph,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
     };
-    struct ggml_compute_state * workers =
-
-    // create thread pool
-    if (n_threads > 1) {
-        ggml_lock_init(&state_shared.spin);
-
-        atomic_store(&state_shared.has_work, true);
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd = 0,
-                .params = {
-                    .type = GGML_TASK_COMPUTE,
-                    .ith = j + 1,
-                    .nth = n_threads,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                },
-                .node = NULL,
-                .shared = &state_shared,
-            };
-
-            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);

     // initialize tasks + work buffer
     {
@@ -15742,6 +16936,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             } break;
         case GGML_OP_MUL:
         case GGML_OP_GELU:
+        case GGML_OP_GELU_QUICK:
         case GGML_OP_SILU:
         case GGML_OP_SILU_BACK:
         case GGML_OP_NORM:
@@ -15817,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             } break;
         case GGML_OP_SCALE:
             {
-                node->n_tasks =
+                node->n_tasks = 1;
             } break;
         case GGML_OP_SET:
         case GGML_OP_CONT:
@@ -15848,8 +17043,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             {
                 node->n_tasks = 1; //TODO
             } break;
-        case
-        case
+        case GGML_OP_CONV_1D_S1_PH:
+        case GGML_OP_CONV_1D_S2_PH:
             {
                 node->n_tasks = n_threads;

@@ -15876,6 +17071,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     GGML_ASSERT(false);
                 }

+                work_size = MAX(work_size, cur);
+            } break;
+        case GGML_OP_CONV_2D_SK_P0:
+            {
+                node->n_tasks = n_threads;
+
+                GGML_ASSERT(node->src1->ne[3] == 1);
+
+                const int64_t ne00 = node->src0->ne[0]; // W
+                const int64_t ne01 = node->src0->ne[1]; // H
+                const int64_t ne02 = node->src0->ne[2]; // C
+                const int64_t ne03 = node->src0->ne[3]; // N
+
+                const int64_t ne10 = node->src1->ne[0]; // W
+                const int64_t ne11 = node->src1->ne[1]; // H
+                const int64_t ne12 = node->src1->ne[2]; // C
+
+                const int64_t nk = ne00*ne01;
+
+                UNUSED(ne02);
+                UNUSED(ne03);
+                UNUSED(nk);
+
+                size_t cur = 0;
+
+                if (node->src0->type == GGML_TYPE_F16 &&
+                    node->src1->type == GGML_TYPE_F32) {
+                    cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+                } else if (node->src0->type == GGML_TYPE_F32 &&
+                           node->src1->type == GGML_TYPE_F32) {
+                    cur = sizeof(float)* (ne10*ne11*ne12);
+                } else {
+                    GGML_ASSERT(false);
+                }
+
                 work_size = MAX(work_size, cur);
             } break;
         case GGML_OP_FLASH_ATTN:
@@ -15937,8 +17167,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)

                 work_size = MAX(work_size, cur);
             } break;
+        case GGML_OP_WIN_PART:
+        case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM3:
             {
                 node->n_tasks = 1;
             } break;
@@ -15981,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     }
     }

-
-
-
-
-
-
-
-
-        // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
-        //if (node->grad == NULL && node->perf_runs > 0) {
-        //    continue;
-        //}
-
-        const int64_t perf_node_start_cycles  = ggml_perf_cycles();
-        const int64_t perf_node_start_time_us = ggml_perf_time_us();
-
-        // INIT
-        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_INIT,
-            /*.ith   =*/ 0,
-            /*.nth   =*/ node->n_tasks,
-            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
-        };
-
-        ggml_compute_forward(&params, node);
-
-        // COMPUTE
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            // launch thread pool
-            for (int j = 0; j < n_threads - 1; j++) {
-                workers[j].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_COMPUTE,
-                    .ith   = j + 1,
-                    .nth   = node->n_tasks,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                };
-                workers[j].node = node;
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
-        }
-
-        params.type = GGML_TASK_COMPUTE;
-        ggml_compute_forward(&params, node);
-
-        // wait for thread pool
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-        }
-
-        // FINALIZE
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            // launch thread pool
-            for (int j = 0; j < n_threads - 1; j++) {
-                workers[j].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_FINALIZE,
-                    .ith   = j + 1,
-                    .nth   = node->n_tasks,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                };
-                workers[j].node = node;
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            workers[j] = (struct ggml_compute_state) {
+                .thrd   = 0,
+                .ith    = j,
+                .shared = &state_shared,
+            };

-
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
         }
+    }
+    workers[0].ith = 0;
+    workers[0].shared = &state_shared;

-
-
-
-        // wait for thread pool
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
+    const int64_t perf_start_cycles  = ggml_perf_cycles();
+    const int64_t perf_start_time_us = ggml_perf_time_us();

-
-
-            ggml_lock_unlock(&state_shared.spin);
-        }
-    }
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);

-
-
-        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_node_start_cycles;
-        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
-
-        node->perf_runs++;
-        node->perf_cycles  += perf_cycles_cur;
-        node->perf_time_us += perf_time_us_cur;
-    }
-    }
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();

     // join thread pool
     if (n_threads > 1) {
-
-
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            int rc = ggml_thread_join(workers[j].thrd, NULL);
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
             GGML_ASSERT(rc == 0);
-            UNUSED(rc);
         }
-
-        ggml_lock_destroy(&state_shared.spin);
     }

     // performance stats (graph)
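On the caller side nothing changes: ggml_graph_compute keeps its signature, but the main thread now enlists itself as worker 0 via ggml_graph_compute_thread(&workers[0]) and only joins the n_threads-1 pool threads on exit. A hedged sketch of a typical call, using the ggml_build_forward/n_threads API visible elsewhere in this file:

#include "ggml.h"

// Hedged sketch, not package code: build and run a forward graph.
static void run_graph(struct ggml_context * ctx, struct ggml_tensor * result, int n_threads) {
    struct ggml_cgraph gf = ggml_build_forward(result);
    gf.n_threads = n_threads; // read at the top of ggml_graph_compute
    ggml_graph_compute(ctx, &gf);
}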
@@ -16469,16 +17575,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **

         if (!*ctx_data) {
             fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+            fclose(fin);
             return result;
         }
     }

     data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);

-
-
-
-
+    {
+        const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+        if (ret != fsize) {
+            fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+            fclose(fin);
+            return result;
+        }
     }

     fclose(fin);
@@ -16758,6 +17868,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
     return NULL;
 }

+static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
+    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
+            gparent0 ? (void *) gparent0 : (void *) parent,
+            gparent0 ? "g" : "x",
+            gparent ? (void *) gparent : (void *) node,
+            gparent ? "g" : "x",
+            gparent ? "empty" : "vee",
+            gparent ? "dashed" : "solid",
+            label);
+}
+
+static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
+            (void *) parent, "x",
+            (void *) node, "x",
+            label);
+}
+
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
     char color[16];

@@ -16793,7 +17923,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             (void *) node, color);

         if (strlen(node->name) > 0) {
-            fprintf(fp, "%s |", node->name);
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }

         if (node->n_dims == 2) {
@@ -16802,7 +17934,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
         }

-
         if (node->grad) {
             fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
         } else {
@@ -16821,18 +17952,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             (void *) node, color);

         if (strlen(node->name) > 0) {
-
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }
-
-
-
-
-
-
+
+        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+        if (ggml_nelements(node) < 5) {
+            fprintf(fp, " | (");
+            for (int j = 0; j < ggml_nelements(node); j++) {
+                if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
+                }
+                else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
+                    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
+                }
+                else {
+                    fprintf(fp, "#");
+                }
+                if (j < ggml_nelements(node) - 1) {
+                    fprintf(fp, ", ");
+                }
             }
-
-        else {
-            fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+            fprintf(fp, ")");
         }
         fprintf(fp, "\"; ]\n");
     }
@@ -16840,30 +17982,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];

-        struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
-
         if (node->src0) {
-
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
-                    parent0 ? (void *) parent0 : (void *) node->src0,
-                    parent0 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
         }

         if (node->src1) {
-
-
-
-
-
-
-
-
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
+            }
         }
     }

@@ -16871,15 +18003,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         struct ggml_tensor * node = gb->leafs[i];

         if (node->src0) {
-
-                    (void *) node->src0, "x",
-                    (void *) node, "x");
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
         }

         if (node->src1) {
-
-
-
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
+            }
         }
     }

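With the two edge helpers in place, the DOT output now labels every tensor with its type, draws the opt[] edges, and prints the values of small constant leaves inline. A hedged usage sketch; passing NULL for the forward graph and the graphviz invocation are assumptions, not shown in this diff:

#include "ggml.h"

// Dump a graph, then render offline: dot -Tpng graph.dot -o graph.png
static void debug_dump(const struct ggml_cgraph * gf) {
    ggml_graph_dump_dot(gf, NULL, "graph.dot"); // NULL: no separate forward graph (assumed valid)
}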
@@ -17598,7 +18734,6 @@ GGML_API void ggml_opt_init(
         ggml_set_zero(opt->lbfgs.g);
         ggml_set_zero(opt->lbfgs.gp);
         ggml_set_zero(opt->lbfgs.d);
-        ggml_set_zero(opt->lbfgs.pf);
         if (opt->lbfgs.pf) {
             ggml_set_zero(opt->lbfgs.pf);
         }