llama_cpp 0.15.1 → 0.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +3 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +15 -7
- data/vendor/tmp/llama.cpp/ggml-impl.h +7 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +114 -125
- data/vendor/tmp/llama.cpp/ggml-metal.metal +86 -109
- data/vendor/tmp/llama.cpp/ggml-quants.c +2202 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +24 -143
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +4 -2
- data/vendor/tmp/llama.cpp/ggml.c +726 -646
- data/vendor/tmp/llama.cpp/ggml.h +28 -17
- data/vendor/tmp/llama.cpp/llama.cpp +478 -281
- data/vendor/tmp/llama.cpp/llama.h +3 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +4 -2
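For reference, the ggml API change in this vendored update that is most visible to callers is the removal of the standalone ALiBi op: `ggml_alibi` is gone and `ggml_soft_max_ext` no longer takes a `pos` tensor, with the ALiBi slope now derived inside soft-max from the mask and `max_bias` (and `ggml_flash_attn_ext` gains a `max_bias` argument). A minimal sketch of the new call shape follows; the wrapper and the `kq`/`kq_mask` identifiers are illustrative placeholders, not code from this package:

```c
#include "ggml.h"

// Hypothetical helper: soft-max with optional ALiBi, as exposed by the updated vendored ggml.
static struct ggml_tensor * attn_soft_max(struct ggml_context * ctx,
                                          struct ggml_tensor  * kq,
                                          struct ggml_tensor  * kq_mask,
                                          float scale, float max_bias) {
    // 0.15.1 vendored ggml: ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, scale, max_bias);
    // 0.15.2 vendored ggml: the pos tensor is dropped; max_bias > 0.0f requires a mask.
    return ggml_soft_max_ext(ctx, kq, kq_mask, scale, max_bias);
}
```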
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -4,7 +4,6 @@
|
|
4
4
|
#include "ggml-impl.h"
|
5
5
|
#include "ggml-quants.h"
|
6
6
|
#include "ggml.h"
|
7
|
-
#include "sgemm.h"
|
8
7
|
|
9
8
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
10
9
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
@@ -37,6 +36,10 @@
|
|
37
36
|
#undef GGML_USE_LLAMAFILE
|
38
37
|
#endif
|
39
38
|
|
39
|
+
#ifdef GGML_USE_LLAMAFILE
|
40
|
+
#include "sgemm.h"
|
41
|
+
#endif
|
42
|
+
|
40
43
|
#if defined(_MSC_VER)
|
41
44
|
// disable "possible loss of data" to avoid hundreds of casts
|
42
45
|
// we should just be careful :)
|
@@ -109,6 +112,8 @@ typedef void * thread_ret_t;
|
|
109
112
|
|
110
113
|
#endif
|
111
114
|
|
115
|
+
typedef pthread_t ggml_thread_t;
|
116
|
+
|
112
117
|
#ifdef GGML_USE_CPU_HBM
|
113
118
|
#include <hbwmalloc.h>
|
114
119
|
#endif
|
@@ -160,9 +165,6 @@ void ggml_print_backtrace(void) {
|
|
160
165
|
#define GGML_DEBUG 0
|
161
166
|
#define GGML_GELU_FP16
|
162
167
|
#define GGML_GELU_QUICK_FP16
|
163
|
-
#define GGML_SILU_FP16
|
164
|
-
// #define GGML_CROSS_ENTROPY_EXP_FP16
|
165
|
-
// #define GGML_FLASH_ATTN_EXP_FP16
|
166
168
|
|
167
169
|
#define GGML_SOFT_MAX_UNROLL 4
|
168
170
|
#define GGML_VEC_DOT_UNROLL 2
|
@@ -313,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
|
313
315
|
// precomputed quick gelu table for f16 (128 KB)
|
314
316
|
static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
315
317
|
|
316
|
-
// precomputed silu table for f16 (128 KB)
|
317
|
-
static ggml_fp16_t ggml_table_silu_f16[1 << 16];
|
318
|
-
|
319
|
-
// precomputed exp table for f16 (128 KB)
|
320
|
-
static ggml_fp16_t ggml_table_exp_f16[1 << 16];
|
321
|
-
|
322
318
|
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
323
319
|
float ggml_table_f32_f16[1 << 16];
|
324
320
|
|
@@ -1303,6 +1299,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
1303
1299
|
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
|
1304
1300
|
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
|
1305
1301
|
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
|
1302
|
+
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
|
1303
|
+
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
|
1306
1304
|
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
|
1307
1305
|
// Use vec_xl, not vec_ld, in case the load address is not aligned.
|
1308
1306
|
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
|
@@ -1534,6 +1532,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
|
1534
1532
|
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
1535
1533
|
#endif
|
1536
1534
|
|
1535
|
+
//
|
1536
|
+
// ggml context
|
1537
|
+
//
|
1538
|
+
|
1539
|
+
struct ggml_context {
|
1540
|
+
size_t mem_size;
|
1541
|
+
void* mem_buffer;
|
1542
|
+
bool mem_buffer_owned;
|
1543
|
+
bool no_alloc;
|
1544
|
+
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
1545
|
+
|
1546
|
+
int n_objects;
|
1547
|
+
|
1548
|
+
struct ggml_object* objects_begin;
|
1549
|
+
struct ggml_object* objects_end;
|
1550
|
+
|
1551
|
+
struct ggml_scratch scratch;
|
1552
|
+
struct ggml_scratch scratch_save;
|
1553
|
+
};
|
1554
|
+
|
1555
|
+
struct ggml_context_container {
|
1556
|
+
bool used;
|
1557
|
+
|
1558
|
+
struct ggml_context context;
|
1559
|
+
};
|
1560
|
+
|
1561
|
+
struct ggml_compute_state_shared {
|
1562
|
+
const struct ggml_cgraph* cgraph;
|
1563
|
+
const struct ggml_cplan* cplan;
|
1564
|
+
|
1565
|
+
int64_t perf_node_start_cycles;
|
1566
|
+
int64_t perf_node_start_time_us;
|
1567
|
+
|
1568
|
+
const int n_threads;
|
1569
|
+
|
1570
|
+
// synchronization primitives
|
1571
|
+
atomic_int n_active; // num active threads
|
1572
|
+
atomic_int node_n; // active graph node
|
1573
|
+
atomic_int node_task; // active graph node task phase
|
1574
|
+
|
1575
|
+
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
1576
|
+
void* abort_callback_data;
|
1577
|
+
|
1578
|
+
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
1579
|
+
};
|
1580
|
+
|
1581
|
+
struct ggml_compute_state {
|
1582
|
+
ggml_thread_t thrd;
|
1583
|
+
int ith;
|
1584
|
+
struct ggml_compute_state_shared* shared;
|
1585
|
+
enum ggml_status ec;
|
1586
|
+
};
|
1587
|
+
|
1537
1588
|
//
|
1538
1589
|
// fundamental operations
|
1539
1590
|
//
|
@@ -1949,6 +2000,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
|
|
1949
2000
|
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
|
1950
2001
|
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
1951
2002
|
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
|
2003
|
+
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
|
1952
2004
|
// TODO: optimize performance
|
1953
2005
|
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
1954
2006
|
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
@@ -2024,52 +2076,291 @@ inline static float ggml_silu_f32(float x) {
|
|
2024
2076
|
return x/(1.0f + expf(-x));
|
2025
2077
|
}
|
2026
2078
|
|
2027
|
-
|
2028
|
-
// const uint16_t * i16 = (const uint16_t *) x;
|
2029
|
-
// for (int i = 0; i < n; ++i) {
|
2030
|
-
// y[i] = ggml_table_silu_f16[i16[i]];
|
2031
|
-
// }
|
2032
|
-
//}
|
2079
|
+
#if defined(__ARM_NEON)
|
2033
2080
|
|
2034
|
-
|
2035
|
-
|
2036
|
-
|
2037
|
-
|
2038
|
-
|
2039
|
-
|
2040
|
-
|
2041
|
-
|
2042
|
-
|
2081
|
+
// adapted from arm limited optimized routine
|
2082
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
2083
|
+
// numbers above 88.38 will flush to infinity
|
2084
|
+
// numbers beneath -103.97 will flush to zero
|
2085
|
+
inline static float32x4_t ggml_v_expf(float32x4_t x) {
|
2086
|
+
const float32x4_t r = vdupq_n_f32(0x1.8p23f);
|
2087
|
+
const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
|
2088
|
+
const float32x4_t n = vsubq_f32(z, r);
|
2089
|
+
const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
|
2090
|
+
vdupq_n_f32(0x1.7f7d1cp-20f));
|
2091
|
+
const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
|
2092
|
+
const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
|
2093
|
+
const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
|
2094
|
+
const float32x4_t u = vmulq_f32(b, b);
|
2095
|
+
const float32x4_t j = vfmaq_f32(
|
2096
|
+
vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
|
2097
|
+
vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
|
2098
|
+
vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
|
2099
|
+
if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
|
2100
|
+
return vfmaq_f32(k, j, k);
|
2101
|
+
const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
|
2102
|
+
const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
|
2103
|
+
const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
|
2104
|
+
return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
|
2105
|
+
vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
|
2106
|
+
}
|
2107
|
+
|
2108
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
2109
|
+
inline static float32x4_t ggml_v_silu(float32x4_t x) {
|
2110
|
+
const float32x4_t one = vdupq_n_f32(1.0f);
|
2111
|
+
const float32x4_t zero = vdupq_n_f32(0.0f);
|
2112
|
+
const float32x4_t neg_x = vsubq_f32(zero, x);
|
2113
|
+
const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
|
2114
|
+
const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
|
2115
|
+
return vdivq_f32(x, one_plus_exp_neg_x);
|
2116
|
+
}
|
2117
|
+
|
2118
|
+
#elif defined(__AVX512F__) && defined(__AVX512DQ__)
|
2119
|
+
|
2120
|
+
// adapted from arm limited optimized routine
|
2121
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
2122
|
+
// numbers above 88.38 will flush to infinity
|
2123
|
+
// numbers beneath -103.97 will flush to zero
|
2124
|
+
inline static __m512 ggml_v_expf(__m512 x) {
|
2125
|
+
const __m512 r = _mm512_set1_ps(0x1.8p23f);
|
2126
|
+
const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
|
2127
|
+
const __m512 n = _mm512_sub_ps(z, r);
|
2128
|
+
const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
|
2129
|
+
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
|
2130
|
+
const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
|
2131
|
+
const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
|
2132
|
+
const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
|
2133
|
+
const __m512 u = _mm512_mul_ps(b, b);
|
2134
|
+
const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
|
2135
|
+
_mm512_set1_ps(0x1.573e2ep-5f)), u,
|
2136
|
+
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
|
2137
|
+
_mm512_set1_ps(0x1.fffdb6p-2f))),
|
2138
|
+
u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
|
2139
|
+
if (_mm512_kortestz(c, c))
|
2140
|
+
return _mm512_fmadd_ps(j, k, k);
|
2141
|
+
const __m512i g = _mm512_and_si512(
|
2142
|
+
_mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
|
2143
|
+
_mm512_set1_epi32(0x82000000u));
|
2144
|
+
const __m512 s1 =
|
2145
|
+
_mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
|
2146
|
+
const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
|
2147
|
+
const __mmask16 d =
|
2148
|
+
_mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
|
2149
|
+
return _mm512_mask_blend_ps(
|
2150
|
+
d, _mm512_mask_blend_ps(
|
2151
|
+
c, _mm512_fmadd_ps(k, j, k),
|
2152
|
+
_mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
|
2153
|
+
_mm512_mul_ps(s1, s1));
|
2154
|
+
}
|
2155
|
+
|
2156
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
2157
|
+
inline static __m512 ggml_v_silu(__m512 x) {
|
2158
|
+
const __m512 one = _mm512_set1_ps(1);
|
2159
|
+
const __m512 zero = _mm512_setzero_ps();
|
2160
|
+
const __m512 neg_x = _mm512_sub_ps(zero, x);
|
2161
|
+
const __m512 exp_neg_x = ggml_v_expf(neg_x);
|
2162
|
+
const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
|
2163
|
+
return _mm512_div_ps(x, one_plus_exp_neg_x);
|
2164
|
+
}
|
2165
|
+
|
2166
|
+
#elif defined(__AVX2__) && defined(__FMA__)
|
2167
|
+
|
2168
|
+
// adapted from arm limited optimized routine
|
2169
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
2170
|
+
// numbers above 88.38 will flush to infinity
|
2171
|
+
// numbers beneath -103.97 will flush to zero
|
2172
|
+
inline static __m256 ggml_v_expf(__m256 x) {
|
2173
|
+
const __m256 r = _mm256_set1_ps(0x1.8p23f);
|
2174
|
+
const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
|
2175
|
+
const __m256 n = _mm256_sub_ps(z, r);
|
2176
|
+
const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
|
2177
|
+
_mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
|
2178
|
+
const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
|
2179
|
+
const __m256 k = _mm256_castsi256_ps(
|
2180
|
+
_mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
|
2181
|
+
const __m256i c = _mm256_castps_si256(
|
2182
|
+
_mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
|
2183
|
+
_mm256_set1_ps(126), _CMP_GT_OQ));
|
2184
|
+
const __m256 u = _mm256_mul_ps(b, b);
|
2185
|
+
const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
|
2186
|
+
_mm256_set1_ps(0x1.573e2ep-5f)), u,
|
2187
|
+
_mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
|
2188
|
+
_mm256_set1_ps(0x1.fffdb6p-2f))),
|
2189
|
+
u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
|
2190
|
+
if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
|
2191
|
+
return _mm256_fmadd_ps(j, k, k);
|
2192
|
+
const __m256i g = _mm256_and_si256(
|
2193
|
+
_mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
|
2194
|
+
_mm256_set1_epi32(0x82000000u));
|
2195
|
+
const __m256 s1 =
|
2196
|
+
_mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
|
2197
|
+
const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
|
2198
|
+
const __m256i d = _mm256_castps_si256(
|
2199
|
+
_mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
|
2200
|
+
_mm256_set1_ps(192), _CMP_GT_OQ));
|
2201
|
+
return _mm256_or_ps(
|
2202
|
+
_mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
|
2203
|
+
_mm256_andnot_ps(
|
2204
|
+
_mm256_castsi256_ps(d),
|
2205
|
+
_mm256_or_ps(
|
2206
|
+
_mm256_and_ps(_mm256_castsi256_ps(c),
|
2207
|
+
_mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
|
2208
|
+
_mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
|
2209
|
+
}
|
2210
|
+
|
2211
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
2212
|
+
inline static __m256 ggml_v_silu(__m256 x) {
|
2213
|
+
const __m256 one = _mm256_set1_ps(1);
|
2214
|
+
const __m256 zero = _mm256_setzero_ps();
|
2215
|
+
const __m256 neg_x = _mm256_sub_ps(zero, x);
|
2216
|
+
const __m256 exp_neg_x = ggml_v_expf(neg_x);
|
2217
|
+
const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
|
2218
|
+
return _mm256_div_ps(x, one_plus_exp_neg_x);
|
2219
|
+
}
|
2220
|
+
|
2221
|
+
#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
|
2222
|
+
|
2223
|
+
#if defined(__FMA__)
|
2224
|
+
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
|
2225
|
+
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
|
2043
2226
|
#else
|
2044
|
-
|
2045
|
-
|
2227
|
+
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
|
2228
|
+
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
|
2229
|
+
#endif
|
2230
|
+
|
2231
|
+
// adapted from arm limited optimized routine
|
2232
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
2233
|
+
// numbers above 88.38 will flush to infinity
|
2234
|
+
// numbers beneath -103.97 will flush to zero
|
2235
|
+
inline static __m128 ggml_v_expf(__m128 x) {
|
2236
|
+
const __m128 r = _mm_set1_ps(0x1.8p23f);
|
2237
|
+
const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
|
2238
|
+
const __m128 n = _mm_sub_ps(z, r);
|
2239
|
+
const __m128 b =
|
2240
|
+
NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
|
2241
|
+
const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
|
2242
|
+
const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
|
2243
|
+
const __m128i c =
|
2244
|
+
_mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
|
2245
|
+
const __m128 u = _mm_mul_ps(b, b);
|
2246
|
+
const __m128 j =
|
2247
|
+
MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
|
2248
|
+
MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
|
2249
|
+
u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
|
2250
|
+
if (!_mm_movemask_epi8(c))
|
2251
|
+
return MADD128(j, k, k);
|
2252
|
+
const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
|
2253
|
+
_mm_set1_epi32(0x82000000u));
|
2254
|
+
const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
|
2255
|
+
const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
|
2256
|
+
const __m128i d =
|
2257
|
+
_mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
|
2258
|
+
return _mm_or_ps(
|
2259
|
+
_mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
|
2260
|
+
_mm_andnot_ps(_mm_castsi128_ps(d),
|
2261
|
+
_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
|
2262
|
+
_mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
|
2263
|
+
}
|
2264
|
+
|
2265
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
2266
|
+
inline static __m128 ggml_v_silu(__m128 x) {
|
2267
|
+
const __m128 one = _mm_set1_ps(1);
|
2268
|
+
const __m128 zero = _mm_setzero_ps();
|
2269
|
+
const __m128 neg_x = _mm_sub_ps(zero, x);
|
2270
|
+
const __m128 exp_neg_x = ggml_v_expf(neg_x);
|
2271
|
+
const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
|
2272
|
+
return _mm_div_ps(x, one_plus_exp_neg_x);
|
2273
|
+
}
|
2274
|
+
|
2275
|
+
#endif // __ARM_NEON / __AVX2__ / __SSE2__
|
2276
|
+
|
2277
|
+
static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
|
2278
|
+
int i = 0;
|
2279
|
+
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
2280
|
+
for (; i + 15 < n; i += 16) {
|
2281
|
+
_mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
|
2282
|
+
}
|
2283
|
+
#elif defined(__AVX2__) && defined(__FMA__)
|
2284
|
+
for (; i + 7 < n; i += 8) {
|
2285
|
+
_mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
|
2286
|
+
}
|
2287
|
+
#elif defined(__SSE2__)
|
2288
|
+
for (; i + 3 < n; i += 4) {
|
2289
|
+
_mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
|
2290
|
+
}
|
2291
|
+
#elif defined(__ARM_NEON)
|
2292
|
+
for (; i + 3 < n; i += 4) {
|
2293
|
+
vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
|
2294
|
+
}
|
2295
|
+
#endif
|
2296
|
+
for (; i < n; ++i) {
|
2046
2297
|
y[i] = ggml_silu_f32(x[i]);
|
2047
2298
|
}
|
2048
2299
|
}
|
2300
|
+
|
2301
|
+
static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
|
2302
|
+
int i = 0;
|
2303
|
+
ggml_float sum = 0;
|
2304
|
+
#if defined(__AVX512F__) && defined(__AVX512DQ__)
|
2305
|
+
for (; i + 15 < n; i += 16) {
|
2306
|
+
__m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
|
2307
|
+
_mm512_set1_ps(max)));
|
2308
|
+
_mm512_storeu_ps(y + i, val);
|
2309
|
+
sum += (ggml_float)_mm512_reduce_add_ps(val);
|
2310
|
+
}
|
2311
|
+
#elif defined(__AVX2__) && defined(__FMA__)
|
2312
|
+
for (; i + 7 < n; i += 8) {
|
2313
|
+
__m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
|
2314
|
+
_mm256_set1_ps(max)));
|
2315
|
+
_mm256_storeu_ps(y + i, val);
|
2316
|
+
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
|
2317
|
+
_mm256_castps256_ps128(val));
|
2318
|
+
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
|
2319
|
+
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
|
2320
|
+
sum += (ggml_float)_mm_cvtss_f32(val2);
|
2321
|
+
}
|
2322
|
+
#elif defined(__SSE2__)
|
2323
|
+
for (; i + 3 < n; i += 4) {
|
2324
|
+
__m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
|
2325
|
+
_mm_set1_ps(max)));
|
2326
|
+
_mm_storeu_ps(y + i, val);
|
2327
|
+
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
2328
|
+
val = _mm_add_ps(val, _mm_movehl_ps(val, val));
|
2329
|
+
val = _mm_add_ss(val, _mm_movehdup_ps(val));
|
2330
|
+
#else
|
2331
|
+
__m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
|
2332
|
+
val = _mm_add_ps(val, tmp);
|
2333
|
+
tmp = _mm_movehl_ps(tmp, val);
|
2334
|
+
val = _mm_add_ss(val, tmp);
|
2335
|
+
#endif
|
2336
|
+
sum += (ggml_float)_mm_cvtss_f32(val);
|
2337
|
+
}
|
2338
|
+
#elif defined(__ARM_NEON)
|
2339
|
+
for (; i + 3 < n; i += 4) {
|
2340
|
+
float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
|
2341
|
+
vdupq_n_f32(max)));
|
2342
|
+
vst1q_f32(y + i, val);
|
2343
|
+
sum += (ggml_float)vaddvq_f32(val);
|
2344
|
+
}
|
2049
2345
|
#endif
|
2346
|
+
for (; i < n; ++i) {
|
2347
|
+
float val = expf(x[i] - max);
|
2348
|
+
sum += (ggml_float)val;
|
2349
|
+
y[i] = val;
|
2350
|
+
}
|
2351
|
+
return sum;
|
2352
|
+
}
|
2050
2353
|
|
2051
2354
|
inline static float ggml_silu_backward_f32(float x, float dy) {
|
2052
2355
|
const float s = 1.0f/(1.0f + expf(-x));
|
2053
2356
|
return dy*s*(1.0f + x*(1.0f - s));
|
2054
2357
|
}
|
2055
2358
|
|
2056
|
-
#ifdef GGML_SILU_FP16
|
2057
|
-
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
|
2058
|
-
for (int i = 0; i < n; ++i) {
|
2059
|
-
// we did not use x[i] to compute forward silu but its f16 equivalent
|
2060
|
-
// take derivative at f16 of x[i]:
|
2061
|
-
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
2062
|
-
float usedx = GGML_FP16_TO_FP32(fp16);
|
2063
|
-
dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
|
2064
|
-
}
|
2065
|
-
}
|
2066
|
-
#else
|
2067
2359
|
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
|
2068
2360
|
for (int i = 0; i < n; ++i) {
|
2069
2361
|
dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
|
2070
2362
|
}
|
2071
2363
|
}
|
2072
|
-
#endif
|
2073
2364
|
|
2074
2365
|
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
|
2075
2366
|
#ifndef GGML_USE_ACCELERATE
|
@@ -2185,7 +2476,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
2185
2476
|
"SOFT_MAX_BACK",
|
2186
2477
|
"ROPE",
|
2187
2478
|
"ROPE_BACK",
|
2188
|
-
"ALIBI",
|
2189
2479
|
"CLAMP",
|
2190
2480
|
"CONV_TRANSPOSE_1D",
|
2191
2481
|
"IM2COL",
|
@@ -2227,7 +2517,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
2227
2517
|
"CROSS_ENTROPY_LOSS_BACK",
|
2228
2518
|
};
|
2229
2519
|
|
2230
|
-
static_assert(GGML_OP_COUNT ==
|
2520
|
+
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
|
2231
2521
|
|
2232
2522
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
2233
2523
|
"none",
|
@@ -2276,7 +2566,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
2276
2566
|
"soft_max_back(x)",
|
2277
2567
|
"rope(x)",
|
2278
2568
|
"rope_back(x)",
|
2279
|
-
"alibi(x)",
|
2280
2569
|
"clamp(x)",
|
2281
2570
|
"conv_transpose_1d(x)",
|
2282
2571
|
"im2col(x)",
|
@@ -2318,7 +2607,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
2318
2607
|
"cross_entropy_loss_back(x,y)",
|
2319
2608
|
};
|
2320
2609
|
|
2321
|
-
static_assert(GGML_OP_COUNT ==
|
2610
|
+
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
|
2322
2611
|
|
2323
2612
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
2324
2613
|
|
@@ -2331,6 +2620,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
|
2331
2620
|
"TANH",
|
2332
2621
|
"ELU",
|
2333
2622
|
"RELU",
|
2623
|
+
"SIGMOID",
|
2334
2624
|
"GELU",
|
2335
2625
|
"GELU_QUICK",
|
2336
2626
|
"SILU",
|
@@ -2338,7 +2628,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
|
2338
2628
|
"HARDSIGMOID",
|
2339
2629
|
};
|
2340
2630
|
|
2341
|
-
static_assert(GGML_UNARY_OP_COUNT ==
|
2631
|
+
static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
|
2342
2632
|
|
2343
2633
|
|
2344
2634
|
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
@@ -2380,32 +2670,6 @@ static void ggml_setup_op_has_task_pass(void) {
|
|
2380
2670
|
}
|
2381
2671
|
}
|
2382
2672
|
|
2383
|
-
//
|
2384
|
-
// ggml context
|
2385
|
-
//
|
2386
|
-
|
2387
|
-
struct ggml_context {
|
2388
|
-
size_t mem_size;
|
2389
|
-
void * mem_buffer;
|
2390
|
-
bool mem_buffer_owned;
|
2391
|
-
bool no_alloc;
|
2392
|
-
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
2393
|
-
|
2394
|
-
int n_objects;
|
2395
|
-
|
2396
|
-
struct ggml_object * objects_begin;
|
2397
|
-
struct ggml_object * objects_end;
|
2398
|
-
|
2399
|
-
struct ggml_scratch scratch;
|
2400
|
-
struct ggml_scratch scratch_save;
|
2401
|
-
};
|
2402
|
-
|
2403
|
-
struct ggml_context_container {
|
2404
|
-
bool used;
|
2405
|
-
|
2406
|
-
struct ggml_context context;
|
2407
|
-
};
|
2408
|
-
|
2409
2673
|
//
|
2410
2674
|
// NUMA support
|
2411
2675
|
//
|
@@ -2819,6 +3083,16 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|
2819
3083
|
(t0->ne[3] == t1->ne[3] );
|
2820
3084
|
}
|
2821
3085
|
|
3086
|
+
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
3087
|
+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3088
|
+
|
3089
|
+
return
|
3090
|
+
(t0->nb[0] == t1->nb[0] ) &&
|
3091
|
+
(t0->nb[1] == t1->nb[1] ) &&
|
3092
|
+
(t0->nb[2] == t1->nb[2] ) &&
|
3093
|
+
(t0->nb[3] == t1->nb[3] );
|
3094
|
+
}
|
3095
|
+
|
2822
3096
|
// check if t1 can be represented as a repeatition of t0
|
2823
3097
|
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
2824
3098
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
@@ -2878,8 +3152,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
2878
3152
|
float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
|
2879
3153
|
ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
2880
3154
|
ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
|
2881
|
-
ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
|
2882
|
-
ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
|
2883
3155
|
}
|
2884
3156
|
|
2885
3157
|
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
@@ -3163,6 +3435,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
3163
3435
|
|
3164
3436
|
struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
|
3165
3437
|
|
3438
|
+
#ifdef __clang__
|
3439
|
+
// temporary until ggml_tensor::backend is removed
|
3440
|
+
#pragma clang diagnostic push
|
3441
|
+
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
3442
|
+
#endif
|
3443
|
+
|
3166
3444
|
*result = (struct ggml_tensor) {
|
3167
3445
|
/*.type =*/ type,
|
3168
3446
|
/*.backend =*/ GGML_BACKEND_TYPE_CPU,
|
@@ -3185,6 +3463,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|
3185
3463
|
/*.padding =*/ { 0 },
|
3186
3464
|
};
|
3187
3465
|
|
3466
|
+
#ifdef __clang__
|
3467
|
+
#pragma clang diagnostic pop
|
3468
|
+
#endif
|
3469
|
+
|
3188
3470
|
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
|
3189
3471
|
//ggml_assert_aligned(result->data);
|
3190
3472
|
|
@@ -4563,6 +4845,20 @@ struct ggml_tensor * ggml_leaky_relu(
|
|
4563
4845
|
return result;
|
4564
4846
|
}
|
4565
4847
|
|
4848
|
+
// ggml_sigmoid
|
4849
|
+
|
4850
|
+
struct ggml_tensor * ggml_sigmoid(
|
4851
|
+
struct ggml_context * ctx,
|
4852
|
+
struct ggml_tensor * a) {
|
4853
|
+
return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
|
4854
|
+
}
|
4855
|
+
|
4856
|
+
struct ggml_tensor * ggml_sigmoid_inplace(
|
4857
|
+
struct ggml_context * ctx,
|
4858
|
+
struct ggml_tensor * a) {
|
4859
|
+
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
|
4860
|
+
}
|
4861
|
+
|
4566
4862
|
// ggml_gelu
|
4567
4863
|
|
4568
4864
|
struct ggml_tensor * ggml_gelu(
|
@@ -5646,7 +5942,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
5646
5942
|
struct ggml_context * ctx,
|
5647
5943
|
struct ggml_tensor * a,
|
5648
5944
|
struct ggml_tensor * mask,
|
5649
|
-
struct ggml_tensor * pos,
|
5650
5945
|
float scale,
|
5651
5946
|
float max_bias,
|
5652
5947
|
bool inplace) {
|
@@ -5660,18 +5955,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
5660
5955
|
GGML_ASSERT(mask->ne[1] >= a->ne[1]);
|
5661
5956
|
}
|
5662
5957
|
|
5663
|
-
if (pos) {
|
5664
|
-
GGML_ASSERT(ggml_is_vector(pos));
|
5665
|
-
GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
|
5666
|
-
GGML_ASSERT(pos->ne[0] == a->ne[0]);
|
5667
|
-
}
|
5668
|
-
|
5669
|
-
if (pos && mask) {
|
5670
|
-
GGML_ASSERT(pos->type == mask->type);
|
5671
|
-
}
|
5672
|
-
|
5673
5958
|
if (max_bias > 0.0f) {
|
5674
|
-
GGML_ASSERT(
|
5959
|
+
GGML_ASSERT(mask);
|
5675
5960
|
}
|
5676
5961
|
|
5677
5962
|
bool is_node = false;
|
@@ -5689,7 +5974,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
5689
5974
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5690
5975
|
result->src[0] = a;
|
5691
5976
|
result->src[1] = mask;
|
5692
|
-
result->src[2] = pos;
|
5693
5977
|
|
5694
5978
|
return result;
|
5695
5979
|
}
|
@@ -5697,23 +5981,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
5697
5981
|
struct ggml_tensor * ggml_soft_max(
|
5698
5982
|
struct ggml_context * ctx,
|
5699
5983
|
struct ggml_tensor * a) {
|
5700
|
-
return ggml_soft_max_impl(ctx, a, NULL,
|
5984
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
|
5701
5985
|
}
|
5702
5986
|
|
5703
5987
|
struct ggml_tensor * ggml_soft_max_inplace(
|
5704
5988
|
struct ggml_context * ctx,
|
5705
5989
|
struct ggml_tensor * a) {
|
5706
|
-
return ggml_soft_max_impl(ctx, a, NULL,
|
5990
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
|
5707
5991
|
}
|
5708
5992
|
|
5709
5993
|
struct ggml_tensor * ggml_soft_max_ext(
|
5710
5994
|
struct ggml_context * ctx,
|
5711
5995
|
struct ggml_tensor * a,
|
5712
5996
|
struct ggml_tensor * mask,
|
5713
|
-
struct ggml_tensor * pos,
|
5714
5997
|
float scale,
|
5715
5998
|
float max_bias) {
|
5716
|
-
return ggml_soft_max_impl(ctx, a, mask,
|
5999
|
+
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
|
5717
6000
|
}
|
5718
6001
|
|
5719
6002
|
// ggml_soft_max_back
|
@@ -5928,37 +6211,6 @@ struct ggml_tensor * ggml_rope_back(
|
|
5928
6211
|
return result;
|
5929
6212
|
}
|
5930
6213
|
|
5931
|
-
// ggml_alibi
|
5932
|
-
|
5933
|
-
struct ggml_tensor * ggml_alibi(
|
5934
|
-
struct ggml_context * ctx,
|
5935
|
-
struct ggml_tensor * a,
|
5936
|
-
int n_past,
|
5937
|
-
int n_head,
|
5938
|
-
float bias_max) {
|
5939
|
-
GGML_ASSERT(n_past >= 0);
|
5940
|
-
bool is_node = false;
|
5941
|
-
|
5942
|
-
if (a->grad) {
|
5943
|
-
GGML_ASSERT(false); // TODO: implement backward
|
5944
|
-
is_node = true;
|
5945
|
-
}
|
5946
|
-
|
5947
|
-
// TODO: when implement backward, fix this:
|
5948
|
-
//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
5949
|
-
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
5950
|
-
|
5951
|
-
int32_t op_params[3] = { n_past, n_head };
|
5952
|
-
memcpy(op_params + 2, &bias_max, sizeof(float));
|
5953
|
-
ggml_set_op_params(result, op_params, sizeof(op_params));
|
5954
|
-
|
5955
|
-
result->op = GGML_OP_ALIBI;
|
5956
|
-
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5957
|
-
result->src[0] = a;
|
5958
|
-
|
5959
|
-
return result;
|
5960
|
-
}
|
5961
|
-
|
5962
6214
|
// ggml_clamp
|
5963
6215
|
|
5964
6216
|
struct ggml_tensor * ggml_clamp(
|
@@ -6308,7 +6560,10 @@ struct ggml_tensor * ggml_pool_2d(
|
|
6308
6560
|
static struct ggml_tensor * ggml_upscale_impl(
|
6309
6561
|
struct ggml_context * ctx,
|
6310
6562
|
struct ggml_tensor * a,
|
6311
|
-
int
|
6563
|
+
int ne0,
|
6564
|
+
int ne1,
|
6565
|
+
int ne2,
|
6566
|
+
int ne3) {
|
6312
6567
|
bool is_node = false;
|
6313
6568
|
|
6314
6569
|
if (a->grad) {
|
@@ -6316,19 +6571,45 @@ static struct ggml_tensor * ggml_upscale_impl(
|
|
6316
6571
|
is_node = true;
|
6317
6572
|
}
|
6318
6573
|
|
6574
|
+
GGML_ASSERT(a->ne[0] <= ne0);
|
6575
|
+
GGML_ASSERT(a->ne[1] <= ne1);
|
6576
|
+
GGML_ASSERT(a->ne[2] <= ne2);
|
6577
|
+
GGML_ASSERT(a->ne[3] <= ne3);
|
6578
|
+
|
6319
6579
|
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
|
6320
|
-
|
6321
|
-
|
6322
|
-
|
6580
|
+
ne0,
|
6581
|
+
ne1,
|
6582
|
+
ne2,
|
6583
|
+
ne3
|
6584
|
+
);
|
6323
6585
|
|
6324
6586
|
result->op = GGML_OP_UPSCALE;
|
6325
|
-
|
6587
|
+
|
6326
6588
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6327
6589
|
result->src[0] = a;
|
6328
6590
|
|
6329
6591
|
return result;
|
6330
6592
|
}
|
6331
6593
|
|
6594
|
+
struct ggml_tensor * ggml_upscale(
|
6595
|
+
struct ggml_context * ctx,
|
6596
|
+
struct ggml_tensor * a,
|
6597
|
+
int scale_factor) {
|
6598
|
+
return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
|
6599
|
+
}
|
6600
|
+
|
6601
|
+
struct ggml_tensor * ggml_upscale_ext(
|
6602
|
+
struct ggml_context * ctx,
|
6603
|
+
struct ggml_tensor * a,
|
6604
|
+
int ne0,
|
6605
|
+
int ne1,
|
6606
|
+
int ne2,
|
6607
|
+
int ne3) {
|
6608
|
+
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
|
6609
|
+
}
|
6610
|
+
|
6611
|
+
// ggml_pad
|
6612
|
+
|
6332
6613
|
struct ggml_tensor * ggml_pad(
|
6333
6614
|
struct ggml_context * ctx,
|
6334
6615
|
struct ggml_tensor * a,
|
@@ -6353,12 +6634,7 @@ struct ggml_tensor * ggml_pad(
|
|
6353
6634
|
return result;
|
6354
6635
|
}
|
6355
6636
|
|
6356
|
-
|
6357
|
-
struct ggml_context * ctx,
|
6358
|
-
struct ggml_tensor * a,
|
6359
|
-
int scale_factor) {
|
6360
|
-
return ggml_upscale_impl(ctx, a, scale_factor);
|
6361
|
-
}
|
6637
|
+
// ggml_arange
|
6362
6638
|
|
6363
6639
|
struct ggml_tensor * ggml_arange(
|
6364
6640
|
struct ggml_context * ctx,
|
@@ -6380,6 +6656,8 @@ struct ggml_tensor * ggml_arange(
|
|
6380
6656
|
return result;
|
6381
6657
|
}
|
6382
6658
|
|
6659
|
+
// ggml_timestep_embedding
|
6660
|
+
|
6383
6661
|
struct ggml_tensor * ggml_timestep_embedding(
|
6384
6662
|
struct ggml_context * ctx,
|
6385
6663
|
struct ggml_tensor * timesteps,
|
@@ -6486,9 +6764,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|
6486
6764
|
struct ggml_tensor * k,
|
6487
6765
|
struct ggml_tensor * v,
|
6488
6766
|
struct ggml_tensor * mask,
|
6489
|
-
float scale
|
6767
|
+
float scale,
|
6768
|
+
float max_bias) {
|
6490
6769
|
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
6491
6770
|
// TODO: check if vT can be multiplied by (k*qT)
|
6771
|
+
|
6492
6772
|
if (mask) {
|
6493
6773
|
GGML_ASSERT(ggml_is_contiguous(mask));
|
6494
6774
|
GGML_ASSERT(mask->ne[2] == 1);
|
@@ -6498,6 +6778,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|
6498
6778
|
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
|
6499
6779
|
}
|
6500
6780
|
|
6781
|
+
if (max_bias > 0.0f) {
|
6782
|
+
GGML_ASSERT(mask);
|
6783
|
+
}
|
6784
|
+
|
6501
6785
|
bool is_node = false;
|
6502
6786
|
|
6503
6787
|
if (q->grad || k->grad || v->grad) {
|
@@ -6508,7 +6792,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|
6508
6792
|
int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
|
6509
6793
|
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
6510
6794
|
|
6511
|
-
float params[] = { scale };
|
6795
|
+
float params[] = { scale, max_bias };
|
6512
6796
|
ggml_set_op_params(result, params, sizeof(params));
|
6513
6797
|
|
6514
6798
|
result->op = GGML_OP_FLASH_ATTN_EXT;
|
@@ -6528,7 +6812,7 @@ void ggml_flash_attn_ext_set_prec(
|
|
6528
6812
|
|
6529
6813
|
const int32_t prec_i32 = (int32_t) prec;
|
6530
6814
|
|
6531
|
-
ggml_set_op_params_i32(a,
|
6815
|
+
ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
|
6532
6816
|
}
|
6533
6817
|
|
6534
6818
|
// ggml_flash_ff
|
@@ -10892,6 +11176,52 @@ static void ggml_compute_forward_relu(
|
|
10892
11176
|
}
|
10893
11177
|
}
|
10894
11178
|
|
11179
|
+
// ggml_compute_forward_sigmoid
|
11180
|
+
|
11181
|
+
static void ggml_compute_forward_sigmoid_f32(
|
11182
|
+
const struct ggml_compute_params * params,
|
11183
|
+
struct ggml_tensor * dst) {
|
11184
|
+
|
11185
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
11186
|
+
|
11187
|
+
assert(params->ith == 0);
|
11188
|
+
assert(ggml_are_same_shape(src0, dst));
|
11189
|
+
|
11190
|
+
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11191
|
+
return;
|
11192
|
+
}
|
11193
|
+
|
11194
|
+
const int n = ggml_nrows(src0);
|
11195
|
+
const int nc = src0->ne[0];
|
11196
|
+
|
11197
|
+
assert(dst->nb[0] == sizeof(float));
|
11198
|
+
assert(src0->nb[0] == sizeof(float));
|
11199
|
+
|
11200
|
+
for (int i = 0; i < n; i++) {
|
11201
|
+
ggml_vec_sigmoid_f32(nc,
|
11202
|
+
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
11203
|
+
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
11204
|
+
}
|
11205
|
+
}
|
11206
|
+
|
11207
|
+
static void ggml_compute_forward_sigmoid(
|
11208
|
+
const struct ggml_compute_params * params,
|
11209
|
+
struct ggml_tensor * dst) {
|
11210
|
+
|
11211
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
11212
|
+
|
11213
|
+
switch (src0->type) {
|
11214
|
+
case GGML_TYPE_F32:
|
11215
|
+
{
|
11216
|
+
ggml_compute_forward_sigmoid_f32(params, dst);
|
11217
|
+
} break;
|
11218
|
+
default:
|
11219
|
+
{
|
11220
|
+
GGML_ASSERT(false);
|
11221
|
+
} break;
|
11222
|
+
}
|
11223
|
+
}
|
11224
|
+
|
10895
11225
|
// ggml_compute_forward_gelu
|
10896
11226
|
|
10897
11227
|
static void ggml_compute_forward_gelu_f32(
|
@@ -11742,48 +12072,139 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
|
11742
12072
|
}
|
11743
12073
|
#endif
|
11744
12074
|
|
11745
|
-
static void
|
11746
|
-
|
11747
|
-
|
12075
|
+
static void ggml_compute_forward_mul_mat_one_chunk(
|
12076
|
+
const struct ggml_compute_params * params,
|
12077
|
+
struct ggml_tensor * dst,
|
12078
|
+
const int64_t num_rows_per_vec_dot,
|
12079
|
+
const int64_t ir0_start,
|
12080
|
+
const int64_t ir0_end,
|
12081
|
+
const int64_t ir1_start,
|
12082
|
+
const int64_t ir1_end) {
|
11748
12083
|
|
11749
12084
|
const struct ggml_tensor * src0 = dst->src[0];
|
11750
12085
|
const struct ggml_tensor * src1 = dst->src[1];
|
11751
12086
|
|
11752
|
-
int64_t t0 = ggml_perf_time_us();
|
11753
|
-
UNUSED(t0);
|
11754
|
-
|
11755
12087
|
GGML_TENSOR_BINARY_OP_LOCALS
|
11756
12088
|
|
11757
|
-
const int ith = params->ith;
|
11758
|
-
const int nth = params->nth;
|
11759
|
-
|
11760
12089
|
const enum ggml_type type = src0->type;
|
11761
12090
|
|
11762
12091
|
const bool src1_cont = ggml_is_contiguous(src1);
|
11763
12092
|
|
11764
|
-
ggml_vec_dot_t const vec_dot
|
11765
|
-
enum ggml_type const vec_dot_type
|
11766
|
-
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
11767
|
-
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
12093
|
+
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
12094
|
+
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
11768
12095
|
|
11769
|
-
|
11770
|
-
|
11771
|
-
|
11772
|
-
GGML_ASSERT(ne3 == ne13);
|
12096
|
+
// broadcast factors
|
12097
|
+
const int64_t r2 = ne12 / ne02;
|
12098
|
+
const int64_t r3 = ne13 / ne03;
|
11773
12099
|
|
11774
|
-
//
|
11775
|
-
GGML_ASSERT(nb00 == ggml_type_size(type));
|
11776
|
-
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
12100
|
+
//printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
|
11777
12101
|
|
11778
|
-
//
|
11779
|
-
|
11780
|
-
|
11781
|
-
|
11782
|
-
|
12102
|
+
// threads with no work simply yield (not sure if it helps)
|
12103
|
+
if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
|
12104
|
+
return;
|
12105
|
+
}
|
12106
|
+
|
12107
|
+
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
12108
|
+
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
12109
|
+
|
12110
|
+
assert(ne12 % ne02 == 0);
|
12111
|
+
assert(ne13 % ne03 == 0);
|
12112
|
+
|
12113
|
+
// block-tiling attempt
|
12114
|
+
const int64_t blck_0 = 16;
|
12115
|
+
const int64_t blck_1 = 16;
|
12116
|
+
|
12117
|
+
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
|
12118
|
+
|
12119
|
+
// attempt to reduce false-sharing (does not seem to make a difference)
|
12120
|
+
// 16 * 2, accounting for mmla kernels
|
12121
|
+
float tmp[32];
|
12122
|
+
|
12123
|
+
for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
|
12124
|
+
for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
|
12125
|
+
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
|
12126
|
+
const int64_t i13 = (ir1 / (ne12 * ne1));
|
12127
|
+
const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
|
12128
|
+
const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
|
12129
|
+
|
12130
|
+
// broadcast src0 into src1
|
12131
|
+
const int64_t i03 = i13 / r3;
|
12132
|
+
const int64_t i02 = i12 / r2;
|
12133
|
+
|
12134
|
+
const int64_t i1 = i11;
|
12135
|
+
const int64_t i2 = i12;
|
12136
|
+
const int64_t i3 = i13;
|
12137
|
+
|
12138
|
+
const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
|
12139
|
+
|
12140
|
+
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
12141
|
+
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
12142
|
+
// the original src1 data pointer, so we should index using the indices directly
|
12143
|
+
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
12144
|
+
const char * src1_col = (const char*)wdata +
|
12145
|
+
(src1_cont || src1->type != vec_dot_type
|
12146
|
+
? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
|
12147
|
+
: (i11 * nb11 + i12 * nb12 + i13 * nb13));
|
12148
|
+
float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
|
12149
|
+
|
12150
|
+
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
|
12151
|
+
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
12152
|
+
//}
|
12153
|
+
|
12154
|
+
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
|
12155
|
+
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
|
12156
|
+
}
|
12157
|
+
|
12158
|
+
for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
|
12159
|
+
memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
|
12160
|
+
}
|
12161
|
+
}
|
12162
|
+
}
|
12163
|
+
}
|
12164
|
+
}
|
12165
|
+
|
12166
|
+
static void ggml_compute_forward_mul_mat(
|
12167
|
+
const struct ggml_compute_params * params,
|
12168
|
+
struct ggml_tensor * dst,
|
12169
|
+
struct ggml_compute_state * state) {
|
12170
|
+
|
12171
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
12172
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
12173
|
+
|
12174
|
+
int64_t t0 = ggml_perf_time_us();
|
12175
|
+
UNUSED(t0);
|
12176
|
+
|
12177
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
12178
|
+
|
12179
|
+
const int ith = params->ith;
|
12180
|
+
const int nth = params->nth;
|
12181
|
+
|
12182
|
+
const enum ggml_type type = src0->type;
|
12183
|
+
|
12184
|
+
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
12185
|
+
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
12186
|
+
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
12187
|
+
|
12188
|
+
GGML_ASSERT(ne0 == ne01);
|
12189
|
+
GGML_ASSERT(ne1 == ne11);
|
12190
|
+
GGML_ASSERT(ne2 == ne12);
|
12191
|
+
GGML_ASSERT(ne3 == ne13);
|
12192
|
+
|
12193
|
+
// we don't support permuted src0 or src1
|
12194
|
+
GGML_ASSERT(nb00 == ggml_type_size(type));
|
12195
|
+
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
12196
|
+
|
12197
|
+
// dst cannot be transposed or permuted
|
12198
|
+
GGML_ASSERT(nb0 == sizeof(float));
|
12199
|
+
GGML_ASSERT(nb0 <= nb1);
|
12200
|
+
GGML_ASSERT(nb1 <= nb2);
|
12201
|
+
GGML_ASSERT(nb2 <= nb3);
|
11783
12202
|
|
11784
12203
|
// broadcast factors
|
11785
|
-
const int64_t r2 = ne12/ne02;
|
11786
|
-
const int64_t r3 = ne13/ne03;
|
12204
|
+
const int64_t r2 = ne12 / ne02;
|
12205
|
+
const int64_t r3 = ne13 / ne03;
|
12206
|
+
UNUSED(r2);
|
12207
|
+
UNUSED(r3);
|
11787
12208
|
|
11788
12209
|
// nb01 >= nb00 - src0 is not transposed
|
11789
12210
|
// compute by src0 rows
|
@@ -11865,6 +12286,8 @@ static void ggml_compute_forward_mul_mat(
|
|
11865
12286
|
#endif
|
11866
12287
|
|
11867
12288
|
#if GGML_USE_LLAMAFILE
|
12289
|
+
const bool src1_cont = ggml_is_contiguous(src1);
|
12290
|
+
|
11868
12291
|
if (src1_cont) {
|
11869
12292
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
11870
12293
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
@@ -11890,6 +12313,8 @@ UseGgmlGemm1:;
|
|
11890
12313
|
if (ith != 0) {
|
11891
12314
|
return;
|
11892
12315
|
}
|
12316
|
+
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
|
12317
|
+
atomic_store(&state->shared->current_chunk, nth);
|
11893
12318
|
if (src1->type != vec_dot_type) {
|
11894
12319
|
char * wdata = params->wdata;
|
11895
12320
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
@@ -11914,11 +12339,11 @@ UseGgmlGemm1:;
|
|
11914
12339
|
return;
|
11915
12340
|
}
|
11916
12341
|
|
11917
|
-
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
11918
|
-
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
11919
|
-
|
11920
12342
|
#if GGML_USE_LLAMAFILE
|
11921
12343
|
if (src1->type != vec_dot_type) {
|
12344
|
+
const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
12345
|
+
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
12346
|
+
|
11922
12347
|
for (int64_t i13 = 0; i13 < ne13; i13++)
|
11923
12348
|
for (int64_t i12 = 0; i12 < ne12; i12++)
|
11924
12349
|
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
@@ -11939,98 +12364,87 @@ UseGgmlGemm1:;
|
|
11939
12364
|
UseGgmlGemm2:;
|
11940
12365
|
#endif
|
11941
12366
|
|
11942
|
-
|
11943
|
-
|
11944
|
-
|
11945
|
-
|
11946
|
-
|
11947
|
-
// distribute the thread work across the inner or outer loop based on which one is larger
|
11948
|
-
|
11949
|
-
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
11950
|
-
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
11951
|
-
|
11952
|
-
const int64_t ith0 = ith % nth0;
|
11953
|
-
const int64_t ith1 = ith / nth0;
|
11954
|
-
|
11955
|
-
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
|
11956
|
-
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
|
11957
|
-
|
11958
|
-
const int64_t ir010 = dr0*ith0;
|
11959
|
-
const int64_t ir011 = MIN(ir010 + dr0, nr0);
|
11960
|
-
|
11961
|
-
const int64_t ir110 = dr1*ith1;
|
11962
|
-
const int64_t ir111 = MIN(ir110 + dr1, nr1);
|
11963
|
-
|
11964
|
-
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
|
11965
|
-
|
11966
|
-
// threads with no work simply yield (not sure if it helps)
|
11967
|
-
if (ir010 >= ir011 || ir110 >= ir111) {
|
11968
|
-
sched_yield();
|
11969
|
-
return;
|
11970
|
-
}
|
12367
|
+
#ifdef GGML_PERF
|
12368
|
+
int chunks_executed = 0;
|
12369
|
+
UNUSED(chunks_executed);
|
12370
|
+
#endif
|
11971
12371
|
|
11972
|
-
|
11973
|
-
|
12372
|
+
// This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
|
12373
|
+
const int64_t nr0 = ne0;
|
11974
12374
|
|
11975
|
-
//
|
11976
|
-
const int64_t
|
11977
|
-
const int64_t blck_1 = 16;
|
12375
|
+
// This is the size of the rest of the dimensions of the result
|
12376
|
+
const int64_t nr1 = ne1 * ne2 * ne3;
|
11978
12377
|
|
11979
12378
|
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
11980
|
-
int64_t
|
12379
|
+
int64_t num_rows_per_vec_dot = vec_dot_num_rows;
|
11981
12380
|
// TODO: currently the mmla kernels support only even numbered rows/cols.
|
11982
12381
|
// this check can be removed once they are extended to support odd numbered rows/cols too
|
11983
12382
|
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
|
11984
|
-
|
12383
|
+
num_rows_per_vec_dot = 1;
|
11985
12384
|
}
|
11986
12385
|
|
11987
|
-
|
12386
|
+
// Now select a reasonable chunk size.
|
12387
|
+
int chunk_size = 16;
|
11988
12388
|
|
11989
|
-
//
|
11990
|
-
|
11991
|
-
|
12389
|
+
// We need to step up the size if it's small
|
12390
|
+
if (nr0 == 1 || nr1 == 1) {
|
12391
|
+
chunk_size = 64;
|
12392
|
+
}
|
11992
12393
|
|
11993
|
-
|
11994
|
-
|
11995
|
-
|
11996
|
-
|
11997
|
-
|
11998
|
-
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
12394
|
+
// distribute the work across the inner or outer loop based on which one is larger
|
12395
|
+
// The number of chunks in the 0/1 dim.
|
12396
|
+
// CEIL(nr0/chunk_size)
|
12397
|
+
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
|
12398
|
+
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
|
11999
12399
|
|
12000
|
-
|
12001
|
-
|
12002
|
-
|
12400
|
+
// If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
|
12401
|
+
// Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
|
12402
|
+
// In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
|
12403
|
+
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
|
12404
|
+
// distribute the thread work across the inner or outer loop based on which one is larger
|
12405
|
+
nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
12406
|
+
nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
12407
|
+
}
|
12003
12408
|
|
12004
|
-
|
12005
|
-
|
12006
|
-
|
12409
|
+
// The number of elements in each chunk
|
12410
|
+
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
12411
|
+
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
|
12007
12412
|
|
12008
|
-
|
12413
|
+
//if (ith == 0)
|
12414
|
+
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
|
12009
12415
|
|
12010
|
-
|
12011
|
-
|
12012
|
-
// the original src1 data pointer, so we should index using the indices directly
|
12013
|
-
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
12014
|
-
const char * src1_col = (const char *) wdata +
|
12015
|
-
(src1_cont || src1->type != vec_dot_type
|
12016
|
-
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
12017
|
-
: (i11*nb11 + i12*nb12 + i13*nb13));
|
12018
|
-
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
12416
|
+
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
12417
|
+
int current_chunk = ith;
|
12019
12418
|
|
12020
|
-
|
12021
|
-
|
12022
|
-
|
12419
|
+
while (current_chunk < nchunk0 * nchunk1) {
|
12420
|
+
const int64_t ith0 = current_chunk % nchunk0;
|
12421
|
+
const int64_t ith1 = current_chunk / nchunk0;
|
12023
12422
|
|
12024
|
-
|
12025
|
-
|
12026
|
-
}
|
12423
|
+
const int64_t ir0_start = dr0 * ith0;
|
12424
|
+
const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
|
12027
12425
|
|
12028
|
-
|
12029
|
-
|
12030
|
-
|
12031
|
-
|
12426
|
+
const int64_t ir1_start = dr1 * ith1;
|
12427
|
+
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
|
12428
|
+
|
12429
|
+
ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
|
12430
|
+
|
12431
|
+
#ifdef GGML_PERF
|
12432
|
+
chunks_executed++;
|
12433
|
+
#endif
|
12434
|
+
|
12435
|
+
if (nth >= nchunk0 * nchunk1) {
|
12436
|
+
break;
|
12032
12437
|
}
|
12438
|
+
|
12439
|
+
current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
|
12033
12440
|
}
|
12441
|
+
|
12442
|
+
#ifdef GGML_PERF
|
12443
|
+
// These numbers are useful when trying to measure how well the threading scheduling works.
|
12444
|
+
//int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
|
12445
|
+
//float time = (ggml_perf_time_us() - t0);
|
12446
|
+
//printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
|
12447
|
+
#endif
|
12034
12448
|
}
|
12035
12449
|
|
12036
12450
|
// ggml_compute_forward_mul_mat_id
|
@@ -13333,7 +13747,6 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13333
13747
|
|
13334
13748
|
const struct ggml_tensor * src0 = dst->src[0];
|
13335
13749
|
const struct ggml_tensor * src1 = dst->src[1];
|
13336
|
-
const struct ggml_tensor * src2 = dst->src[2];
|
13337
13750
|
|
13338
13751
|
assert(ggml_is_contiguous(dst));
|
13339
13752
|
assert(ggml_are_same_shape(src0, dst));
|
@@ -13359,8 +13772,8 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13359
13772
|
|
13360
13773
|
// TODO: is this supposed to be ceil instead of floor?
|
13361
13774
|
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
|
13362
|
-
const uint32_t
|
13363
|
-
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(
|
13775
|
+
const uint32_t n_head = ne02;
|
13776
|
+
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
13364
13777
|
|
13365
13778
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
13366
13779
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
@@ -13377,13 +13790,13 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13377
13790
|
|
13378
13791
|
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
13379
13792
|
|
13380
|
-
|
13381
|
-
ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
|
13382
|
-
float * pos_f32 = src2 ? (float *) src2->data : src0->data;
|
13383
|
-
|
13384
|
-
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
|
13793
|
+
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
|
13385
13794
|
|
13386
13795
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
13796
|
+
// ALiBi
|
13797
|
+
const uint32_t h = (i1/ne01)%ne02; // head
|
13798
|
+
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
|
13799
|
+
|
13387
13800
|
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
13388
13801
|
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
13389
13802
|
|
@@ -13396,27 +13809,11 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13396
13809
|
if (mp_f32) {
|
13397
13810
|
if (use_f16) {
|
13398
13811
|
for (int i = 0; i < nc; ++i) {
|
13399
|
-
wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
|
13400
|
-
}
|
13401
|
-
} else {
|
13402
|
-
for (int i = 0; i < nc; ++i) {
|
13403
|
-
wp[i] += mp_f32[i];
|
13404
|
-
}
|
13405
|
-
}
|
13406
|
-
}
|
13407
|
-
|
13408
|
-
// ALiBi bias
|
13409
|
-
if (max_bias > 0.0f) {
|
13410
|
-
const uint32_t h = (i1/ne01)%ne02; // head
|
13411
|
-
const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
|
13412
|
-
|
13413
|
-
if (use_f16) {
|
13414
|
-
for (int i = 0; i < nc; ++i) {
|
13415
|
-
wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
|
13812
|
+
wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
|
13416
13813
|
}
|
13417
13814
|
} else {
|
13418
13815
|
for (int i = 0; i < nc; ++i) {
|
13419
|
-
wp[i] += slope*
|
13816
|
+
wp[i] += slope*mp_f32[i];
|
13420
13817
|
}
|
13421
13818
|
}
|
13422
13819
|
}
|
@@ -13431,22 +13828,7 @@ static void ggml_compute_forward_soft_max_f32(
         float max = -INFINITY;
         ggml_vec_max_f32(nc, &max, wp);
 
-        ggml_float sum =
-
-        uint16_t scvt;
-        for (int i = 0; i < nc; i++) {
-            if (wp[i] == -INFINITY) {
-                dp[i] = 0.0f;
-            } else {
-                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-                sum += (ggml_float)val;
-                dp[i] = val;
-            }
-        }
-
+        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
         assert(sum > 0.0);
 
         sum = 1.0/sum;
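The call above replaces the open-coded f16 exp-table loop with ggml_vec_soft_max_f32, which, judging from its use here, writes the exponentials into dp and returns their sum. A minimal scalar sketch of that behaviour (illustrative only, not the library's vectorized implementation):

    #include <math.h>

    // Sketch: y[i] = exp(x[i] - max) for unmasked entries, 0 for -INFINITY entries;
    // returns the sum so the caller can finish normalizing with sum = 1/sum.
    static double soft_max_row_sketch(const int n, float * y, const float * x, const float max) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            const float val = (x[i] == -INFINITY) ? 0.0f : expf(x[i] - max);
            y[i] = val;
            sum += (double) val;
        }
        return sum;
    }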
@@ -13578,178 +13960,6 @@ static void ggml_compute_forward_soft_max_back(
     }
 }
 
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
-    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
-    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int64_t n  = ggml_nrows(src0);
-    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
-    const size_t nb0 = src0->nb[0];
-    const size_t nb1 = src0->nb[1];
-    const size_t nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int64_t k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int64_t i = 0; i < ne0; i++) {
-            for (int64_t j = 0; j < ne1; j++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n  = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int i = 0; i < ne0; i++) {
-            for (int j = 0; j < ne1; j++) {
-                ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float       *      pdst =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_alibi_f32(params, dst);
-            } break;
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_I64:
-        case GGML_TYPE_F64:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_clamp
 
 static void ggml_compute_forward_clamp_f32(
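The removed GGML_OP_ALIBI kernels above computed one bias slope per head: m_k = m0^(k+1) for the first n_heads_log2_floor heads and m_k = m1^(2*(k - n_heads_log2_floor) + 1) for the rest, with the bias added as i * m_k along the sequence dimension. A small standalone check of that slope schedule (the head count and max_bias are assumed values for illustration, not taken from this diff):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int   n_head   = 8;     // assumed head count for the example
        const float max_bias = 8.0f;  // assumed ALiBi max bias

        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
        const float m0 = powf(2.0f, -(max_bias)        / n_heads_log2_floor);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

        for (int k = 0; k < n_head; ++k) {
            const float m_k = k < n_heads_log2_floor
                ? powf(m0, k + 1)
                : powf(m1, 2 * (k - n_heads_log2_floor) + 1);
            printf("head %d: slope %g\n", k, m_k); // 0.5, 0.25, 0.125, ... for these values
        }
        return 0;
    }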
@@ -14972,25 +15182,28 @@ static void ggml_compute_forward_upscale_f32(
         return;
     }
 
-    GGML_ASSERT(src0->
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const int ith = params->ith;
     const int nth = params->nth;
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const
+    const float sf0 = (float)ne0/src0->ne[0];
+    const float sf1 = (float)ne1/src0->ne[1];
+    const float sf2 = (float)ne2/src0->ne[2];
+    const float sf3 = (float)ne3/src0->ne[3];
 
     // TODO: optimize
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3;
+        const int64_t i03 = i3 / sf3;
         for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2;
+            const int64_t i02 = i2 / sf2;
             for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 /
+                const int64_t i01 = i1 / sf1;
                 for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 /
+                    const int64_t i00 = i0 / sf0;
 
                     const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                     float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
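With the per-dimension scale factors introduced above, upscaling is a nearest-neighbour lookup: each destination index divides by its scale factor to find the source index. A tiny one-dimensional sketch under assumed sizes (4 source values upscaled to 8, so the factor is 2 and dst[i] reads src[i/2]):

    // Illustrative 1-D nearest-neighbour upscale matching the index math above.
    static void upscale_row_nearest(const float * src, int n_src, float * dst, int n_dst) {
        const float sf = (float) n_dst / n_src;   // e.g. 8/4 = 2
        for (int i = 0; i < n_dst; ++i) {
            dst[i] = src[(int) (i / sf)];         // reads 0,0,1,1,2,2,3,3 for the example
        }
    }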
@@ -15020,6 +15233,7 @@ static void ggml_compute_forward_upscale(
     }
 }
 
+
 // ggml_compute_forward_pad
 
 static void ggml_compute_forward_pad_f32(
@@ -15373,37 +15587,7 @@ static void ggml_compute_forward_flash_attn_f32(
                 vvexpf(S, S, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, S);
 #else
-
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SS[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, S, S, max);
 #endif
             }
 
@@ -15585,28 +15769,7 @@ static void ggml_compute_forward_flash_attn_f16(
                 vvexpf(S, S, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, S);
 #else
-
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, S, S, max);
 #endif
             }
 
@@ -15763,8 +15926,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
-    float scale
-
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
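scale and max_bias are stored as raw float bit patterns inside the int32 op_params array, so the kernel reads them back with memcpy rather than a pointer cast. A hedged sketch of the matching writer side (the helper name is invented for illustration; in ggml the values are written when the flash-attention node is created):

    #include <stdint.h>
    #include <string.h>

    // Hypothetical helper: stores two floats where the kernel above expects them.
    static void set_flash_attn_params(int32_t * op_params, float scale, float max_bias) {
        memcpy((float *) op_params + 0, &scale,    sizeof(float));
        memcpy((float *) op_params + 1, &max_bias, sizeof(float));
    }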
@@ -15773,6 +15945,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
+        const uint32_t h = iq2; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float S = 0.0f;
         float M = -INFINITY;
 
@@ -15796,7 +15971,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         // loop over n_kv and n_head_kv
         // ref: https://arxiv.org/pdf/2112.05682.pdf
         for (int64_t ic = 0; ic < nek1; ++ic) {
-            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
             if (mv == -INFINITY) {
                 continue;
             }
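Scaling the mask value by the per-head slope is what folds ALiBi into the flash-attention path: the mask tensor already carries the per-position term, and multiplying it by the head's slope yields the bias without a separate GGML_OP_ALIBI node. A scalar sketch of the biased score for one key position (illustrative; the real kernel works on vectors and keeps running max/sum state):

    #include <math.h>

    // s = scale * (q . k) + slope * mask; a -INFINITY mask value means "skip".
    static float biased_score(float qk_dot, float scale, float slope, float mask_val) {
        const float mv = slope * mask_val;
        if (mv == -INFINITY) {
            return -INFINITY;  // caller skips this key, as the loop above does
        }
        return scale * qk_dot + mv;
    }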
@@ -15867,7 +16042,7 @@ static void ggml_compute_forward_flash_attn_ext(
         const struct ggml_tensor * v,
         const struct ggml_tensor * mask,
         struct ggml_tensor * dst) {
-    switch (dst->op_params[
+    switch (dst->op_params[2]) {
         case GGML_PREC_DEFAULT:
         case GGML_PREC_F32:
             {
@@ -16221,38 +16396,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                     vvexpf(SM, SM, &Mup);
                     ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-
-                    ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                    for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                        if (i >= masked_begin) {
-                            break;
-                        }
-                        float * SR =  S + i;
-                        float * SW = SM + i;
-
-                        for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                            if (i + j >= masked_begin) {
-                                break;
-                            } else if (SR[j] == -INFINITY) {
-                                SW[j] = 0.0f;
-                            } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                                const float val = expf(SR[j] - max);
-#else
-                                ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
-                                memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                                sump[j] += (ggml_float)val;
-                                SW[j] = val;
-                            }
-                        }
-                    }
-
-                    for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                        sum += sump[i];
-                    }
+                    sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
 #endif
                 }
 
@@ -16834,6 +16978,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_relu(params, dst);
             } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
         case GGML_UNARY_OP_GELU:
             {
                 ggml_compute_forward_gelu(params, dst);
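The new GGML_UNARY_OP_SIGMOID case dispatches to ggml_compute_forward_sigmoid, which is presumably added elsewhere in this file (outside the hunks shown here). As a reference point only, a scalar sigmoid row kernel computes:

    #include <math.h>

    // y[i] = 1 / (1 + exp(-x[i])) over one contiguous row (illustrative sketch).
    static void sigmoid_row_sketch(const int n, float * y, const float * x) {
        for (int i = 0; i < n; ++i) {
            y[i] = 1.0f / (1.0f + expf(-x[i]));
        }
    }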
@@ -17274,35 +17422,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         assert(!isnan(s1[i]));
     }
 #endif
-    // soft_max
-    ggml_float sum = 0.0;
-    {
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, s0);
 
-
-
-
-
-
-
-                const float s = s0[i] - max;
-                const float val = expf(s);
-#else
-                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                sum += (ggml_float)val;
-                st[i] = val;
-            }
-        }
+    // soft_max
+    float max = -INFINITY;
+    ggml_vec_max_f32(nc, &max, s0);
+    ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max);
+    assert(sum > 0.0);
+    sum = (1.0 - eps) / sum;
 
-        assert(sum > 0.0);
-        // sum = 1.0/sum;
-    }
     // avoid log(0) by rescaling from [0..1] to [eps..1]
-    sum = (1.0 - eps) / sum;
     ggml_vec_scale_f32(nc, st, sum);
     ggml_vec_add1_f32(nc, st, st, eps);
     ggml_vec_log_f32(nc, st, st);
@@ -17392,32 +17520,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 #endif
 
     // soft_max
-
-
-
-
-
-        uint16_t scvt; UNUSED(scvt);
-        for (int i = 0; i < nc; i++) {
-            if (s0[i] == -INFINITY) {
-                ds0[i] = 0.0f;
-            } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                const float s = s0[i] - max;
-                const float val = expf(s);
-#else
-                ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                sum += (ggml_float)val;
-                ds0[i] = val;
-            }
-        }
-
-        assert(sum > 0.0);
-        sum = (1.0 - eps)/sum;
-    }
+    float max = -INFINITY;
+    ggml_vec_max_f32(nc, &max, s0);
+    ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
+    assert(sum > 0.0);
+    sum = (1.0 - eps) / sum;
 
     // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
     ggml_vec_scale_f32(nc, ds0, sum);
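Both the forward loss and its backward pass now obtain the soft-max through the same vectorized helper. For reference, the gradient the backward kernel computes, matching the comment in the hunk above, is

\[ \nabla_{s_0} = \frac{\big(\mathrm{softmax}(s_0) - s_1\big) \cdot g}{n_r} \]

where \(s_1\) is the target distribution, \(g\) is the incoming gradient of the scalar loss, and \(n_r\) is the number of rows the loss is averaged over.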
@@ -17454,7 +17561,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 
 /////////////////////////////////
 
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
     GGML_ASSERT(params);
 
     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
@@ -17552,7 +17659,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor);
+                ggml_compute_forward_mul_mat(params, tensor, state);
             } break;
         case GGML_OP_MUL_MAT_ID:
            {
@@ -17630,10 +17737,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope_back(params, tensor);
             } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor);
-            } break;
         case GGML_OP_CLAMP:
             {
                 ggml_compute_forward_clamp(params, tensor);
@@ -18652,10 +18755,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
         case GGML_OP_CLAMP:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -18826,6 +18925,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_UNARY_OP_GELU:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -19172,8 +19275,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
@@ -19199,8 +19300,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
@@ -19280,31 +19379,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan  * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
-
-    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-    enum ggml_status ec;
-};
-
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
     int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
     int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
@@ -19355,6 +19429,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
         case GGML_UNARY_OP_TANH:
         case GGML_UNARY_OP_ELU:
         case GGML_UNARY_OP_RELU:
+        case GGML_UNARY_OP_SIGMOID:
         case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
         case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
             {
@@ -19428,10 +19503,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
         case GGML_OP_CLAMP:
             {
                 n_tasks = 1; //TODO
@@ -19580,6 +19651,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
         * node_n = atomic_load(&state->shared->node_n);
         if (* node_n != last_node_n) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning.  It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
@@ -19594,6 +19669,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
 
         * task_phase = atomic_load(&state->shared->node_task);
         if (* task_phase != last_task_phase) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning.  It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
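_mm_pause() emits the x86 PAUSE instruction (available through the <immintrin.h> intrinsics headers); it tells the core that the thread is busy-waiting so it can back off, saving power and avoiding the memory-order mis-speculation penalty when the spin loop exits. A minimal standalone sketch of the same pattern:

    #include <stdatomic.h>
    #if defined(__SSE3__)
    #include <immintrin.h>
    #endif

    // Spin until *flag becomes non-zero, hinting the CPU on each iteration.
    static void spin_until_set(const atomic_int * flag) {
        while (atomic_load(flag) == 0) {
    #if defined(__SSE3__)
            _mm_pause();
    #endif
        }
    }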
@@ -19633,7 +19712,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             struct ggml_tensor * node = cgraph->nodes[node_n];
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
             ggml_graph_compute_perf_stats_node(node, state->shared);
         }
@@ -19653,17 +19732,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /* INIT */
             if (GGML_OP_HAS_INIT[node->op]) {
                 params.type = GGML_TASK_TYPE_INIT;
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
 
             // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
             // they do something more efficient than spinning (?)
             params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
 
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.type = GGML_TASK_TYPE_FINALIZE;
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
 
             ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -19702,7 +19781,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             if (GGML_OP_HAS_INIT[node->op]) {
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
         }
 
@@ -19723,7 +19802,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
         }
 
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -19974,6 +20053,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.node_task              =*/ GGML_TASK_TYPE_FINALIZE,
         /*.abort_callback         =*/ NULL,
         /*.abort_callback_data    =*/ NULL,
+        /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
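The current_chunk field initialised above is a shared counter in ggml_compute_state_shared; together with the per-thread state now passed into ggml_compute_forward (and on to the MUL_MAT kernel), it appears intended to let worker threads draw matrix-multiplication chunks dynamically rather than using a fixed split. A hedged sketch of how such a counter is typically consumed (illustrative only, not the exact loop in ggml_compute_forward_mul_mat):

    #include <stdatomic.h>

    // Each worker repeatedly claims the next unprocessed chunk until none are left.
    static void process_chunks(atomic_int * current_chunk, int n_chunks,
                               void (*process)(int chunk, void * ctx), void * ctx) {
        for (;;) {
            const int chunk = atomic_fetch_add(current_chunk, 1);
            if (chunk >= n_chunks) {
                break;
            }
            process(chunk, ctx);
        }
    }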