llama_cpp 0.15.1 → 0.15.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +3 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +15 -7
- data/vendor/tmp/llama.cpp/ggml-impl.h +7 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +114 -125
- data/vendor/tmp/llama.cpp/ggml-metal.metal +86 -109
- data/vendor/tmp/llama.cpp/ggml-quants.c +2202 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +24 -143
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +4 -2
- data/vendor/tmp/llama.cpp/ggml.c +726 -646
- data/vendor/tmp/llama.cpp/ggml.h +28 -17
- data/vendor/tmp/llama.cpp/llama.cpp +478 -281
- data/vendor/tmp/llama.cpp/llama.h +3 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +4 -2
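
The bulk of this release is the vendored llama.cpp bump, which reworks a few public ggml entry points: `ggml_sigmoid`/`ggml_sigmoid_inplace` are new unary ops, `ggml_soft_max_ext` drops its `pos` tensor argument (ALiBi is now driven by `max_bias` and the mask), `ggml_flash_attn_ext` gains a trailing `max_bias` parameter, and `ggml_upscale_ext` is added alongside `ggml_upscale`. The sketch below is illustrative only — it is not shipped with the gem, and the tensor shapes, memory size, and thread count are arbitrary — and shows how the updated C API visible in the diff below would be called:

```c
#include "ggml.h"

int main(void) {
    // small scratch context; size chosen arbitrarily for this sketch
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8);
    struct ggml_tensor * sig = ggml_sigmoid(ctx, a);                 // new unary op in this update
    struct ggml_tensor * sm  = ggml_soft_max_ext(ctx, a, /*mask=*/NULL,
                                                 /*scale=*/1.0f, /*max_bias=*/0.0f); // no `pos` argument anymore

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, sig);
    ggml_build_forward_expand(gf, sm);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    ggml_free(ctx);
    return 0;
}
```

The removed `ggml_alibi`/`GGML_OP_ALIBI` path is replaced by this mask-slope handling inside `ggml_soft_max_ext` and `ggml_flash_attn_ext`, as the ggml.c diff below shows.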
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -4,7 +4,6 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
-#include "sgemm.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,6 +36,10 @@
 #undef GGML_USE_LLAMAFILE
 #endif

+#ifdef GGML_USE_LLAMAFILE
+#include "sgemm.h"
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -109,6 +112,8 @@ typedef void * thread_ret_t;

 #endif

+typedef pthread_t ggml_thread_t;
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
@@ -160,9 +165,6 @@ void ggml_print_backtrace(void) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
-#define GGML_SILU_FP16
-// #define GGML_CROSS_ENTROPY_EXP_FP16
-// #define GGML_FLASH_ATTN_EXP_FP16

 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL 2
@@ -313,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
 // precomputed quick gelu table for f16 (128 KB)
 static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];

-// precomputed silu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_silu_f16[1 << 16];
-
-// precomputed exp table for f16 (128 KB)
-static ggml_fp16_t ggml_table_exp_f16[1 << 16];
-
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];

@@ -1303,6 +1299,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
 #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
 #define GGML_F16_VEC_FMA GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
 // Use vec_xl, not vec_ld, in case the load address is not aligned.
 #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
@@ -1534,6 +1532,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif

+//
+// ggml context
+//
+
+struct ggml_context {
+    size_t mem_size;
+    void* mem_buffer;
+    bool mem_buffer_owned;
+    bool no_alloc;
+    bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
+
+    int n_objects;
+
+    struct ggml_object* objects_begin;
+    struct ggml_object* objects_end;
+
+    struct ggml_scratch scratch;
+    struct ggml_scratch scratch_save;
+};
+
+struct ggml_context_container {
+    bool used;
+
+    struct ggml_context context;
+};
+
+struct ggml_compute_state_shared {
+    const struct ggml_cgraph* cgraph;
+    const struct ggml_cplan* cplan;
+
+    int64_t perf_node_start_cycles;
+    int64_t perf_node_start_time_us;
+
+    const int n_threads;
+
+    // synchronization primitives
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
+
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
+    void* abort_callback_data;
+
+    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+};
+
+struct ggml_compute_state {
+    ggml_thread_t thrd;
+    int ith;
+    struct ggml_compute_state_shared* shared;
+    enum ggml_status ec;
+};
+
 //
 // fundamental operations
 //
@@ -1949,6 +2000,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
@@ -2024,52 +2076,291 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }

-
-// const uint16_t * i16 = (const uint16_t *) x;
-// for (int i = 0; i < n; ++i) {
-// y[i] = ggml_table_silu_f16[i16[i]];
-// }
-//}
+#if defined(__ARM_NEON)

-
-
-
-
-
-
-
-
-
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static float32x4_t ggml_v_expf(float32x4_t x) {
+    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+    const float32x4_t n = vsubq_f32(z, r);
+    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
+        vdupq_n_f32(0x1.7f7d1cp-20f));
+    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
+    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+    const float32x4_t u = vmulq_f32(b, b);
+    const float32x4_t j = vfmaq_f32(
+        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+            vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
+    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+        return vfmaq_f32(k, j, k);
+    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+        vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static float32x4_t ggml_v_silu(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t neg_x = vsubq_f32(zero, x);
+    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
+    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+    return vdivq_f32(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m512 ggml_v_expf(__m512 x) {
+    const __m512 r = _mm512_set1_ps(0x1.8p23f);
+    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
+    const __m512 n = _mm512_sub_ps(z, r);
+    const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+        _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
+    const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
+    const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
+    const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
+    const __m512 u = _mm512_mul_ps(b, b);
+    const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+        _mm512_set1_ps(0x1.573e2ep-5f)), u,
+        _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+        _mm512_set1_ps(0x1.fffdb6p-2f))),
+        u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
+    if (_mm512_kortestz(c, c))
+        return _mm512_fmadd_ps(j, k, k);
+    const __m512i g = _mm512_and_si512(
+        _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
+        _mm512_set1_epi32(0x82000000u));
+    const __m512 s1 =
+        _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
+    const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+    const __mmask16 d =
+        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
+    return _mm512_mask_blend_ps(
+        d, _mm512_mask_blend_ps(
+            c, _mm512_fmadd_ps(k, j, k),
+            _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
+        _mm512_mul_ps(s1, s1));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m512 ggml_v_silu(__m512 x) {
+    const __m512 one = _mm512_set1_ps(1);
+    const __m512 zero = _mm512_setzero_ps();
+    const __m512 neg_x = _mm512_sub_ps(zero, x);
+    const __m512 exp_neg_x = ggml_v_expf(neg_x);
+    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+    return _mm512_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX2__) && defined(__FMA__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m256 ggml_v_expf(__m256 x) {
+    const __m256 r = _mm256_set1_ps(0x1.8p23f);
+    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
+    const __m256 n = _mm256_sub_ps(z, r);
+    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
+        _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
+    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
+    const __m256 k = _mm256_castsi256_ps(
+        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
+    const __m256i c = _mm256_castps_si256(
+        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+            _mm256_set1_ps(126), _CMP_GT_OQ));
+    const __m256 u = _mm256_mul_ps(b, b);
+    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
+        _mm256_set1_ps(0x1.573e2ep-5f)), u,
+        _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
+        _mm256_set1_ps(0x1.fffdb6p-2f))),
+        u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
+        return _mm256_fmadd_ps(j, k, k);
+    const __m256i g = _mm256_and_si256(
+        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
+        _mm256_set1_epi32(0x82000000u));
+    const __m256 s1 =
+        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
+    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
+    const __m256i d = _mm256_castps_si256(
+        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+            _mm256_set1_ps(192), _CMP_GT_OQ));
+    return _mm256_or_ps(
+        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
+        _mm256_andnot_ps(
+            _mm256_castsi256_ps(d),
+            _mm256_or_ps(
+                _mm256_and_ps(_mm256_castsi256_ps(c),
+                    _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
+                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m256 ggml_v_silu(__m256 x) {
+    const __m256 one = _mm256_set1_ps(1);
+    const __m256 zero = _mm256_setzero_ps();
+    const __m256 neg_x = _mm256_sub_ps(zero, x);
+    const __m256 exp_neg_x = ggml_v_expf(neg_x);
+    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+    return _mm256_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
+
+#if defined(__FMA__)
+#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
 #else
-
-
+#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
+#endif
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m128 ggml_v_expf(__m128 x) {
+    const __m128 r = _mm_set1_ps(0x1.8p23f);
+    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+    const __m128 n = _mm_sub_ps(z, r);
+    const __m128 b =
+        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+    const __m128i c =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+    const __m128 u = _mm_mul_ps(b, b);
+    const __m128 j =
+        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+            MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+            u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm_movemask_epi8(c))
+        return MADD128(j, k, k);
+    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+        _mm_set1_epi32(0x82000000u));
+    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+    const __m128i d =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+    return _mm_or_ps(
+        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+        _mm_andnot_ps(_mm_castsi128_ps(d),
+            _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__
+
+static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
+    }
+#endif
+    for (; i < n; ++i) {
         y[i] = ggml_silu_f32(x[i]);
     }
 }
+
+static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
+    int i = 0;
+    ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+            _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+            _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+            _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+            vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
 #endif
+    for (; i < n; ++i) {
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}

 inline static float ggml_silu_backward_f32(float x, float dy) {
     const float s = 1.0f/(1.0f + expf(-x));
     return dy*s*(1.0f + x*(1.0f - s));
 }

-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        // we did not use x[i] to compute forward silu but its f16 equivalent
-        // take derivative at f16 of x[i]:
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        float usedx = GGML_FP16_TO_FP32(fp16);
-        dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
-    }
-}
-#else
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
     for (int i = 0; i < n; ++i) {
         dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
     }
 }
-#endif

 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
@@ -2185,7 +2476,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "SOFT_MAX_BACK",
     "ROPE",
     "ROPE_BACK",
-    "ALIBI",
     "CLAMP",
     "CONV_TRANSPOSE_1D",
     "IM2COL",
@@ -2227,7 +2517,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2276,7 +2566,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "soft_max_back(x)",
     "rope(x)",
     "rope_back(x)",
-    "alibi(x)",
     "clamp(x)",
     "conv_transpose_1d(x)",
     "im2col(x)",
@@ -2318,7 +2607,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT ==
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -2331,6 +2620,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "TANH",
     "ELU",
     "RELU",
+    "SIGMOID",
     "GELU",
     "GELU_QUICK",
     "SILU",
@@ -2338,7 +2628,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "HARDSIGMOID",
 };

-static_assert(GGML_UNARY_OP_COUNT ==
+static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");


 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2380,32 +2670,6 @@ static void ggml_setup_op_has_task_pass(void) {
     }
 }

-//
-// ggml context
-//
-
-struct ggml_context {
-    size_t mem_size;
-    void * mem_buffer;
-    bool mem_buffer_owned;
-    bool no_alloc;
-    bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
-
-    int n_objects;
-
-    struct ggml_object * objects_begin;
-    struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
-};
-
-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
 //
 // NUMA support
 //
@@ -2819,6 +3083,16 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
         (t0->ne[3] == t1->ne[3] );
 }

+bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        (t0->nb[0] == t1->nb[0] ) &&
+        (t0->nb[1] == t1->nb[1] ) &&
+        (t0->nb[2] == t1->nb[2] ) &&
+        (t0->nb[3] == t1->nb[3] );
+}
+
 // check if t1 can be represented as a repeatition of t0
 static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -2878,8 +3152,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
             ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
             ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-            ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-            ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
         }

         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -3163,6 +3435,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(

     struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);

+#ifdef __clang__
+    // temporary until ggml_tensor::backend is removed
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
     *result = (struct ggml_tensor) {
         /*.type =*/ type,
         /*.backend =*/ GGML_BACKEND_TYPE_CPU,
@@ -3185,6 +3463,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.padding =*/ { 0 },
     };

+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
     //ggml_assert_aligned(result->data);

@@ -4563,6 +4845,20 @@ struct ggml_tensor * ggml_leaky_relu(
     return result;
 }

+// ggml_sigmoid
+
+struct ggml_tensor * ggml_sigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
+struct ggml_tensor * ggml_sigmoid_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
 // ggml_gelu

 struct ggml_tensor * ggml_gelu(
@@ -5646,7 +5942,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * mask,
-        struct ggml_tensor * pos,
         float scale,
         float max_bias,
         bool inplace) {
@@ -5660,18 +5955,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }

-    if (pos) {
-        GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
-        GGML_ASSERT(pos->ne[0] == a->ne[0]);
-    }
-
-    if (pos && mask) {
-        GGML_ASSERT(pos->type == mask->type);
-    }
-
     if (max_bias > 0.0f) {
-        GGML_ASSERT(
+        GGML_ASSERT(mask);
     }

     bool is_node = false;
@@ -5689,7 +5974,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = mask;
-    result->src[2] = pos;

     return result;
 }
@@ -5697,23 +5981,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, NULL,
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
 }

 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a) {
-    return ggml_soft_max_impl(ctx, a, NULL,
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
 }

 struct ggml_tensor * ggml_soft_max_ext(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * mask,
-        struct ggml_tensor * pos,
         float scale,
         float max_bias) {
-    return ggml_soft_max_impl(ctx, a, mask,
+    return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }

 // ggml_soft_max_back
@@ -5928,37 +6211,6 @@ struct ggml_tensor * ggml_rope_back(
     return result;
 }

-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        int n_past,
-        int n_head,
-        float bias_max) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    int32_t op_params[3] = { n_past, n_head };
-    memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, op_params, sizeof(op_params));
-
-    result->op = GGML_OP_ALIBI;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 // ggml_clamp

 struct ggml_tensor * ggml_clamp(
@@ -6308,7 +6560,10 @@ struct ggml_tensor * ggml_pool_2d(
 static struct ggml_tensor * ggml_upscale_impl(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
-    int
+    int ne0,
+    int ne1,
+    int ne2,
+    int ne3) {
     bool is_node = false;

     if (a->grad) {
@@ -6316,19 +6571,45 @@ static struct ggml_tensor * ggml_upscale_impl(
         is_node = true;
     }

+    GGML_ASSERT(a->ne[0] <= ne0);
+    GGML_ASSERT(a->ne[1] <= ne1);
+    GGML_ASSERT(a->ne[2] <= ne2);
+    GGML_ASSERT(a->ne[3] <= ne3);
+
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-
-
-
+            ne0,
+            ne1,
+            ne2,
+            ne3
+            );

     result->op = GGML_OP_UPSCALE;
-
+
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;

     return result;
 }

+struct ggml_tensor * ggml_upscale(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int scale_factor) {
+    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
+}
+
+struct ggml_tensor * ggml_upscale_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3) {
+    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
+}
+
+// ggml_pad
+
 struct ggml_tensor * ggml_pad(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
@@ -6353,12 +6634,7 @@ struct ggml_tensor * ggml_pad(
     return result;
 }

-
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        int scale_factor) {
-    return ggml_upscale_impl(ctx, a, scale_factor);
-}
+// ggml_arange

 struct ggml_tensor * ggml_arange(
     struct ggml_context * ctx,
@@ -6380,6 +6656,8 @@ struct ggml_tensor * ggml_arange(
     return result;
 }

+// ggml_timestep_embedding
+
 struct ggml_tensor * ggml_timestep_embedding(
         struct ggml_context * ctx,
         struct ggml_tensor * timesteps,
@@ -6486,9 +6764,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
         struct ggml_tensor * k,
         struct ggml_tensor * v,
         struct ggml_tensor * mask,
-        float scale
+        float scale,
+        float max_bias) {
     GGML_ASSERT(ggml_can_mul_mat(k, q));
     // TODO: check if vT can be multiplied by (k*qT)
+
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(mask->ne[2] == 1);
@@ -6498,6 +6778,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
         //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
     }

+    if (max_bias > 0.0f) {
+        GGML_ASSERT(mask);
+    }
+
     bool is_node = false;

     if (q->grad || k->grad || v->grad) {
@@ -6508,7 +6792,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
     int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

-    float params[] = { scale };
+    float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));

     result->op = GGML_OP_FLASH_ATTN_EXT;
@@ -6528,7 +6812,7 @@ void ggml_flash_attn_ext_set_prec(

     const int32_t prec_i32 = (int32_t) prec;

-    ggml_set_op_params_i32(a,
+    ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
 }

 // ggml_flash_ff
@@ -10892,6 +11176,52 @@ static void ggml_compute_forward_relu(
     }
 }

+// ggml_compute_forward_sigmoid
+
+static void ggml_compute_forward_sigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sigmoid_f32(nc,
+                (float *) ((char *) dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_gelu

 static void ggml_compute_forward_gelu_f32(
@@ -11742,48 +12072,139 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
 }
 #endif

-static void
-
-
+static void ggml_compute_forward_mul_mat_one_chunk(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const int64_t num_rows_per_vec_dot,
+    const int64_t ir0_start,
+    const int64_t ir0_end,
+    const int64_t ir1_start,
+    const int64_t ir1_end) {

     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
     GGML_TENSOR_BINARY_OP_LOCALS

-    const int ith = params->ith;
-    const int nth = params->nth;
-
     const enum ggml_type type = src0->type;

     const bool src1_cont = ggml_is_contiguous(src1);

-    ggml_vec_dot_t const vec_dot
-    enum ggml_type const vec_dot_type
-    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
-    int64_t const vec_dot_num_rows = type_traits[type].nrows;
+    ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
+    enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;

-
-
-
-    GGML_ASSERT(ne3 == ne13);
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;

-    //
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+    //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);

-    //
-
-
-
-
+    // threads with no work simply yield (not sure if it helps)
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
+                const int64_t i13 = (ir1 / (ne12 * ne1));
+                const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                // the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                }
+
+                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
+                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_mul_mat(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst,
+        struct ggml_compute_state * state) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const enum ggml_type type = src0->type;
+
+    enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
+    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
+    int64_t const vec_dot_num_rows = type_traits[type].nrows;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);

     // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+    UNUSED(r2);
+    UNUSED(r3);

     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
@@ -11865,6 +12286,8 @@ static void ggml_compute_forward_mul_mat(
 #endif

 #if GGML_USE_LLAMAFILE
+    const bool src1_cont = ggml_is_contiguous(src1);
+
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
@@ -11890,6 +12313,8 @@ UseGgmlGemm1:;
         if (ith != 0) {
             return;
         }
+        // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
+        atomic_store(&state->shared->current_chunk, nth);
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11914,11 +12339,11 @@ UseGgmlGemm1:;
         return;
     }

-    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
+        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -11939,98 +12364,87 @@ UseGgmlGemm2:;
 UseGgmlGemm2:;
 #endif

-
-
-
-
-
-    // distribute the thread work across the inner or outer loop based on which one is larger
-
-    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-    const int64_t ith0 = ith % nth0;
-    const int64_t ith1 = ith / nth0;
-
-    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-    const int64_t ir010 = dr0*ith0;
-    const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-    const int64_t ir110 = dr1*ith1;
-    const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
-    // threads with no work simply yield (not sure if it helps)
-    if (ir010 >= ir011 || ir110 >= ir111) {
-        sched_yield();
-        return;
-    }
+#ifdef GGML_PERF
+    int chunks_executed = 0;
+    UNUSED(chunks_executed);
+#endif

-
-
+    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
+    const int64_t nr0 = ne0;

-    //
-    const int64_t
-    const int64_t blck_1 = 16;
+    // This is the size of the rest of the dimensions of the result
+    const int64_t nr1 = ne1 * ne2 * ne3;

     // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t
+    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
     // TODO: currently the mmla kernels support only even numbered rows/cols.
     // this check can be removed once they are extended to support odd numbered rows/cols too
     if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-
+        num_rows_per_vec_dot = 1;
     }

-
+    // Now select a reasonable chunk size.
+    int chunk_size = 16;

-    //
-
-
+    // We need to step up the size if it's small
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }

-
-
-
-
-
-    const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
+    // distribute the work across the inner or outer loop based on which one is larger
+    // The number of chunks in the 0/1 dim.
+    // CEIL(nr0/chunk_size)
+    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

-
-
-
+    // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
+    // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
+    // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
+    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
+        // distribute the thread work across the inner or outer loop based on which one is larger
+        nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+    }

-
-
-
+    // The number of elements in each chunk
+    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

-
+    //if (ith == 0)
+    //    printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);

-
-
-            // the original src1 data pointer, so we should index using the indices directly
-            // TODO: this is a bit of a hack, we should probably have a better way to handle this
-            const char * src1_col = (const char *) wdata +
-                (src1_cont || src1->type != vec_dot_type
-                    ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
-                    : (i11*nb11 + i12*nb12 + i13*nb13));
-            float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;

-
-
-
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int64_t ith0 = current_chunk % nchunk0;
+        const int64_t ith1 = current_chunk / nchunk0;

-
-
-            }
+        const int64_t ir0_start = dr0 * ith0;
+        const int64_t ir0_end = MIN(ir0_start + dr0, nr0);

-
-
-
-
+        const int64_t ir1_start = dr1 * ith1;
+        const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
+
+        ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+
+#ifdef GGML_PERF
+        chunks_executed++;
+#endif
+
+        if (nth >= nchunk0 * nchunk1) {
+            break;
         }
+
+        current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
     }
+
+#ifdef GGML_PERF
+    // These numbers are useful when trying to measure how well the threading scheduling works.
+    //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
+    //float time = (ggml_perf_time_us() - t0);
+    //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
+#endif
 }

 // ggml_compute_forward_mul_mat_id
@@ -13333,7 +13747,6 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13333
13747
|
|
13334
13748
|
const struct ggml_tensor * src0 = dst->src[0];
|
13335
13749
|
const struct ggml_tensor * src1 = dst->src[1];
|
13336
|
-
const struct ggml_tensor * src2 = dst->src[2];
|
13337
13750
|
|
13338
13751
|
assert(ggml_is_contiguous(dst));
|
13339
13752
|
assert(ggml_are_same_shape(src0, dst));
|
@@ -13359,8 +13772,8 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13359
13772
|
|
13360
13773
|
// TODO: is this supposed to be ceil instead of floor?
|
13361
13774
|
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
|
13362
|
-
const uint32_t
|
13363
|
-
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(
|
13775
|
+
const uint32_t n_head = ne02;
|
13776
|
+
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
13364
13777
|
|
13365
13778
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
13366
13779
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
@@ -13377,13 +13790,13 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13377
13790
|
|
13378
13791
|
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
13379
13792
|
|
13380
|
-
|
13381
|
-
ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
|
13382
|
-
float * pos_f32 = src2 ? (float *) src2->data : src0->data;
|
13383
|
-
|
13384
|
-
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
|
13793
|
+
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
|
13385
13794
|
|
13386
13795
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
13796
|
+
// ALiBi
|
13797
|
+
const uint32_t h = (i1/ne01)%ne02; // head
|
13798
|
+
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
|
13799
|
+
|
13387
13800
|
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
13388
13801
|
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
13389
13802
|
|
@@ -13396,27 +13809,11 @@ static void ggml_compute_forward_soft_max_f32(
|
|
13396
13809
|
if (mp_f32) {
|
13397
13810
|
if (use_f16) {
|
13398
13811
|
for (int i = 0; i < nc; ++i) {
|
13399
|
-
wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
|
13400
|
-
}
|
13401
|
-
} else {
|
13402
|
-
for (int i = 0; i < nc; ++i) {
|
13403
|
-
wp[i] += mp_f32[i];
|
13404
|
-
}
|
13405
|
-
}
|
13406
|
-
}
|
13407
|
-
|
13408
|
-
// ALiBi bias
|
13409
|
-
if (max_bias > 0.0f) {
|
13410
|
-
const uint32_t h = (i1/ne01)%ne02; // head
|
13411
|
-
const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
|
13412
|
-
|
13413
|
-
if (use_f16) {
|
13414
|
-
for (int i = 0; i < nc; ++i) {
|
13415
|
-
wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
|
13812
|
+
wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
|
13416
13813
|
}
|
13417
13814
|
} else {
|
13418
13815
|
for (int i = 0; i < nc; ++i) {
|
13419
|
-
wp[i] += slope*
|
13816
|
+
wp[i] += slope*mp_f32[i];
|
13420
13817
|
}
|
13421
13818
|
}
|
13422
13819
|
}
|
@@ -13431,22 +13828,7 @@ static void ggml_compute_forward_soft_max_f32(
         float max = -INFINITY;
         ggml_vec_max_f32(nc, &max, wp);
 
-        ggml_float sum = 0.0;
-
-        uint16_t scvt;
-        for (int i = 0; i < nc; i++) {
-            if (wp[i] == -INFINITY) {
-                dp[i] = 0.0f;
-            } else {
-                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-                sum += (ggml_float)val;
-                dp[i] = val;
-            }
-        }
-
+        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
         assert(sum > 0.0);
 
         sum = 1.0/sum;
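Note: this and several later hunks replace hand-unrolled exp loops (including the FP16 ggml_table_exp_f16 lookup) with a single call to ggml_vec_soft_max_f32. A plain scalar sketch of the contract that call satisfies — write exp(x[i] - max) into the output and return the accumulated sum — is shown below; the real ggml routine is vectorized, so treat this only as an illustration:

    #include <math.h>

    typedef double ggml_float_sketch; // stand-in for ggml_float in this sketch

    // y[i] = expf(x[i] - max); returns the sum of the exponentials.
    static ggml_float_sketch vec_soft_max_f32_sketch(const int n, float * y, const float * x, float max) {
        ggml_float_sketch sum = 0.0;
        for (int i = 0; i < n; ++i) {
            const float val = expf(x[i] - max);
            sum += (ggml_float_sketch) val;
            y[i] = val;
        }
        return sum;
    }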
@@ -13578,178 +13960,6 @@ static void ggml_compute_forward_soft_max_back(
     }
 }
 
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
-    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
-    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int64_t n = ggml_nrows(src0);
-    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
-    const size_t nb0 = src0->nb[0];
-    const size_t nb1 = src0->nb[1];
-    const size_t nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int64_t k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int64_t i = 0; i < ne0; i++) {
-            for (int64_t j = 0; j < ne1; j++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int i = 0; i < ne0; i++) {
-            for (int j = 0; j < ne1; j++) {
-                ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_alibi_f32(params, dst);
-            } break;
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_I64:
-        case GGML_TYPE_F64:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_clamp
 
 static void ggml_compute_forward_clamp_f32(
@@ -14972,25 +15182,28 @@ static void ggml_compute_forward_upscale_f32(
         return;
     }
 
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const int ith = params->ith;
     const int nth = params->nth;
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)ne0/src0->ne[0];
+    const float sf1 = (float)ne1/src0->ne[1];
+    const float sf2 = (float)ne2/src0->ne[2];
+    const float sf3 = (float)ne3/src0->ne[3];
 
     // TODO: optimize
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3;
+        const int64_t i03 = i3 / sf3;
         for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2;
+            const int64_t i02 = i2 / sf2;
             for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / scale_factor;
+                const int64_t i01 = i1 / sf1;
                 for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / scale_factor;
+                    const int64_t i00 = i0 / sf0;
 
                     const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                     float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
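Note: upscale now derives one scale factor per dimension from the destination/source shape ratio instead of a single integer op_param, and maps each destination index back to a source index by nearest-neighbor division. A small illustrative sketch of that per-axis mapping (the helper name is an assumption, not ggml API):

    #include <stdint.h>

    // Map a destination index to a source index for one axis,
    // matching "const int64_t i01 = i1 / sf1" in the hunk above.
    static int64_t upscale_src_index(int64_t i_dst, int64_t ne_dst, int64_t ne_src) {
        const float sf = (float) ne_dst / ne_src; // per-axis scale factor
        return (int64_t) (i_dst / sf);            // nearest-neighbor (floor) mapping
    }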
@@ -15020,6 +15233,7 @@ static void ggml_compute_forward_upscale(
     }
 }
 
+
 // ggml_compute_forward_pad
 
 static void ggml_compute_forward_pad_f32(
@@ -15373,37 +15587,7 @@ static void ggml_compute_forward_flash_attn_f32(
                 vvexpf(S, S, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, S);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SS[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, S, S, max);
 #endif
             }
 
@@ -15585,28 +15769,7 @@ static void ggml_compute_forward_flash_attn_f16(
                 vvexpf(S, S, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, S);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, S, S, max);
 #endif
             }
 
@@ -15763,8 +15926,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
-    float scale = 1.0f;
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
@@ -15773,6 +15945,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
+        const uint32_t h = iq2; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float S = 0.0f;
         float M = -INFINITY;
 
@@ -15796,7 +15971,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
             // loop over n_kv and n_head_kv
             // ref: https://arxiv.org/pdf/2112.05682.pdf
             for (int64_t ic = 0; ic < nek1; ++ic) {
-                const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+                const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
                 if (mv == -INFINITY) {
                     continue;
                 }
@@ -15867,7 +16042,7 @@ static void ggml_compute_forward_flash_attn_ext(
         const struct ggml_tensor * v,
         const struct ggml_tensor * mask,
         struct ggml_tensor * dst) {
-    switch (dst->op_params[1]) {
+    switch (dst->op_params[2]) {
         case GGML_PREC_DEFAULT:
         case GGML_PREC_F32:
             {
@@ -16221,38 +16396,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                 vvexpf(SM, SM, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SR = S + i;
-                    float * SW = SM + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SR[j] == -INFINITY) {
-                            SW[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SR[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SW[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
 #endif
             }
 
@@ -16834,6 +16978,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_relu(params, dst);
             } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
         case GGML_UNARY_OP_GELU:
             {
                 ggml_compute_forward_gelu(params, dst);
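Note: this release threads the new GGML_UNARY_OP_SIGMOID through the forward dispatcher here (its backward pass is stubbed out further below). The op is the standard logistic function; the scalar sketch below only illustrates what the element-wise kernel computes and is not the actual ggml_compute_forward_sigmoid implementation:

    #include <math.h>

    // y[i] = 1 / (1 + exp(-x[i])) -- element-wise logistic sigmoid.
    static void vec_sigmoid_f32_sketch(const int n, float * y, const float * x) {
        for (int i = 0; i < n; ++i) {
            y[i] = 1.0f / (1.0f + expf(-x[i]));
        }
    }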
@@ -17274,35 +17422,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
             assert(!isnan(s1[i]));
         }
 #endif
-        // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
 
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    st[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    st[i] = val;
-                }
-            }
-        }
+        // soft_max
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max);
+        assert(sum > 0.0);
+        sum = (1.0 - eps) / sum;
 
-        assert(sum > 0.0);
-        // sum = 1.0/sum;
-        }
         // avoid log(0) by rescaling from [0..1] to [eps..1]
-        sum = (1.0 - eps) / sum;
         ggml_vec_scale_f32(nc, st, sum);
         ggml_vec_add1_f32(nc, st, st, eps);
         ggml_vec_log_f32(nc, st, st);
@@ -17392,32 +17520,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 #endif
 
         // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    ds0[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    ds0[i] = val;
-                }
-            }
-
-            assert(sum > 0.0);
-            sum = (1.0 - eps)/sum;
-        }
+        float max = -INFINITY;
+        ggml_vec_max_f32(nc, &max, s0);
+        ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
+        assert(sum > 0.0);
+        sum = (1.0 - eps) / sum;
 
         // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
         ggml_vec_scale_f32(nc, ds0, sum);
@@ -17454,7 +17561,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 
 /////////////////////////////////
 
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
     GGML_ASSERT(params);
 
     if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
@@ -17552,7 +17659,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_MUL_MAT:
             {
-                ggml_compute_forward_mul_mat(params, tensor);
+                ggml_compute_forward_mul_mat(params, tensor, state);
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
@@ -17630,10 +17737,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope_back(params, tensor);
             } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor);
-            } break;
         case GGML_OP_CLAMP:
             {
                 ggml_compute_forward_clamp(params, tensor);
@@ -18652,10 +18755,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
        case GGML_OP_CLAMP:
            {
                GGML_ASSERT(false); // TODO: not implemented
@@ -18826,6 +18925,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
                 }
             } break;
+            case GGML_UNARY_OP_SIGMOID:
+                {
+                    GGML_ASSERT(false); // TODO: not implemented
+                } break;
            case GGML_UNARY_OP_GELU:
                {
                    GGML_ASSERT(false); // TODO: not implemented
@@ -19172,8 +19275,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
@@ -19199,8 +19300,6 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
@@ -19280,31 +19379,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n; // active graph node
-    atomic_int node_task; // active graph node task phase
-
-    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-    enum ggml_status ec;
-};
-
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
     int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
     int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
@@ -19355,6 +19429,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
                 case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                     {
@@ -19428,10 +19503,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             {
                 n_tasks = n_threads;
            } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
        case GGML_OP_CLAMP:
            {
                n_tasks = 1; //TODO
@@ -19580,6 +19651,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
         * node_n = atomic_load(&state->shared->node_n);
         if (* node_n != last_node_n) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning. It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
@@ -19594,6 +19669,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
 
         * task_phase = atomic_load(&state->shared->node_task);
         if (* task_phase != last_task_phase) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning. It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
     }
 }
 
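Note: both spin loops above now execute _mm_pause() between atomic loads when SSE3 is available, the usual hint that a core is busy-waiting. A minimal standalone sketch of the same pattern (the flag name here is hypothetical, not from ggml.c):

    #include <stdatomic.h>
    #if defined(__SSE3__)
    #include <immintrin.h>
    #endif

    // Spin until *flag becomes non-zero, hinting the CPU that this is a busy-wait.
    static void spin_wait_sketch(const atomic_int * flag) {
        while (atomic_load(flag) == 0) {
    #if defined(__SSE3__)
            _mm_pause(); // processor hint for spinlocks; eases power use and pipeline flushes
    #endif
        }
    }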
@@ -19633,7 +19712,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             struct ggml_tensor * node = cgraph->nodes[node_n];
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
             ggml_graph_compute_perf_stats_node(node, state->shared);
         }
@@ -19653,17 +19732,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             /* INIT */
             if (GGML_OP_HAS_INIT[node->op]) {
                 params.type = GGML_TASK_TYPE_INIT;
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
 
             // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
             // they do something more efficient than spinning (?)
             params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
 
             if (GGML_OP_HAS_FINALIZE[node->op]) {
                 params.type = GGML_TASK_TYPE_FINALIZE;
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
 
             ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -19702,7 +19781,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             if (GGML_OP_HAS_INIT[node->op]) {
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
             }
         }
 
@@ -19723,7 +19802,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         if (state->ith < n_tasks) {
             params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
         }
 
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -19974,6 +20053,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         /*.node_task            =*/ GGML_TASK_TYPE_FINALIZE,
         /*.abort_callback       =*/ NULL,
         /*.abort_callback_data  =*/ NULL,
+        /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
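Note: the current_chunk counter initialized above pairs with the new ggml_compute_state argument that ggml_compute_forward_mul_mat now receives: worker threads can claim the next unprocessed chunk with an atomic increment instead of relying on a fixed per-thread split. A sketch of that claiming pattern under those assumptions (illustrative names, not the actual ggml scheduler code):

    #include <stdatomic.h>

    // Each worker repeatedly claims the next unprocessed chunk id until none remain.
    static void process_chunks_sketch(atomic_int * current_chunk, int n_chunks,
                                      void (*process)(int chunk, void * ctx), void * ctx) {
        for (;;) {
            const int chunk = atomic_fetch_add(current_chunk, 1);
            if (chunk >= n_chunks) {
                break;
            }
            process(chunk, ctx);
        }
    }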