llama_cpp 0.15.1 → 0.15.2

@@ -4,7 +4,6 @@
4
4
  #include "ggml-impl.h"
5
5
  #include "ggml-quants.h"
6
6
  #include "ggml.h"
7
- #include "sgemm.h"
8
7
 
9
8
  #if defined(_MSC_VER) || defined(__MINGW32__)
10
9
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,6 +36,10 @@
37
36
  #undef GGML_USE_LLAMAFILE
38
37
  #endif
39
38
 
39
+ #ifdef GGML_USE_LLAMAFILE
40
+ #include "sgemm.h"
41
+ #endif
42
+
40
43
  #if defined(_MSC_VER)
41
44
  // disable "possible loss of data" to avoid hundreds of casts
42
45
  // we should just be careful :)
@@ -109,6 +112,8 @@ typedef void * thread_ret_t;
109
112
 
110
113
  #endif
111
114
 
115
+ typedef pthread_t ggml_thread_t;
116
+
112
117
  #ifdef GGML_USE_CPU_HBM
113
118
  #include <hbwmalloc.h>
114
119
  #endif
@@ -160,9 +165,6 @@ void ggml_print_backtrace(void) {
160
165
  #define GGML_DEBUG 0
161
166
  #define GGML_GELU_FP16
162
167
  #define GGML_GELU_QUICK_FP16
163
- #define GGML_SILU_FP16
164
- // #define GGML_CROSS_ENTROPY_EXP_FP16
165
- // #define GGML_FLASH_ATTN_EXP_FP16
166
168
 
167
169
  #define GGML_SOFT_MAX_UNROLL 4
168
170
  #define GGML_VEC_DOT_UNROLL 2
@@ -313,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
313
315
  // precomputed quick gelu table for f16 (128 KB)
314
316
  static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
315
317
 
316
- // precomputed silu table for f16 (128 KB)
317
- static ggml_fp16_t ggml_table_silu_f16[1 << 16];
318
-
319
- // precomputed exp table for f16 (128 KB)
320
- static ggml_fp16_t ggml_table_exp_f16[1 << 16];
321
-
322
318
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
323
319
  float ggml_table_f32_f16[1 << 16];
324
320
 
@@ -1303,6 +1299,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1303
1299
  #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
1304
1300
  #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
1305
1301
  #define GGML_F16_VEC_FMA GGML_F32x4_FMA
1302
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
1303
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
1306
1304
  #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
1307
1305
  // Use vec_xl, not vec_ld, in case the load address is not aligned.
1308
1306
  #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
@@ -1534,6 +1532,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
1534
1532
  #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
1535
1533
  #endif
1536
1534
 
1535
+ //
1536
+ // ggml context
1537
+ //
1538
+
1539
+ struct ggml_context {
1540
+ size_t mem_size;
1541
+ void* mem_buffer;
1542
+ bool mem_buffer_owned;
1543
+ bool no_alloc;
1544
+ bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
1545
+
1546
+ int n_objects;
1547
+
1548
+ struct ggml_object* objects_begin;
1549
+ struct ggml_object* objects_end;
1550
+
1551
+ struct ggml_scratch scratch;
1552
+ struct ggml_scratch scratch_save;
1553
+ };
1554
+
1555
+ struct ggml_context_container {
1556
+ bool used;
1557
+
1558
+ struct ggml_context context;
1559
+ };
1560
+
1561
+ struct ggml_compute_state_shared {
1562
+ const struct ggml_cgraph* cgraph;
1563
+ const struct ggml_cplan* cplan;
1564
+
1565
+ int64_t perf_node_start_cycles;
1566
+ int64_t perf_node_start_time_us;
1567
+
1568
+ const int n_threads;
1569
+
1570
+ // synchronization primitives
1571
+ atomic_int n_active; // num active threads
1572
+ atomic_int node_n; // active graph node
1573
+ atomic_int node_task; // active graph node task phase
1574
+
1575
+ ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
1576
+ void* abort_callback_data;
1577
+
1578
+ atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1579
+ };
1580
+
1581
+ struct ggml_compute_state {
1582
+ ggml_thread_t thrd;
1583
+ int ith;
1584
+ struct ggml_compute_state_shared* shared;
1585
+ enum ggml_status ec;
1586
+ };
1587
+
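
The ggml_context, ggml_compute_state_shared and ggml_compute_state definitions are hoisted up here (their old copies are deleted further down) so that ggml_compute_forward_mul_mat can reach the new shared current_chunk counter. A minimal sketch of how this worker state fits together, assuming the POSIX path where ggml_thread_t is pthread_t; worker_fn and spawn_workers are hypothetical names, the real driver lives in ggml_graph_compute:

    #include <pthread.h>

    // Hedged sketch (not ggml code): wiring worker threads to one shared state.
    static void * worker_fn(void * arg) {
        struct ggml_compute_state * st = arg;
        // a worker reads the graph/plan through st->shared and coordinates via
        // the shared atomics (n_active, node_n, node_task, current_chunk)
        (void) st;
        return NULL;
    }

    static void spawn_workers(struct ggml_compute_state_shared * shared,
                              struct ggml_compute_state * states, int n_threads) {
        for (int i = 0; i < n_threads; ++i) {
            states[i].ith    = i;
            states[i].shared = shared;
            pthread_create(&states[i].thrd, NULL, worker_fn, &states[i]);
        }
        for (int i = 0; i < n_threads; ++i) {
            pthread_join(states[i].thrd, NULL);
        }
    }
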
1537
1588
  //
1538
1589
  // fundamental operations
1539
1590
  //
@@ -1949,6 +2000,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
1949
2000
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1950
2001
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1951
2002
  inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
2003
+ inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
1952
2004
  // TODO: optimize performance
1953
2005
  inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
1954
2006
  inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
@@ -2024,52 +2076,291 @@ inline static float ggml_silu_f32(float x) {
2024
2076
  return x/(1.0f + expf(-x));
2025
2077
  }
2026
2078
 
2027
- //inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
2028
- // const uint16_t * i16 = (const uint16_t *) x;
2029
- // for (int i = 0; i < n; ++i) {
2030
- // y[i] = ggml_table_silu_f16[i16[i]];
2031
- // }
2032
- //}
2079
+ #if defined(__ARM_NEON)
2033
2080
 
2034
- #ifdef GGML_SILU_FP16
2035
- inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
2036
- uint16_t t;
2037
- for (int i = 0; i < n; ++i) {
2038
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
2039
- memcpy(&t, &fp16, sizeof(uint16_t));
2040
- y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
2041
- }
2042
- }
2081
+ // adapted from arm limited optimized routine
2082
+ // the maximum error is 1.45358 plus 0.5 ulps
2083
+ // numbers above 88.38 will flush to infinity
2084
+ // numbers beneath -103.97 will flush to zero
2085
+ inline static float32x4_t ggml_v_expf(float32x4_t x) {
2086
+ const float32x4_t r = vdupq_n_f32(0x1.8p23f);
2087
+ const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
2088
+ const float32x4_t n = vsubq_f32(z, r);
2089
+ const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
2090
+ vdupq_n_f32(0x1.7f7d1cp-20f));
2091
+ const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
2092
+ const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
2093
+ const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
2094
+ const float32x4_t u = vmulq_f32(b, b);
2095
+ const float32x4_t j = vfmaq_f32(
2096
+ vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
2097
+ vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
2098
+ vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
2099
+ if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
2100
+ return vfmaq_f32(k, j, k);
2101
+ const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
2102
+ const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
2103
+ const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
2104
+ return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
2105
+ vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
2106
+ }
2107
+
2108
+ // computes silu x/(1+exp(-x)) in single precision vector
2109
+ inline static float32x4_t ggml_v_silu(float32x4_t x) {
2110
+ const float32x4_t one = vdupq_n_f32(1.0f);
2111
+ const float32x4_t zero = vdupq_n_f32(0.0f);
2112
+ const float32x4_t neg_x = vsubq_f32(zero, x);
2113
+ const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
2114
+ const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
2115
+ return vdivq_f32(x, one_plus_exp_neg_x);
2116
+ }
2117
+
2118
+ #elif defined(__AVX512F__) && defined(__AVX512DQ__)
2119
+
2120
+ // adapted from arm limited optimized routine
2121
+ // the maximum error is 1.45358 plus 0.5 ulps
2122
+ // numbers above 88.38 will flush to infinity
2123
+ // numbers beneath -103.97 will flush to zero
2124
+ inline static __m512 ggml_v_expf(__m512 x) {
2125
+ const __m512 r = _mm512_set1_ps(0x1.8p23f);
2126
+ const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
2127
+ const __m512 n = _mm512_sub_ps(z, r);
2128
+ const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
2129
+ _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
2130
+ const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
2131
+ const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
2132
+ const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
2133
+ const __m512 u = _mm512_mul_ps(b, b);
2134
+ const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
2135
+ _mm512_set1_ps(0x1.573e2ep-5f)), u,
2136
+ _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
2137
+ _mm512_set1_ps(0x1.fffdb6p-2f))),
2138
+ u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
2139
+ if (_mm512_kortestz(c, c))
2140
+ return _mm512_fmadd_ps(j, k, k);
2141
+ const __m512i g = _mm512_and_si512(
2142
+ _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
2143
+ _mm512_set1_epi32(0x82000000u));
2144
+ const __m512 s1 =
2145
+ _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
2146
+ const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
2147
+ const __mmask16 d =
2148
+ _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
2149
+ return _mm512_mask_blend_ps(
2150
+ d, _mm512_mask_blend_ps(
2151
+ c, _mm512_fmadd_ps(k, j, k),
2152
+ _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
2153
+ _mm512_mul_ps(s1, s1));
2154
+ }
2155
+
2156
+ // computes silu x/(1+exp(-x)) in single precision vector
2157
+ inline static __m512 ggml_v_silu(__m512 x) {
2158
+ const __m512 one = _mm512_set1_ps(1);
2159
+ const __m512 zero = _mm512_setzero_ps();
2160
+ const __m512 neg_x = _mm512_sub_ps(zero, x);
2161
+ const __m512 exp_neg_x = ggml_v_expf(neg_x);
2162
+ const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
2163
+ return _mm512_div_ps(x, one_plus_exp_neg_x);
2164
+ }
2165
+
2166
+ #elif defined(__AVX2__) && defined(__FMA__)
2167
+
2168
+ // adapted from arm limited optimized routine
2169
+ // the maximum error is 1.45358 plus 0.5 ulps
2170
+ // numbers above 88.38 will flush to infinity
2171
+ // numbers beneath -103.97 will flush to zero
2172
+ inline static __m256 ggml_v_expf(__m256 x) {
2173
+ const __m256 r = _mm256_set1_ps(0x1.8p23f);
2174
+ const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
2175
+ const __m256 n = _mm256_sub_ps(z, r);
2176
+ const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
2177
+ _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
2178
+ const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
2179
+ const __m256 k = _mm256_castsi256_ps(
2180
+ _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
2181
+ const __m256i c = _mm256_castps_si256(
2182
+ _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
2183
+ _mm256_set1_ps(126), _CMP_GT_OQ));
2184
+ const __m256 u = _mm256_mul_ps(b, b);
2185
+ const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
2186
+ _mm256_set1_ps(0x1.573e2ep-5f)), u,
2187
+ _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
2188
+ _mm256_set1_ps(0x1.fffdb6p-2f))),
2189
+ u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
2190
+ if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
2191
+ return _mm256_fmadd_ps(j, k, k);
2192
+ const __m256i g = _mm256_and_si256(
2193
+ _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
2194
+ _mm256_set1_epi32(0x82000000u));
2195
+ const __m256 s1 =
2196
+ _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
2197
+ const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
2198
+ const __m256i d = _mm256_castps_si256(
2199
+ _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
2200
+ _mm256_set1_ps(192), _CMP_GT_OQ));
2201
+ return _mm256_or_ps(
2202
+ _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
2203
+ _mm256_andnot_ps(
2204
+ _mm256_castsi256_ps(d),
2205
+ _mm256_or_ps(
2206
+ _mm256_and_ps(_mm256_castsi256_ps(c),
2207
+ _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
2208
+ _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
2209
+ }
2210
+
2211
+ // computes silu x/(1+exp(-x)) in single precision vector
2212
+ inline static __m256 ggml_v_silu(__m256 x) {
2213
+ const __m256 one = _mm256_set1_ps(1);
2214
+ const __m256 zero = _mm256_setzero_ps();
2215
+ const __m256 neg_x = _mm256_sub_ps(zero, x);
2216
+ const __m256 exp_neg_x = ggml_v_expf(neg_x);
2217
+ const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
2218
+ return _mm256_div_ps(x, one_plus_exp_neg_x);
2219
+ }
2220
+
2221
+ #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
2222
+
2223
+ #if defined(__FMA__)
2224
+ #define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
2225
+ #define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
2043
2226
  #else
2044
- inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
2045
- for (int i = 0; i < n; ++i) {
2227
+ #define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
2228
+ #define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
2229
+ #endif
2230
+
2231
+ // adapted from arm limited optimized routine
2232
+ // the maximum error is 1.45358 plus 0.5 ulps
2233
+ // numbers above 88.38 will flush to infinity
2234
+ // numbers beneath -103.97 will flush to zero
2235
+ inline static __m128 ggml_v_expf(__m128 x) {
2236
+ const __m128 r = _mm_set1_ps(0x1.8p23f);
2237
+ const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
2238
+ const __m128 n = _mm_sub_ps(z, r);
2239
+ const __m128 b =
2240
+ NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
2241
+ const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
2242
+ const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
2243
+ const __m128i c =
2244
+ _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
2245
+ const __m128 u = _mm_mul_ps(b, b);
2246
+ const __m128 j =
2247
+ MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
2248
+ MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
2249
+ u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
2250
+ if (!_mm_movemask_epi8(c))
2251
+ return MADD128(j, k, k);
2252
+ const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
2253
+ _mm_set1_epi32(0x82000000u));
2254
+ const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
2255
+ const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
2256
+ const __m128i d =
2257
+ _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
2258
+ return _mm_or_ps(
2259
+ _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
2260
+ _mm_andnot_ps(_mm_castsi128_ps(d),
2261
+ _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
2262
+ _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
2263
+ }
2264
+
2265
+ // computes silu x/(1+exp(-x)) in single precision vector
2266
+ inline static __m128 ggml_v_silu(__m128 x) {
2267
+ const __m128 one = _mm_set1_ps(1);
2268
+ const __m128 zero = _mm_setzero_ps();
2269
+ const __m128 neg_x = _mm_sub_ps(zero, x);
2270
+ const __m128 exp_neg_x = ggml_v_expf(neg_x);
2271
+ const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
2272
+ return _mm_div_ps(x, one_plus_exp_neg_x);
2273
+ }
2274
+
2275
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__
2276
+
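
All four SIMD paths above implement the same algorithm. Below is a scalar sketch of that algorithm, assuming the kernels' published coefficients; it scales with ldexpf instead of the exponent bit-injection and omits the overflow/underflow fix-ups (the s1/s2 branches), so it is illustrative only:

    #include <math.h>

    // Scalar sketch of the range-reduction scheme used by ggml_v_expf.
    static inline float expf_sketch(float x) {
        // n = round(x / ln 2)
        const float n = rintf(x * 0x1.715476p+0f);                   // 1/ln(2)
        // reduced argument b = x - n*ln(2), with ln(2) split into hi/lo parts
        const float b = (x - n * 0x1.62e4p-1f) - n * 0x1.7f7d1cp-20f;
        // polynomial approximation of expm1(b) on the reduced range
        const float u = b * b;
        const float j = ((0x1.0e4020p-7f * b + 0x1.573e2ep-5f) * u +
                         (0x1.555e66p-3f * b + 0x1.fffdb6p-2f)) * u +
                         0x1.ffffecp-1f * b;
        // exp(x) ~= 2^n * (1 + j); no special handling of very large |x| here
        return ldexpf(1.0f + j, (int) n);
    }
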
2277
+ static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
2278
+ int i = 0;
2279
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
2280
+ for (; i + 15 < n; i += 16) {
2281
+ _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
2282
+ }
2283
+ #elif defined(__AVX2__) && defined(__FMA__)
2284
+ for (; i + 7 < n; i += 8) {
2285
+ _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
2286
+ }
2287
+ #elif defined(__SSE2__)
2288
+ for (; i + 3 < n; i += 4) {
2289
+ _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
2290
+ }
2291
+ #elif defined(__ARM_NEON)
2292
+ for (; i + 3 < n; i += 4) {
2293
+ vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
2294
+ }
2295
+ #endif
2296
+ for (; i < n; ++i) {
2046
2297
  y[i] = ggml_silu_f32(x[i]);
2047
2298
  }
2048
2299
  }
2300
+
2301
+ static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
2302
+ int i = 0;
2303
+ ggml_float sum = 0;
2304
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
2305
+ for (; i + 15 < n; i += 16) {
2306
+ __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
2307
+ _mm512_set1_ps(max)));
2308
+ _mm512_storeu_ps(y + i, val);
2309
+ sum += (ggml_float)_mm512_reduce_add_ps(val);
2310
+ }
2311
+ #elif defined(__AVX2__) && defined(__FMA__)
2312
+ for (; i + 7 < n; i += 8) {
2313
+ __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
2314
+ _mm256_set1_ps(max)));
2315
+ _mm256_storeu_ps(y + i, val);
2316
+ __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
2317
+ _mm256_castps256_ps128(val));
2318
+ val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
2319
+ val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
2320
+ sum += (ggml_float)_mm_cvtss_f32(val2);
2321
+ }
2322
+ #elif defined(__SSE2__)
2323
+ for (; i + 3 < n; i += 4) {
2324
+ __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
2325
+ _mm_set1_ps(max)));
2326
+ _mm_storeu_ps(y + i, val);
2327
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
2328
+ val = _mm_add_ps(val, _mm_movehl_ps(val, val));
2329
+ val = _mm_add_ss(val, _mm_movehdup_ps(val));
2330
+ #else
2331
+ __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
2332
+ val = _mm_add_ps(val, tmp);
2333
+ tmp = _mm_movehl_ps(tmp, val);
2334
+ val = _mm_add_ss(val, tmp);
2335
+ #endif
2336
+ sum += (ggml_float)_mm_cvtss_f32(val);
2337
+ }
2338
+ #elif defined(__ARM_NEON)
2339
+ for (; i + 3 < n; i += 4) {
2340
+ float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
2341
+ vdupq_n_f32(max)));
2342
+ vst1q_f32(y + i, val);
2343
+ sum += (ggml_float)vaddvq_f32(val);
2344
+ }
2049
2345
  #endif
2346
+ for (; i < n; ++i) {
2347
+ float val = expf(x[i] - max);
2348
+ sum += (ggml_float)val;
2349
+ y[i] = val;
2350
+ }
2351
+ return sum;
2352
+ }
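
ggml_vec_soft_max_f32 writes exp(x[i] - max) into y and returns the running sum, leaving normalization to the caller. A hedged sketch of the intended calling pattern (softmax_row is a hypothetical helper; ggml_float is ggml's double-precision accumulator type), mirroring the softmax forward pass further down:

    #include <math.h>

    // Hypothetical helper showing how ggml_vec_soft_max_f32 is meant to be used.
    static void softmax_row(const int n, float * dst, const float * src) {
        float max = -INFINITY;
        for (int i = 0; i < n; ++i) {
            max = fmaxf(max, src[i]);                 // row maximum for numerical stability
        }
        const ggml_float sum = ggml_vec_soft_max_f32(n, dst, src, max);  // dst[i] = exp(src[i] - max)
        const float inv_sum = (float) (1.0 / sum);
        for (int i = 0; i < n; ++i) {
            dst[i] *= inv_sum;                        // normalize so the row sums to 1
        }
    }
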
2050
2353
 
2051
2354
  inline static float ggml_silu_backward_f32(float x, float dy) {
2052
2355
  const float s = 1.0f/(1.0f + expf(-x));
2053
2356
  return dy*s*(1.0f + x*(1.0f - s));
2054
2357
  }
2055
2358
 
2056
- #ifdef GGML_SILU_FP16
2057
- inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
2058
- for (int i = 0; i < n; ++i) {
2059
- // we did not use x[i] to compute forward silu but its f16 equivalent
2060
- // take derivative at f16 of x[i]:
2061
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
2062
- float usedx = GGML_FP16_TO_FP32(fp16);
2063
- dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
2064
- }
2065
- }
2066
- #else
2067
2359
  inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
2068
2360
  for (int i = 0; i < n; ++i) {
2069
2361
  dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
2070
2362
  }
2071
2363
  }
2072
- #endif
2073
2364
 
2074
2365
  inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
2075
2366
  #ifndef GGML_USE_ACCELERATE
@@ -2185,7 +2476,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
2185
2476
  "SOFT_MAX_BACK",
2186
2477
  "ROPE",
2187
2478
  "ROPE_BACK",
2188
- "ALIBI",
2189
2479
  "CLAMP",
2190
2480
  "CONV_TRANSPOSE_1D",
2191
2481
  "IM2COL",
@@ -2227,7 +2517,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
2227
2517
  "CROSS_ENTROPY_LOSS_BACK",
2228
2518
  };
2229
2519
 
2230
- static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
2520
+ static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
2231
2521
 
2232
2522
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
2233
2523
  "none",
@@ -2276,7 +2566,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
2276
2566
  "soft_max_back(x)",
2277
2567
  "rope(x)",
2278
2568
  "rope_back(x)",
2279
- "alibi(x)",
2280
2569
  "clamp(x)",
2281
2570
  "conv_transpose_1d(x)",
2282
2571
  "im2col(x)",
@@ -2318,7 +2607,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
2318
2607
  "cross_entropy_loss_back(x,y)",
2319
2608
  };
2320
2609
 
2321
- static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
2610
+ static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
2322
2611
 
2323
2612
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
2324
2613
 
@@ -2331,6 +2620,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
2331
2620
  "TANH",
2332
2621
  "ELU",
2333
2622
  "RELU",
2623
+ "SIGMOID",
2334
2624
  "GELU",
2335
2625
  "GELU_QUICK",
2336
2626
  "SILU",
@@ -2338,7 +2628,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
2338
2628
  "HARDSIGMOID",
2339
2629
  };
2340
2630
 
2341
- static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
2631
+ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
2342
2632
 
2343
2633
 
2344
2634
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2380,32 +2670,6 @@ static void ggml_setup_op_has_task_pass(void) {
2380
2670
  }
2381
2671
  }
2382
2672
 
2383
- //
2384
- // ggml context
2385
- //
2386
-
2387
- struct ggml_context {
2388
- size_t mem_size;
2389
- void * mem_buffer;
2390
- bool mem_buffer_owned;
2391
- bool no_alloc;
2392
- bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
2393
-
2394
- int n_objects;
2395
-
2396
- struct ggml_object * objects_begin;
2397
- struct ggml_object * objects_end;
2398
-
2399
- struct ggml_scratch scratch;
2400
- struct ggml_scratch scratch_save;
2401
- };
2402
-
2403
- struct ggml_context_container {
2404
- bool used;
2405
-
2406
- struct ggml_context context;
2407
- };
2408
-
2409
2673
  //
2410
2674
  // NUMA support
2411
2675
  //
@@ -2819,6 +3083,16 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
2819
3083
  (t0->ne[3] == t1->ne[3] );
2820
3084
  }
2821
3085
 
3086
+ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3087
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3088
+
3089
+ return
3090
+ (t0->nb[0] == t1->nb[0] ) &&
3091
+ (t0->nb[1] == t1->nb[1] ) &&
3092
+ (t0->nb[2] == t1->nb[2] ) &&
3093
+ (t0->nb[3] == t1->nb[3] );
3094
+ }
3095
+
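
ggml_are_same_stride complements ggml_are_same_shape by comparing byte strides (nb) rather than element counts (ne). A short hedged usage sketch (the guard below is illustrative, not code from this release):

    #include <stdbool.h>
    #include <string.h>

    // Hypothetical guard: a raw memcpy between tensors is only safe when both
    // the shapes and the byte strides match.
    static bool try_raw_copy(struct ggml_tensor * dst, const struct ggml_tensor * src) {
        if (ggml_are_same_shape(src, dst) && ggml_are_same_stride(src, dst)) {
            memcpy(dst->data, src->data, ggml_nbytes(src));
            return true;
        }
        return false;
    }
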
2822
3096
  // check if t1 can be represented as a repetition of t0
2823
3097
  static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
2824
3098
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -2878,8 +3152,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2878
3152
  float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
2879
3153
  ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
2880
3154
  ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
2881
- ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
2882
- ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
2883
3155
  }
2884
3156
 
2885
3157
  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -3163,6 +3435,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
3163
3435
 
3164
3436
  struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
3165
3437
 
3438
+ #ifdef __clang__
3439
+ // temporary until ggml_tensor::backend is removed
3440
+ #pragma clang diagnostic push
3441
+ #pragma clang diagnostic ignored "-Wdeprecated-declarations"
3442
+ #endif
3443
+
3166
3444
  *result = (struct ggml_tensor) {
3167
3445
  /*.type =*/ type,
3168
3446
  /*.backend =*/ GGML_BACKEND_TYPE_CPU,
@@ -3185,6 +3463,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
3185
3463
  /*.padding =*/ { 0 },
3186
3464
  };
3187
3465
 
3466
+ #ifdef __clang__
3467
+ #pragma clang diagnostic pop
3468
+ #endif
3469
+
3188
3470
  // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
3189
3471
  //ggml_assert_aligned(result->data);
3190
3472
 
@@ -4563,6 +4845,20 @@ struct ggml_tensor * ggml_leaky_relu(
4563
4845
  return result;
4564
4846
  }
4565
4847
 
4848
+ // ggml_sigmoid
4849
+
4850
+ struct ggml_tensor * ggml_sigmoid(
4851
+ struct ggml_context * ctx,
4852
+ struct ggml_tensor * a) {
4853
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
4854
+ }
4855
+
4856
+ struct ggml_tensor * ggml_sigmoid_inplace(
4857
+ struct ggml_context * ctx,
4858
+ struct ggml_tensor * a) {
4859
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
4860
+ }
4861
+
4566
4862
  // ggml_gelu
4567
4863
 
4568
4864
  struct ggml_tensor * ggml_gelu(
@@ -5646,7 +5942,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
5646
5942
  struct ggml_context * ctx,
5647
5943
  struct ggml_tensor * a,
5648
5944
  struct ggml_tensor * mask,
5649
- struct ggml_tensor * pos,
5650
5945
  float scale,
5651
5946
  float max_bias,
5652
5947
  bool inplace) {
@@ -5660,18 +5955,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
5660
5955
  GGML_ASSERT(mask->ne[1] >= a->ne[1]);
5661
5956
  }
5662
5957
 
5663
- if (pos) {
5664
- GGML_ASSERT(ggml_is_vector(pos));
5665
- GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
5666
- GGML_ASSERT(pos->ne[0] == a->ne[0]);
5667
- }
5668
-
5669
- if (pos && mask) {
5670
- GGML_ASSERT(pos->type == mask->type);
5671
- }
5672
-
5673
5958
  if (max_bias > 0.0f) {
5674
- GGML_ASSERT(pos);
5959
+ GGML_ASSERT(mask);
5675
5960
  }
5676
5961
 
5677
5962
  bool is_node = false;
@@ -5689,7 +5974,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
5689
5974
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5690
5975
  result->src[0] = a;
5691
5976
  result->src[1] = mask;
5692
- result->src[2] = pos;
5693
5977
 
5694
5978
  return result;
5695
5979
  }
@@ -5697,23 +5981,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
5697
5981
  struct ggml_tensor * ggml_soft_max(
5698
5982
  struct ggml_context * ctx,
5699
5983
  struct ggml_tensor * a) {
5700
- return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
5984
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
5701
5985
  }
5702
5986
 
5703
5987
  struct ggml_tensor * ggml_soft_max_inplace(
5704
5988
  struct ggml_context * ctx,
5705
5989
  struct ggml_tensor * a) {
5706
- return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
5990
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
5707
5991
  }
5708
5992
 
5709
5993
  struct ggml_tensor * ggml_soft_max_ext(
5710
5994
  struct ggml_context * ctx,
5711
5995
  struct ggml_tensor * a,
5712
5996
  struct ggml_tensor * mask,
5713
- struct ggml_tensor * pos,
5714
5997
  float scale,
5715
5998
  float max_bias) {
5716
- return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
5999
+ return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
5717
6000
  }
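
Call-site sketch for the reduced signature: the former pos tensor argument is gone and ALiBi is driven entirely by mask plus max_bias. kq, kq_mask, n_embd_head and max_alibi_bias are hypothetical names:

    #include <math.h>

    // Hypothetical helper: scaled softmax of attention scores with optional ALiBi.
    static struct ggml_tensor * build_attn_probs(
            struct ggml_context * ctx,
            struct ggml_tensor  * kq,       // attention scores
            struct ggml_tensor  * kq_mask,  // F16/F32 mask, or NULL
            int                   n_embd_head,
            float                 max_alibi_bias) {
        // 0.0f disables ALiBi; a positive max_bias requires a mask
        return ggml_soft_max_ext(ctx, kq, kq_mask,
                                 1.0f/sqrtf((float) n_embd_head), max_alibi_bias);
    }
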
5718
6001
 
5719
6002
  // ggml_soft_max_back
@@ -5928,37 +6211,6 @@ struct ggml_tensor * ggml_rope_back(
5928
6211
  return result;
5929
6212
  }
5930
6213
 
5931
- // ggml_alibi
5932
-
5933
- struct ggml_tensor * ggml_alibi(
5934
- struct ggml_context * ctx,
5935
- struct ggml_tensor * a,
5936
- int n_past,
5937
- int n_head,
5938
- float bias_max) {
5939
- GGML_ASSERT(n_past >= 0);
5940
- bool is_node = false;
5941
-
5942
- if (a->grad) {
5943
- GGML_ASSERT(false); // TODO: implement backward
5944
- is_node = true;
5945
- }
5946
-
5947
- // TODO: when implement backward, fix this:
5948
- //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5949
- struct ggml_tensor * result = ggml_view_tensor(ctx, a);
5950
-
5951
- int32_t op_params[3] = { n_past, n_head };
5952
- memcpy(op_params + 2, &bias_max, sizeof(float));
5953
- ggml_set_op_params(result, op_params, sizeof(op_params));
5954
-
5955
- result->op = GGML_OP_ALIBI;
5956
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5957
- result->src[0] = a;
5958
-
5959
- return result;
5960
- }
5961
-
5962
6214
  // ggml_clamp
5963
6215
 
5964
6216
  struct ggml_tensor * ggml_clamp(
@@ -6308,7 +6560,10 @@ struct ggml_tensor * ggml_pool_2d(
6308
6560
  static struct ggml_tensor * ggml_upscale_impl(
6309
6561
  struct ggml_context * ctx,
6310
6562
  struct ggml_tensor * a,
6311
- int scale_factor) {
6563
+ int ne0,
6564
+ int ne1,
6565
+ int ne2,
6566
+ int ne3) {
6312
6567
  bool is_node = false;
6313
6568
 
6314
6569
  if (a->grad) {
@@ -6316,19 +6571,45 @@ static struct ggml_tensor * ggml_upscale_impl(
6316
6571
  is_node = true;
6317
6572
  }
6318
6573
 
6574
+ GGML_ASSERT(a->ne[0] <= ne0);
6575
+ GGML_ASSERT(a->ne[1] <= ne1);
6576
+ GGML_ASSERT(a->ne[2] <= ne2);
6577
+ GGML_ASSERT(a->ne[3] <= ne3);
6578
+
6319
6579
  struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
6320
- a->ne[0] * scale_factor,
6321
- a->ne[1] * scale_factor,
6322
- a->ne[2], a->ne[3]);
6580
+ ne0,
6581
+ ne1,
6582
+ ne2,
6583
+ ne3
6584
+ );
6323
6585
 
6324
6586
  result->op = GGML_OP_UPSCALE;
6325
- result->op_params[0] = scale_factor;
6587
+
6326
6588
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6327
6589
  result->src[0] = a;
6328
6590
 
6329
6591
  return result;
6330
6592
  }
6331
6593
 
6594
+ struct ggml_tensor * ggml_upscale(
6595
+ struct ggml_context * ctx,
6596
+ struct ggml_tensor * a,
6597
+ int scale_factor) {
6598
+ return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
6599
+ }
6600
+
6601
+ struct ggml_tensor * ggml_upscale_ext(
6602
+ struct ggml_context * ctx,
6603
+ struct ggml_tensor * a,
6604
+ int ne0,
6605
+ int ne1,
6606
+ int ne2,
6607
+ int ne3) {
6608
+ return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
6609
+ }
6610
+
6611
+ // ggml_pad
6612
+
6332
6613
  struct ggml_tensor * ggml_pad(
6333
6614
  struct ggml_context * ctx,
6334
6615
  struct ggml_tensor * a,
@@ -6353,12 +6634,7 @@ struct ggml_tensor * ggml_pad(
6353
6634
  return result;
6354
6635
  }
6355
6636
 
6356
- struct ggml_tensor * ggml_upscale(
6357
- struct ggml_context * ctx,
6358
- struct ggml_tensor * a,
6359
- int scale_factor) {
6360
- return ggml_upscale_impl(ctx, a, scale_factor);
6361
- }
6637
+ // ggml_arange
6362
6638
 
6363
6639
  struct ggml_tensor * ggml_arange(
6364
6640
  struct ggml_context * ctx,
@@ -6380,6 +6656,8 @@ struct ggml_tensor * ggml_arange(
6380
6656
  return result;
6381
6657
  }
6382
6658
 
6659
+ // ggml_timestep_embedding
6660
+
6383
6661
  struct ggml_tensor * ggml_timestep_embedding(
6384
6662
  struct ggml_context * ctx,
6385
6663
  struct ggml_tensor * timesteps,
@@ -6486,9 +6764,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
6486
6764
  struct ggml_tensor * k,
6487
6765
  struct ggml_tensor * v,
6488
6766
  struct ggml_tensor * mask,
6489
- float scale) {
6767
+ float scale,
6768
+ float max_bias) {
6490
6769
  GGML_ASSERT(ggml_can_mul_mat(k, q));
6491
6770
  // TODO: check if vT can be multiplied by (k*qT)
6771
+
6492
6772
  if (mask) {
6493
6773
  GGML_ASSERT(ggml_is_contiguous(mask));
6494
6774
  GGML_ASSERT(mask->ne[2] == 1);
@@ -6498,6 +6778,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
6498
6778
  //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
6499
6779
  }
6500
6780
 
6781
+ if (max_bias > 0.0f) {
6782
+ GGML_ASSERT(mask);
6783
+ }
6784
+
6501
6785
  bool is_node = false;
6502
6786
 
6503
6787
  if (q->grad || k->grad || v->grad) {
@@ -6508,7 +6792,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
6508
6792
  int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
6509
6793
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
6510
6794
 
6511
- float params[] = { scale };
6795
+ float params[] = { scale, max_bias };
6512
6796
  ggml_set_op_params(result, params, sizeof(params));
6513
6797
 
6514
6798
  result->op = GGML_OP_FLASH_ATTN_EXT;
@@ -6528,7 +6812,7 @@ void ggml_flash_attn_ext_set_prec(
6528
6812
 
6529
6813
  const int32_t prec_i32 = (int32_t) prec;
6530
6814
 
6531
- ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
6815
+ ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
6532
6816
  }
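
Call-site sketch for the extended ggml_flash_attn_ext signature; q, k, v, kq_mask, n_embd_head and max_alibi_bias are hypothetical names. Passing 0.0f for max_bias keeps the previous behaviour; a positive value enables ALiBi and requires a mask:

    #include <math.h>

    // Hypothetical helper wrapping the extended call.
    static struct ggml_tensor * build_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
            struct ggml_tensor  * k,
            struct ggml_tensor  * v,
            struct ggml_tensor  * kq_mask,  // required when max_alibi_bias > 0.0f
            int                   n_embd_head,
            float                 max_alibi_bias) {
        return ggml_flash_attn_ext(ctx, q, k, v, kq_mask,
                                   1.0f/sqrtf((float) n_embd_head), // scale
                                   max_alibi_bias);                 // new max_bias argument
    }
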
6533
6817
 
6534
6818
  // ggml_flash_ff
@@ -10892,6 +11176,52 @@ static void ggml_compute_forward_relu(
10892
11176
  }
10893
11177
  }
10894
11178
 
11179
+ // ggml_compute_forward_sigmoid
11180
+
11181
+ static void ggml_compute_forward_sigmoid_f32(
11182
+ const struct ggml_compute_params * params,
11183
+ struct ggml_tensor * dst) {
11184
+
11185
+ const struct ggml_tensor * src0 = dst->src[0];
11186
+
11187
+ assert(params->ith == 0);
11188
+ assert(ggml_are_same_shape(src0, dst));
11189
+
11190
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11191
+ return;
11192
+ }
11193
+
11194
+ const int n = ggml_nrows(src0);
11195
+ const int nc = src0->ne[0];
11196
+
11197
+ assert(dst->nb[0] == sizeof(float));
11198
+ assert(src0->nb[0] == sizeof(float));
11199
+
11200
+ for (int i = 0; i < n; i++) {
11201
+ ggml_vec_sigmoid_f32(nc,
11202
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
11203
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
11204
+ }
11205
+ }
11206
+
11207
+ static void ggml_compute_forward_sigmoid(
11208
+ const struct ggml_compute_params * params,
11209
+ struct ggml_tensor * dst) {
11210
+
11211
+ const struct ggml_tensor * src0 = dst->src[0];
11212
+
11213
+ switch (src0->type) {
11214
+ case GGML_TYPE_F32:
11215
+ {
11216
+ ggml_compute_forward_sigmoid_f32(params, dst);
11217
+ } break;
11218
+ default:
11219
+ {
11220
+ GGML_ASSERT(false);
11221
+ } break;
11222
+ }
11223
+ }
11224
+
10895
11225
  // ggml_compute_forward_gelu
10896
11226
 
10897
11227
  static void ggml_compute_forward_gelu_f32(
@@ -11742,48 +12072,139 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
11742
12072
  }
11743
12073
  #endif
11744
12074
 
11745
- static void ggml_compute_forward_mul_mat(
11746
- const struct ggml_compute_params * params,
11747
- struct ggml_tensor * dst) {
12075
+ static void ggml_compute_forward_mul_mat_one_chunk(
12076
+ const struct ggml_compute_params * params,
12077
+ struct ggml_tensor * dst,
12078
+ const int64_t num_rows_per_vec_dot,
12079
+ const int64_t ir0_start,
12080
+ const int64_t ir0_end,
12081
+ const int64_t ir1_start,
12082
+ const int64_t ir1_end) {
11748
12083
 
11749
12084
  const struct ggml_tensor * src0 = dst->src[0];
11750
12085
  const struct ggml_tensor * src1 = dst->src[1];
11751
12086
 
11752
- int64_t t0 = ggml_perf_time_us();
11753
- UNUSED(t0);
11754
-
11755
12087
  GGML_TENSOR_BINARY_OP_LOCALS
11756
12088
 
11757
- const int ith = params->ith;
11758
- const int nth = params->nth;
11759
-
11760
12089
  const enum ggml_type type = src0->type;
11761
12090
 
11762
12091
  const bool src1_cont = ggml_is_contiguous(src1);
11763
12092
 
11764
- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
11765
- enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
11766
- ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
11767
- int64_t const vec_dot_num_rows = type_traits[type].nrows;
12093
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
12094
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
11768
12095
 
11769
- GGML_ASSERT(ne0 == ne01);
11770
- GGML_ASSERT(ne1 == ne11);
11771
- GGML_ASSERT(ne2 == ne12);
11772
- GGML_ASSERT(ne3 == ne13);
12096
+ // broadcast factors
12097
+ const int64_t r2 = ne12 / ne02;
12098
+ const int64_t r3 = ne13 / ne03;
11773
12099
 
11774
- // we don't support permuted src0 or src1
11775
- GGML_ASSERT(nb00 == ggml_type_size(type));
11776
- GGML_ASSERT(nb10 == ggml_type_size(src1->type));
12100
+ //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
11777
12101
 
11778
- // dst cannot be transposed or permuted
11779
- GGML_ASSERT(nb0 == sizeof(float));
11780
- GGML_ASSERT(nb0 <= nb1);
11781
- GGML_ASSERT(nb1 <= nb2);
11782
- GGML_ASSERT(nb2 <= nb3);
12102
+ // threads with no work simply yield (not sure if it helps)
12103
+ if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
12104
+ return;
12105
+ }
12106
+
12107
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
12108
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
12109
+
12110
+ assert(ne12 % ne02 == 0);
12111
+ assert(ne13 % ne03 == 0);
12112
+
12113
+ // block-tiling attempt
12114
+ const int64_t blck_0 = 16;
12115
+ const int64_t blck_1 = 16;
12116
+
12117
+ const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
12118
+
12119
+ // attempt to reduce false-sharing (does not seem to make a difference)
12120
+ // 16 * 2, accounting for mmla kernels
12121
+ float tmp[32];
12122
+
12123
+ for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
12124
+ for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
12125
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
12126
+ const int64_t i13 = (ir1 / (ne12 * ne1));
12127
+ const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
12128
+ const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
12129
+
12130
+ // broadcast src0 into src1
12131
+ const int64_t i03 = i13 / r3;
12132
+ const int64_t i02 = i12 / r2;
12133
+
12134
+ const int64_t i1 = i11;
12135
+ const int64_t i2 = i12;
12136
+ const int64_t i3 = i13;
12137
+
12138
+ const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
12139
+
12140
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
12141
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
12142
+ // the original src1 data pointer, so we should index using the indices directly
12143
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
12144
+ const char * src1_col = (const char*)wdata +
12145
+ (src1_cont || src1->type != vec_dot_type
12146
+ ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
12147
+ : (i11 * nb11 + i12 * nb12 + i13 * nb13));
12148
+ float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
12149
+
12150
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
12151
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
12152
+ //}
12153
+
12154
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
12155
+ vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
12156
+ }
12157
+
12158
+ for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
12159
+ memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
12160
+ }
12161
+ }
12162
+ }
12163
+ }
12164
+ }
12165
+
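
The caller below linearizes the chunk grid; here is a hedged sketch of how one linear chunk id maps back to the row ranges handed to ggml_compute_forward_mul_mat_one_chunk (chunk_to_ranges is a hypothetical helper, the real code inlines this in its while loop; MIN is ggml.c's macro):

    #include <stdint.h>

    // Hypothetical helper: unfold a linear chunk id into two half-open row ranges.
    // nchunk0, dr0/dr1 and nr0/nr1 mirror the variables computed in the caller below.
    static void chunk_to_ranges(int64_t chunk, int64_t nchunk0,
                                int64_t dr0, int64_t dr1, int64_t nr0, int64_t nr1,
                                int64_t * ir0_start, int64_t * ir0_end,
                                int64_t * ir1_start, int64_t * ir1_end) {
        const int64_t c0 = chunk % nchunk0;   // chunk coordinate along src0 rows
        const int64_t c1 = chunk / nchunk0;   // chunk coordinate along src1 rows
        *ir0_start = dr0 * c0;
        *ir0_end   = MIN(*ir0_start + dr0, nr0);
        *ir1_start = dr1 * c1;
        *ir1_end   = MIN(*ir1_start + dr1, nr1);
    }
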
12166
+ static void ggml_compute_forward_mul_mat(
12167
+ const struct ggml_compute_params * params,
12168
+ struct ggml_tensor * dst,
12169
+ struct ggml_compute_state * state) {
12170
+
12171
+ const struct ggml_tensor * src0 = dst->src[0];
12172
+ const struct ggml_tensor * src1 = dst->src[1];
12173
+
12174
+ int64_t t0 = ggml_perf_time_us();
12175
+ UNUSED(t0);
12176
+
12177
+ GGML_TENSOR_BINARY_OP_LOCALS
12178
+
12179
+ const int ith = params->ith;
12180
+ const int nth = params->nth;
12181
+
12182
+ const enum ggml_type type = src0->type;
12183
+
12184
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
12185
+ ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
12186
+ int64_t const vec_dot_num_rows = type_traits[type].nrows;
12187
+
12188
+ GGML_ASSERT(ne0 == ne01);
12189
+ GGML_ASSERT(ne1 == ne11);
12190
+ GGML_ASSERT(ne2 == ne12);
12191
+ GGML_ASSERT(ne3 == ne13);
12192
+
12193
+ // we don't support permuted src0 or src1
12194
+ GGML_ASSERT(nb00 == ggml_type_size(type));
12195
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
12196
+
12197
+ // dst cannot be transposed or permuted
12198
+ GGML_ASSERT(nb0 == sizeof(float));
12199
+ GGML_ASSERT(nb0 <= nb1);
12200
+ GGML_ASSERT(nb1 <= nb2);
12201
+ GGML_ASSERT(nb2 <= nb3);
11783
12202
 
11784
12203
  // broadcast factors
11785
- const int64_t r2 = ne12/ne02;
11786
- const int64_t r3 = ne13/ne03;
12204
+ const int64_t r2 = ne12 / ne02;
12205
+ const int64_t r3 = ne13 / ne03;
12206
+ UNUSED(r2);
12207
+ UNUSED(r3);
11787
12208
 
11788
12209
  // nb01 >= nb00 - src0 is not transposed
11789
12210
  // compute by src0 rows
@@ -11865,6 +12286,8 @@ static void ggml_compute_forward_mul_mat(
11865
12286
  #endif
11866
12287
 
11867
12288
  #if GGML_USE_LLAMAFILE
12289
+ const bool src1_cont = ggml_is_contiguous(src1);
12290
+
11868
12291
  if (src1_cont) {
11869
12292
  for (int64_t i13 = 0; i13 < ne13; i13++)
11870
12293
  for (int64_t i12 = 0; i12 < ne12; i12++)
@@ -11890,6 +12313,8 @@ UseGgmlGemm1:;
11890
12313
  if (ith != 0) {
11891
12314
  return;
11892
12315
  }
12316
+ // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
12317
+ atomic_store(&state->shared->current_chunk, nth);
11893
12318
  if (src1->type != vec_dot_type) {
11894
12319
  char * wdata = params->wdata;
11895
12320
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11914,11 +12339,11 @@ UseGgmlGemm1:;
11914
12339
  return;
11915
12340
  }
11916
12341
 
11917
- const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
11918
- const size_t row_size = ggml_row_size(vec_dot_type, ne10);
11919
-
11920
12342
  #if GGML_USE_LLAMAFILE
11921
12343
  if (src1->type != vec_dot_type) {
12344
+ const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
12345
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
12346
+
11922
12347
  for (int64_t i13 = 0; i13 < ne13; i13++)
11923
12348
  for (int64_t i12 = 0; i12 < ne12; i12++)
11924
12349
  if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -11939,98 +12364,87 @@ UseGgmlGemm1:;
11939
12364
  UseGgmlGemm2:;
11940
12365
  #endif
11941
12366
 
11942
- const int64_t nr0 = ne01; // src0 rows
11943
- const int64_t nr1 = ne1*ne12*ne13; // src1 rows
11944
-
11945
- //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
11946
-
11947
- // distribute the thread work across the inner or outer loop based on which one is larger
11948
-
11949
- const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
11950
- const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
11951
-
11952
- const int64_t ith0 = ith % nth0;
11953
- const int64_t ith1 = ith / nth0;
11954
-
11955
- const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
11956
- const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
11957
-
11958
- const int64_t ir010 = dr0*ith0;
11959
- const int64_t ir011 = MIN(ir010 + dr0, nr0);
11960
-
11961
- const int64_t ir110 = dr1*ith1;
11962
- const int64_t ir111 = MIN(ir110 + dr1, nr1);
11963
-
11964
- //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
11965
-
11966
- // threads with no work simply yield (not sure if it helps)
11967
- if (ir010 >= ir011 || ir110 >= ir111) {
11968
- sched_yield();
11969
- return;
11970
- }
12367
+ #ifdef GGML_PERF
12368
+ int chunks_executed = 0;
12369
+ UNUSED(chunks_executed);
12370
+ #endif
11971
12371
 
11972
- assert(ne12 % ne02 == 0);
11973
- assert(ne13 % ne03 == 0);
12372
+ // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
12373
+ const int64_t nr0 = ne0;
11974
12374
 
11975
- // block-tiling attempt
11976
- const int64_t blck_0 = 16;
11977
- const int64_t blck_1 = 16;
12375
+ // This is the size of the rest of the dimensions of the result
12376
+ const int64_t nr1 = ne1 * ne2 * ne3;
11978
12377
 
11979
12378
  // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
11980
- int64_t nrc = vec_dot_num_rows;
12379
+ int64_t num_rows_per_vec_dot = vec_dot_num_rows;
11981
12380
  // TODO: currently the mmla kernels support only even numbered rows/cols.
11982
12381
  // this check can be removed once they are extended to support odd numbered rows/cols too
11983
12382
  if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
11984
- nrc = 1;
12383
+ num_rows_per_vec_dot = 1;
11985
12384
  }
11986
12385
 
11987
- const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
12386
+ // Now select a reasonable chunk size.
12387
+ int chunk_size = 16;
11988
12388
 
11989
- // attempt to reduce false-sharing (does not seem to make a difference)
11990
- // 16 * 2, accounting for mmla kernels
11991
- float tmp[32];
12389
+ // We need to step up the size if it's small
12390
+ if (nr0 == 1 || nr1 == 1) {
12391
+ chunk_size = 64;
12392
+ }
11992
12393
 
11993
- for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
11994
- for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
11995
- for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
11996
- const int64_t i13 = (ir1/(ne12*ne1));
11997
- const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
11998
- const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
12394
+ // distribute the work across the inner or outer loop based on which one is larger
12395
+ // The number of chunks in the 0/1 dim.
12396
+ // CEIL(nr0/chunk_size)
12397
+ int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
12398
+ int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
11999
12399
 
12000
- // broadcast src0 into src1
12001
- const int64_t i03 = i13/r3;
12002
- const int64_t i02 = i12/r2;
12400
+ // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
12401
+ // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
12402
+ // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
12403
+ if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
12404
+ // distribute the thread work across the inner or outer loop based on which one is larger
12405
+ nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
12406
+ nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
12407
+ }
12003
12408
 
12004
- const int64_t i1 = i11;
12005
- const int64_t i2 = i12;
12006
- const int64_t i3 = i13;
12409
+ // The number of elements in each chunk
12410
+ const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
12411
+ const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
12007
12412
 
12008
- const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
12413
+ //if (ith == 0)
12414
+ // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
12009
12415
 
12010
- // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
12011
- // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
12012
- // the original src1 data pointer, so we should index using the indices directly
12013
- // TODO: this is a bit of a hack, we should probably have a better way to handle this
12014
- const char * src1_col = (const char *) wdata +
12015
- (src1_cont || src1->type != vec_dot_type
12016
- ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
12017
- : (i11*nb11 + i12*nb12 + i13*nb13));
12018
- float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
12416
+ // The first chunk comes from our thread_id, the rest will get auto-assigned.
12417
+ int current_chunk = ith;
12019
12418
 
12020
- //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
12021
- // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
12022
- //}
12419
+ while (current_chunk < nchunk0 * nchunk1) {
12420
+ const int64_t ith0 = current_chunk % nchunk0;
12421
+ const int64_t ith1 = current_chunk / nchunk0;
12023
12422
 
12024
- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
12025
- vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
12026
- }
12423
+ const int64_t ir0_start = dr0 * ith0;
12424
+ const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
12027
12425
 
12028
- for (int cn = 0; cn < nrc; ++cn) {
12029
- memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
12030
- }
12031
- }
12426
+ const int64_t ir1_start = dr1 * ith1;
12427
+ const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
12428
+
12429
+ ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
12430
+
12431
+ #ifdef GGML_PERF
12432
+ chunks_executed++;
12433
+ #endif
12434
+
12435
+ if (nth >= nchunk0 * nchunk1) {
12436
+ break;
12032
12437
  }
12438
+
12439
+ current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
12033
12440
  }
12441
+
12442
+ #ifdef GGML_PERF
12443
+ // These numbers are useful when trying to measure how well the threading scheduling works.
12444
+ //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
12445
+ //float time = (ggml_perf_time_us() - t0);
12446
+ //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
12447
+ #endif
12034
12448
  }
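
The scheduling change in this function boils down to a small work-stealing loop: each thread starts on the chunk matching its index and then claims further chunks with a single atomic increment. A self-contained sketch of that pattern, with hypothetical names and the counter assumed to start at nth (thread 0 stores that value during the INIT phase above):

    #include <stdatomic.h>
    #include <stdint.h>

    // Hedged sketch (not ggml code) of the chunk-claiming loop above.
    static void process_chunks(atomic_int * current_chunk, int ith, int nth,
                               int64_t nchunk0, int64_t nchunk1,
                               void (*do_chunk)(int64_t chunk)) {
        int64_t chunk = ith;                       // first chunk comes from the thread id
        while (chunk < nchunk0 * nchunk1) {
            do_chunk(chunk);
            if (nth >= nchunk0 * nchunk1) {
                break;                             // at most one chunk per thread; nothing to steal
            }
            chunk = atomic_fetch_add(current_chunk, 1);  // claim the next unprocessed chunk
        }
    }
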
12035
12449
 
12036
12450
  // ggml_compute_forward_mul_mat_id
@@ -13333,7 +13747,6 @@ static void ggml_compute_forward_soft_max_f32(
13333
13747
 
13334
13748
  const struct ggml_tensor * src0 = dst->src[0];
13335
13749
  const struct ggml_tensor * src1 = dst->src[1];
13336
- const struct ggml_tensor * src2 = dst->src[2];
13337
13750
 
13338
13751
  assert(ggml_is_contiguous(dst));
13339
13752
  assert(ggml_are_same_shape(src0, dst));
@@ -13359,8 +13772,8 @@ static void ggml_compute_forward_soft_max_f32(
13359
13772
 
13360
13773
  // TODO: is this supposed to be ceil instead of floor?
13361
13774
  // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
13362
- const uint32_t n_head_kv = ne02;
13363
- const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
13775
+ const uint32_t n_head = ne02;
13776
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
13364
13777
 
13365
13778
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
13366
13779
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@@ -13377,13 +13790,13 @@ static void ggml_compute_forward_soft_max_f32(
13377
13790
 
13378
13791
  float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
13379
13792
 
13380
- // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
13381
- ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
13382
- float * pos_f32 = src2 ? (float *) src2->data : src0->data;
13383
-
13384
- const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
13793
+ const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
13385
13794
 
13386
13795
  for (int i1 = ir0; i1 < ir1; i1++) {
13796
+ // ALiBi
13797
+ const uint32_t h = (i1/ne01)%ne02; // head
13798
+ const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
13799
+
13387
13800
  float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
13388
13801
  float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
13389
13802
 
@@ -13396,27 +13809,11 @@ static void ggml_compute_forward_soft_max_f32(
13396
13809
  if (mp_f32) {
13397
13810
  if (use_f16) {
13398
13811
  for (int i = 0; i < nc; ++i) {
13399
- wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
13400
- }
13401
- } else {
13402
- for (int i = 0; i < nc; ++i) {
13403
- wp[i] += mp_f32[i];
13404
- }
13405
- }
13406
- }
13407
-
13408
- // ALiBi bias
13409
- if (max_bias > 0.0f) {
13410
- const uint32_t h = (i1/ne01)%ne02; // head
13411
- const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
13412
-
13413
- if (use_f16) {
13414
- for (int i = 0; i < nc; ++i) {
13415
- wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
13812
+ wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
13416
13813
  }
13417
13814
  } else {
13418
13815
  for (int i = 0; i < nc; ++i) {
13419
- wp[i] += slope*pos_f32[i];
13816
+ wp[i] += slope*mp_f32[i];
13420
13817
  }
13421
13818
  }
13422
13819
  }
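
With GGML_OP_ALIBI removed, the softmax kernel now derives the per-head ALiBi slope itself and folds it into the mask term. A hedged helper sketch of the slope formula used above (the real code inlines it):

    #include <math.h>
    #include <stdint.h>

    // Hypothetical helper: head h out of n_head gets slope m0^(h+1) below the
    // largest power of two <= n_head, and m1^(2*(h - n_head_log2) + 1) above it;
    // max_bias <= 0 disables ALiBi (slope 1.0f).
    static float alibi_slope_sketch(uint32_t h, uint32_t n_head, float max_bias) {
        if (max_bias <= 0.0f) {
            return 1.0f;
        }
        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
    }
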
@@ -13431,22 +13828,7 @@ static void ggml_compute_forward_soft_max_f32(
13431
13828
  float max = -INFINITY;
13432
13829
  ggml_vec_max_f32(nc, &max, wp);
13433
13830
 
13434
- ggml_float sum = 0.0;
13435
-
13436
- uint16_t scvt;
13437
- for (int i = 0; i < nc; i++) {
13438
- if (wp[i] == -INFINITY) {
13439
- dp[i] = 0.0f;
13440
- } else {
13441
- // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
13442
- ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
13443
- memcpy(&scvt, &s, sizeof(scvt));
13444
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
13445
- sum += (ggml_float)val;
13446
- dp[i] = val;
13447
- }
13448
- }
13449
-
13831
+ ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
13450
13832
  assert(sum > 0.0);
13451
13833
 
13452
13834
  sum = 1.0/sum;
@@ -13578,178 +13960,6 @@ static void ggml_compute_forward_soft_max_back(
13578
13960
  }
13579
13961
  }
13580
13962
 
13581
- // ggml_compute_forward_alibi
13582
-
13583
- static void ggml_compute_forward_alibi_f32(
13584
- const struct ggml_compute_params * params,
13585
- struct ggml_tensor * dst) {
13586
-
13587
- const struct ggml_tensor * src0 = dst->src[0];
13588
-
13589
- assert(params->ith == 0);
13590
-
13591
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13592
- return;
13593
- }
13594
-
13595
- //const int n_past = ((int32_t *) dst->op_params)[0];
13596
- const int n_head = ((int32_t *) dst->op_params)[1];
13597
- float max_bias;
13598
- memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13599
-
13600
- const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13601
- const int64_t ne1 = src0->ne[1]; // seq_len_without_past
13602
- const int64_t ne2 = src0->ne[2]; // n_head -> this is k
13603
- //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
13604
-
13605
- const int64_t n = ggml_nrows(src0);
13606
- const int64_t ne2_ne3 = n/ne1; // ne2*ne3
13607
-
13608
- const size_t nb0 = src0->nb[0];
13609
- const size_t nb1 = src0->nb[1];
13610
- const size_t nb2 = src0->nb[2];
13611
- //const int nb3 = src0->nb[3];
13612
-
13613
- GGML_ASSERT(nb0 == sizeof(float));
13614
- GGML_ASSERT(n_head == ne2);
13615
-
13616
- // add alibi to src0 (KQ_scaled)
13617
- const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
13618
-
13619
- const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13620
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13621
-
13622
- for (int64_t k = 0; k < ne2_ne3; k++) {
13623
- // TODO: k*nb2 or k*nb3
13624
- float m_k;
13625
-
13626
- if (k < n_heads_log2_floor) {
13627
- m_k = powf(m0, k + 1);
13628
- } else {
13629
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
13630
- }
13631
-
13632
- for (int64_t i = 0; i < ne0; i++) {
13633
- for (int64_t j = 0; j < ne1; j++) {
13634
- float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13635
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13636
- pdst[0] = i * m_k + src[0];
13637
- }
13638
- }
13639
- }
13640
- }
13641
-
13642
- static void ggml_compute_forward_alibi_f16(
13643
- const struct ggml_compute_params * params,
13644
- struct ggml_tensor * dst) {
13645
-
13646
- const struct ggml_tensor * src0 = dst->src[0];
13647
-
13648
- assert(params->ith == 0);
13649
-
13650
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13651
- return;
13652
- }
13653
-
13654
- //const int n_past = ((int32_t *) dst->op_params)[0];
13655
- const int n_head = ((int32_t *) dst->op_params)[1];
13656
- float max_bias;
13657
- memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13658
-
13659
- const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13660
- const int ne1 = src0->ne[1]; // seq_len_without_past
13661
- const int ne2 = src0->ne[2]; // n_head -> this is k
13662
- //const int ne3 = src0->ne[3]; // 1 -> bsz
13663
-
13664
- const int n = ggml_nrows(src0);
13665
- const int ne2_ne3 = n/ne1; // ne2*ne3
13666
-
13667
- const int nb0 = src0->nb[0];
13668
- const int nb1 = src0->nb[1];
13669
- const int nb2 = src0->nb[2];
13670
- //const int nb3 = src0->nb[3];
13671
-
13672
- GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
13673
- //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
13674
- GGML_ASSERT(n_head == ne2);
13675
-
13676
- // add alibi to src0 (KQ_scaled)
13677
- const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
13678
-
13679
- const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13680
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13681
-
13682
- for (int k = 0; k < ne2_ne3; k++) {
13683
- // TODO: k*nb2 or k*nb3
13684
- float m_k;
13685
-
13686
- if (k < n_heads_log2_floor) {
13687
- m_k = powf(m0, k + 1);
13688
- } else {
13689
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
13690
- }
13691
-
13692
- for (int i = 0; i < ne0; i++) {
13693
- for (int j = 0; j < ne1; j++) {
13694
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13695
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13696
-
13697
- // we return F32
13698
- pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
13699
- }
13700
- }
13701
- }
13702
- }
13703
-
13704
- static void ggml_compute_forward_alibi(
13705
- const struct ggml_compute_params * params,
13706
- struct ggml_tensor * dst) {
13707
-
13708
- const struct ggml_tensor * src0 = dst->src[0];
13709
-
13710
- switch (src0->type) {
13711
- case GGML_TYPE_F16:
13712
- {
13713
- ggml_compute_forward_alibi_f16(params, dst);
13714
- } break;
13715
- case GGML_TYPE_F32:
13716
- {
13717
- ggml_compute_forward_alibi_f32(params, dst);
13718
- } break;
13719
- case GGML_TYPE_BF16:
13720
- case GGML_TYPE_Q4_0:
13721
- case GGML_TYPE_Q4_1:
13722
- case GGML_TYPE_Q5_0:
13723
- case GGML_TYPE_Q5_1:
13724
- case GGML_TYPE_Q8_0:
13725
- case GGML_TYPE_Q8_1:
13726
- case GGML_TYPE_Q2_K:
13727
- case GGML_TYPE_Q3_K:
13728
- case GGML_TYPE_Q4_K:
13729
- case GGML_TYPE_Q5_K:
13730
- case GGML_TYPE_Q6_K:
13731
- case GGML_TYPE_IQ2_XXS:
13732
- case GGML_TYPE_IQ2_XS:
13733
- case GGML_TYPE_IQ3_XXS:
13734
- case GGML_TYPE_IQ1_S:
13735
- case GGML_TYPE_IQ1_M:
13736
- case GGML_TYPE_IQ4_NL:
13737
- case GGML_TYPE_IQ4_XS:
13738
- case GGML_TYPE_IQ3_S:
13739
- case GGML_TYPE_IQ2_S:
13740
- case GGML_TYPE_Q8_K:
13741
- case GGML_TYPE_I8:
13742
- case GGML_TYPE_I16:
13743
- case GGML_TYPE_I32:
13744
- case GGML_TYPE_I64:
13745
- case GGML_TYPE_F64:
13746
- case GGML_TYPE_COUNT:
13747
- {
13748
- GGML_ASSERT(false);
13749
- } break;
13750
- }
13751
- }
13752
-
13753
13963
  // ggml_compute_forward_clamp
13754
13964
 
13755
13965
  static void ggml_compute_forward_clamp_f32(
@@ -14972,25 +15182,28 @@ static void ggml_compute_forward_upscale_f32(
14972
15182
  return;
14973
15183
  }
14974
15184
 
14975
- GGML_ASSERT(src0->nb[0] == sizeof(float));
15185
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
14976
15186
 
14977
15187
  const int ith = params->ith;
14978
15188
  const int nth = params->nth;
14979
15189
 
14980
15190
  GGML_TENSOR_UNARY_OP_LOCALS
14981
15191
 
14982
- const int scale_factor = dst->op_params[0];
15192
+ const float sf0 = (float)ne0/src0->ne[0];
15193
+ const float sf1 = (float)ne1/src0->ne[1];
15194
+ const float sf2 = (float)ne2/src0->ne[2];
15195
+ const float sf3 = (float)ne3/src0->ne[3];
14983
15196
 
14984
15197
  // TODO: optimize
14985
15198
 
14986
15199
  for (int64_t i3 = 0; i3 < ne3; i3++) {
14987
- const int64_t i03 = i3;
15200
+ const int64_t i03 = i3 / sf3;
14988
15201
  for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
14989
- const int64_t i02 = i2;
15202
+ const int64_t i02 = i2 / sf2;
14990
15203
  for (int64_t i1 = 0; i1 < ne1; i1++) {
14991
- const int64_t i01 = i1 / scale_factor;
15204
+ const int64_t i01 = i1 / sf1;
14992
15205
  for (int64_t i0 = 0; i0 < ne0; i0++) {
14993
- const int64_t i00 = i0 / scale_factor;
15206
+ const int64_t i00 = i0 / sf0;
14994
15207
 
14995
15208
  const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
14996
15209
  float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
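
`ggml_compute_forward_upscale_f32` switches from a single integer `scale_factor` in `op_params` to per-dimension float factors derived from the destination/source shapes, so every destination index maps back to a source index by dividing by its own factor. A minimal sketch of that nearest-neighbour index mapping, with a hypothetical helper name:

    #include <stdint.h>

    // Nearest-neighbour source index for one dimension, as implied by the hunk
    // above: sf = ne_dst / ne_src, and i_src = i_dst / sf (truncating toward zero).
    static int64_t upscale_src_index(int64_t i_dst, int64_t ne_dst, int64_t ne_src) {
        const float sf = (float) ne_dst / (float) ne_src;
        return (int64_t) (i_dst / sf);
    }

Because each axis now carries its own factor, the kernel can scale dimensions independently rather than by one uniform integer factor.
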
@@ -15020,6 +15233,7 @@ static void ggml_compute_forward_upscale(
15020
15233
  }
15021
15234
  }
15022
15235
 
15236
+
15023
15237
  // ggml_compute_forward_pad
15024
15238
 
15025
15239
  static void ggml_compute_forward_pad_f32(
@@ -15373,37 +15587,7 @@ static void ggml_compute_forward_flash_attn_f32(
15373
15587
  vvexpf(S, S, &Mup);
15374
15588
  ggml_vec_sum_f32(Mup, &sum, S);
15375
15589
  #else
15376
- uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
15377
- ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
15378
-
15379
- for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
15380
- if (i >= masked_begin) {
15381
- break;
15382
- }
15383
- float * SS = S + i;
15384
-
15385
- for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
15386
- if (i + j >= masked_begin) {
15387
- break;
15388
- } else if (SS[j] == -INFINITY) {
15389
- SS[j] = 0.0f;
15390
- } else {
15391
- #ifndef GGML_FLASH_ATTN_EXP_FP16
15392
- const float val = expf(SS[j] - max);
15393
- #else
15394
- ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
15395
- memcpy(&scvt[j], &s, sizeof(uint16_t));
15396
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
15397
- #endif
15398
- sump[j] += (ggml_float)val;
15399
- SS[j] = val;
15400
- }
15401
- }
15402
- }
15403
-
15404
- for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
15405
- sum += sump[i];
15406
- }
15590
+ sum = ggml_vec_soft_max_f32(Mup, S, S, max);
15407
15591
  #endif
15408
15592
  }
15409
15593
 
@@ -15585,28 +15769,7 @@ static void ggml_compute_forward_flash_attn_f16(
15585
15769
  vvexpf(S, S, &Mup);
15586
15770
  ggml_vec_sum_f32(Mup, &sum, S);
15587
15771
  #else
15588
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
15589
- ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
15590
-
15591
- for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
15592
- float * SS = S + i;
15593
-
15594
- for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
15595
- if (SS[j] == -INFINITY) {
15596
- SS[j] = 0.0f;
15597
- } else {
15598
- ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
15599
- memcpy(&scvt[j], &s, sizeof(uint16_t));
15600
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
15601
- sump[j] += (ggml_float)val;
15602
- SS[j] = val;
15603
- }
15604
- }
15605
- }
15606
-
15607
- for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
15608
- sum += sump[i];
15609
- }
15772
+ sum = ggml_vec_soft_max_f32(Mup, S, S, max);
15610
15773
  #endif
15611
15774
  }
15612
15775
 
@@ -15763,8 +15926,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
15763
15926
  const int ir0 = dr*ith;
15764
15927
  const int ir1 = MIN(ir0 + dr, nr);
15765
15928
 
15766
- float scale = 1.0f;
15767
- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
15929
+ float scale = 1.0f;
15930
+ float max_bias = 0.0f;
15931
+
15932
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
15933
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
15934
+
15935
+ const uint32_t n_head = neq2;
15936
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
15937
+
15938
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
15939
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
15768
15940
 
15769
15941
  // loop over n_batch and n_head
15770
15942
  for (int ir = ir0; ir < ir1; ++ir) {
@@ -15773,6 +15945,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
15773
15945
  const int iq2 = (ir - iq3*neq2*neq1)/neq1;
15774
15946
  const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
15775
15947
 
15948
+ const uint32_t h = iq2; // head
15949
+ const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
15950
+
15776
15951
  float S = 0.0f;
15777
15952
  float M = -INFINITY;
15778
15953
 
@@ -15796,7 +15971,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
15796
15971
  // loop over n_kv and n_head_kv
15797
15972
  // ref: https://arxiv.org/pdf/2112.05682.pdf
15798
15973
  for (int64_t ic = 0; ic < nek1; ++ic) {
15799
- const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
15974
+ const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
15800
15975
  if (mv == -INFINITY) {
15801
15976
  continue;
15802
15977
  }
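
The two hunks above wire ALiBi into the fused attention path: `max_bias` is read from `op_params + 1`, a per-head `slope` is derived from `m0`/`m1`, and the mask value `mp[ic]` is scaled by that slope before it enters the streaming accumulation over `n_kv` (the arXiv reference cited in the loop describes this streaming formulation). Below is a compact sketch of the running-max/running-sum update, under the simplifying assumption of a single scalar accumulator standing in for the V rows; names are illustrative, not from the source.

    #include <math.h>

    // Online softmax accumulation for one query row: keep a running max M and
    // running sum S, rescaling what has been accumulated whenever a larger
    // biased score s appears.
    static float online_softmax_weighted_sum(int n_kv, const float * scores, const float * values) {
        float M   = -INFINITY;  // running max of the biased scores
        float S   = 0.0f;       // running sum of exp(score - M)
        float acc = 0.0f;       // running weighted sum (stands in for the V accumulator)
        for (int ic = 0; ic < n_kv; ++ic) {
            const float s = scores[ic];
            if (s == -INFINITY) continue;     // fully masked position
            const float Mnew = s > M ? s : M;
            const float ms = expf(M - Mnew);  // rescale factor for previous terms
            const float vs = expf(s - Mnew);
            S   = S*ms + vs;
            acc = acc*ms + vs*values[ic];
            M   = Mnew;
        }
        return S > 0.0f ? acc/S : 0.0f;       // final normalization
    }
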
@@ -15867,7 +16042,7 @@ static void ggml_compute_forward_flash_attn_ext(
15867
16042
  const struct ggml_tensor * v,
15868
16043
  const struct ggml_tensor * mask,
15869
16044
  struct ggml_tensor * dst) {
15870
- switch (dst->op_params[1]) {
16045
+ switch (dst->op_params[2]) {
15871
16046
  case GGML_PREC_DEFAULT:
15872
16047
  case GGML_PREC_F32:
15873
16048
  {
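
Because `max_bias` now occupies slot 1 of `op_params` (it is read with `memcpy` from `op_params + 1` in the earlier hunk), the precision selector moves from `op_params[1]` to `op_params[2]`. A small hypothetical unpacking helper, only to make the implied layout explicit:

    #include <stdint.h>
    #include <string.h>

    // Layout implied by the hunks above: [0] = scale (f32 bits),
    // [1] = max_bias (f32 bits), [2] = precision enum value.
    static void flash_attn_ext_unpack_params(const int32_t op_params[3],
                                             float * scale, float * max_bias, int32_t * prec) {
        memcpy(scale,    &op_params[0], sizeof(float));
        memcpy(max_bias, &op_params[1], sizeof(float));
        *prec = op_params[2];
    }
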
@@ -16221,38 +16396,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
16221
16396
  vvexpf(SM, SM, &Mup);
16222
16397
  ggml_vec_sum_f32(Mup, &sum, SM);
16223
16398
  #else
16224
- uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
16225
- ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
16226
-
16227
- for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
16228
- if (i >= masked_begin) {
16229
- break;
16230
- }
16231
- float * SR = S + i;
16232
- float * SW = SM + i;
16233
-
16234
- for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
16235
- if (i + j >= masked_begin) {
16236
- break;
16237
- } else if (SR[j] == -INFINITY) {
16238
- SW[j] = 0.0f;
16239
- } else {
16240
- #ifndef GGML_FLASH_ATTN_EXP_FP16
16241
- const float val = expf(SR[j] - max);
16242
- #else
16243
- ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
16244
- memcpy(&scvt[j], &s, sizeof(uint16_t));
16245
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
16246
- #endif
16247
- sump[j] += (ggml_float)val;
16248
- SW[j] = val;
16249
- }
16250
- }
16251
- }
16252
-
16253
- for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
16254
- sum += sump[i];
16255
- }
16399
+ sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
16256
16400
  #endif
16257
16401
  }
16258
16402
 
@@ -16834,6 +16978,10 @@ static void ggml_compute_forward_unary(
16834
16978
  {
16835
16979
  ggml_compute_forward_relu(params, dst);
16836
16980
  } break;
16981
+ case GGML_UNARY_OP_SIGMOID:
16982
+ {
16983
+ ggml_compute_forward_sigmoid(params, dst);
16984
+ } break;
16837
16985
  case GGML_UNARY_OP_GELU:
16838
16986
  {
16839
16987
  ggml_compute_forward_gelu(params, dst);
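
`GGML_UNARY_OP_SIGMOID` is routed to a new `ggml_compute_forward_sigmoid`, whose definition lies outside this excerpt. A minimal per-element sketch of what such an f32 kernel computes, with an illustrative helper name:

    #include <math.h>

    // Element-wise logistic sigmoid over a contiguous f32 row:
    // y[i] = 1 / (1 + exp(-x[i])).
    static void sigmoid_row_f32(int n, float * y, const float * x) {
        for (int i = 0; i < n; ++i) {
            y[i] = 1.0f / (1.0f + expf(-x[i]));
        }
    }

Note that the backward case added later in this diff still asserts "not implemented", so the new unary op is forward-only for now.
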
@@ -17274,35 +17422,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
17274
17422
  assert(!isnan(s1[i]));
17275
17423
  }
17276
17424
  #endif
17277
- // soft_max
17278
- ggml_float sum = 0.0;
17279
- {
17280
- float max = -INFINITY;
17281
- ggml_vec_max_f32(nc, &max, s0);
17282
17425
 
17283
- uint16_t scvt; UNUSED(scvt);
17284
- for (int i = 0; i < nc; i++) {
17285
- if (s0[i] == -INFINITY) {
17286
- st[i] = 0.0f;
17287
- } else {
17288
- #ifndef GGML_CROSS_ENTROPY_EXP_FP16
17289
- const float s = s0[i] - max;
17290
- const float val = expf(s);
17291
- #else
17292
- ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
17293
- memcpy(&scvt, &s, sizeof(scvt));
17294
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
17295
- #endif
17296
- sum += (ggml_float)val;
17297
- st[i] = val;
17298
- }
17299
- }
17426
+ // soft_max
17427
+ float max = -INFINITY;
17428
+ ggml_vec_max_f32(nc, &max, s0);
17429
+ ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max);
17430
+ assert(sum > 0.0);
17431
+ sum = (1.0 - eps) / sum;
17300
17432
 
17301
- assert(sum > 0.0);
17302
- // sum = 1.0/sum;
17303
- }
17304
17433
  // avoid log(0) by rescaling from [0..1] to [eps..1]
17305
- sum = (1.0 - eps) / sum;
17306
17434
  ggml_vec_scale_f32(nc, st, sum);
17307
17435
  ggml_vec_add1_f32(nc, st, st, eps);
17308
17436
  ggml_vec_log_f32(nc, st, st);
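
The cross-entropy forward path now also goes through `ggml_vec_soft_max_f32`, then applies the `(1 - eps)/sum` scaling, the `+ eps` shift, and the log, so the probabilities land in `[eps, 1]` before `log` is taken. A scalar sketch of that smoothing step, assuming `st` already holds the unnormalized `exp(s0[i] - max)` values and `sum` their total:

    #include <math.h>

    // Smoothed log-probabilities for one row, mirroring the sequence above:
    // p = (1 - eps) * st / sum + eps, then log(p), written back into st.
    static void smoothed_log_softmax(int n, float * st, double sum, float eps) {
        const float scale = (float) ((1.0 - eps) / sum);
        for (int i = 0; i < n; ++i) {
            st[i] = logf(st[i]*scale + eps);
        }
    }
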
@@ -17392,32 +17520,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
17392
17520
  #endif
17393
17521
 
17394
17522
  // soft_max
17395
- ggml_float sum = 0.0;
17396
- {
17397
- float max = -INFINITY;
17398
- ggml_vec_max_f32(nc, &max, s0);
17399
-
17400
- uint16_t scvt; UNUSED(scvt);
17401
- for (int i = 0; i < nc; i++) {
17402
- if (s0[i] == -INFINITY) {
17403
- ds0[i] = 0.0f;
17404
- } else {
17405
- #ifndef GGML_CROSS_ENTROPY_EXP_FP16
17406
- const float s = s0[i] - max;
17407
- const float val = expf(s);
17408
- #else
17409
- ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
17410
- memcpy(&scvt, &s, sizeof(scvt));
17411
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
17412
- #endif
17413
- sum += (ggml_float)val;
17414
- ds0[i] = val;
17415
- }
17416
- }
17417
-
17418
- assert(sum > 0.0);
17419
- sum = (1.0 - eps)/sum;
17420
- }
17523
+ float max = -INFINITY;
17524
+ ggml_vec_max_f32(nc, &max, s0);
17525
+ ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
17526
+ assert(sum > 0.0);
17527
+ sum = (1.0 - eps) / sum;
17421
17528
 
17422
17529
  // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
17423
17530
  ggml_vec_scale_f32(nc, ds0, sum);
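
In the backward kernel the same helper leaves (a scaled) `softmax(src0)` in `ds0`, which is then turned into the gradient described by the comment above, `(softmax(src0) - src1) * dloss / nr`. A scalar sketch of that per-element step, ignoring the eps smoothing and using illustrative names:

    // Per-element gradient implied by the comment above: ds0 holds softmax(s0)
    // for this row, s1 is the target distribution, d_loss is the incoming
    // gradient of the loss averaged over nr rows.
    static void cross_entropy_grad_row(int n, float * ds0, const float * s1, float d_loss, int nr) {
        for (int i = 0; i < n; ++i) {
            ds0[i] = (ds0[i] - s1[i]) * d_loss / (float) nr;
        }
    }
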
@@ -17454,7 +17561,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
17454
17561
 
17455
17562
  /////////////////////////////////
17456
17563
 
17457
- static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
17564
+ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
17458
17565
  GGML_ASSERT(params);
17459
17566
 
17460
17567
  if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
@@ -17552,7 +17659,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
17552
17659
  } break;
17553
17660
  case GGML_OP_MUL_MAT:
17554
17661
  {
17555
- ggml_compute_forward_mul_mat(params, tensor);
17662
+ ggml_compute_forward_mul_mat(params, tensor, state);
17556
17663
  } break;
17557
17664
  case GGML_OP_MUL_MAT_ID:
17558
17665
  {
@@ -17630,10 +17737,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
17630
17737
  {
17631
17738
  ggml_compute_forward_rope_back(params, tensor);
17632
17739
  } break;
17633
- case GGML_OP_ALIBI:
17634
- {
17635
- ggml_compute_forward_alibi(params, tensor);
17636
- } break;
17637
17740
  case GGML_OP_CLAMP:
17638
17741
  {
17639
17742
  ggml_compute_forward_clamp(params, tensor);
@@ -18652,10 +18755,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18652
18755
  zero_table);
18653
18756
  }
18654
18757
  } break;
18655
- case GGML_OP_ALIBI:
18656
- {
18657
- GGML_ASSERT(false); // TODO: not implemented
18658
- } break;
18659
18758
  case GGML_OP_CLAMP:
18660
18759
  {
18661
18760
  GGML_ASSERT(false); // TODO: not implemented
@@ -18826,6 +18925,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18826
18925
  zero_table);
18827
18926
  }
18828
18927
  } break;
18928
+ case GGML_UNARY_OP_SIGMOID:
18929
+ {
18930
+ GGML_ASSERT(false); // TODO: not implemented
18931
+ } break;
18829
18932
  case GGML_UNARY_OP_GELU:
18830
18933
  {
18831
18934
  GGML_ASSERT(false); // TODO: not implemented
@@ -19172,8 +19275,6 @@ typedef int ggml_lock_t;
19172
19275
 
19173
19276
  #define GGML_LOCK_INITIALIZER 0
19174
19277
 
19175
- typedef pthread_t ggml_thread_t;
19176
-
19177
19278
  #define ggml_thread_create pthread_create
19178
19279
  #define ggml_thread_join pthread_join
19179
19280
 
@@ -19199,8 +19300,6 @@ typedef int ggml_lock_t;
19199
19300
 
19200
19301
  #define GGML_LOCK_INITIALIZER 0
19201
19302
 
19202
- typedef pthread_t ggml_thread_t;
19203
-
19204
19303
  #define ggml_thread_create pthread_create
19205
19304
  #define ggml_thread_join pthread_join
19206
19305
 
@@ -19280,31 +19379,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
19280
19379
  static void clear_numa_thread_affinity(void) {}
19281
19380
  #endif
19282
19381
 
19283
- struct ggml_compute_state_shared {
19284
- const struct ggml_cgraph * cgraph;
19285
- const struct ggml_cplan * cplan;
19286
-
19287
- int64_t perf_node_start_cycles;
19288
- int64_t perf_node_start_time_us;
19289
-
19290
- const int n_threads;
19291
-
19292
- // synchronization primitives
19293
- atomic_int n_active; // num active threads
19294
- atomic_int node_n; // active graph node
19295
- atomic_int node_task; // active graph node task phase
19296
-
19297
- ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
19298
- void * abort_callback_data;
19299
- };
19300
-
19301
- struct ggml_compute_state {
19302
- ggml_thread_t thrd;
19303
- int ith;
19304
- struct ggml_compute_state_shared * shared;
19305
- enum ggml_status ec;
19306
- };
19307
-
19308
19382
  static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
19309
19383
  int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
19310
19384
  int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
@@ -19355,6 +19429,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
19355
19429
  case GGML_UNARY_OP_TANH:
19356
19430
  case GGML_UNARY_OP_ELU:
19357
19431
  case GGML_UNARY_OP_RELU:
19432
+ case GGML_UNARY_OP_SIGMOID:
19358
19433
  case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
19359
19434
  case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
19360
19435
  {
@@ -19428,10 +19503,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
19428
19503
  {
19429
19504
  n_tasks = n_threads;
19430
19505
  } break;
19431
- case GGML_OP_ALIBI:
19432
- {
19433
- n_tasks = 1; //TODO
19434
- } break;
19435
19506
  case GGML_OP_CLAMP:
19436
19507
  {
19437
19508
  n_tasks = 1; //TODO
@@ -19580,6 +19651,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
19580
19651
 
19581
19652
  * node_n = atomic_load(&state->shared->node_n);
19582
19653
  if (* node_n != last_node_n) break;
19654
+ #if defined(__SSE3__)
19655
+ // Tell the processor we're spinning. It's a processor hint for spinlocks.
19656
+ _mm_pause();
19657
+ #endif
19583
19658
  }
19584
19659
  }
19585
19660
 
@@ -19594,6 +19669,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
19594
19669
 
19595
19670
  * task_phase = atomic_load(&state->shared->node_task);
19596
19671
  if (* task_phase != last_task_phase) break;
19672
+ #if defined(__SSE3__)
19673
+ // Tell the processor we're spinning. It's a processor hint for spinlocks.
19674
+ _mm_pause();
19675
+ #endif
19597
19676
  }
19598
19677
  }
19599
19678
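
Both spin-wait loops in the thread scheduler now issue a PAUSE hint on x86 builds that define `__SSE3__`, which eases pipeline and power pressure while a worker polls the shared atomics. A self-contained sketch of the pattern, assuming C11 atomics and the x86 intrinsic header; the helper name is illustrative:

    #include <stdatomic.h>
    #if defined(__SSE3__)
    #include <immintrin.h>   // _mm_pause
    #endif

    // Spin until the shared value moves away from `last`, hinting the CPU that
    // this is a busy-wait (the hint is a no-op on non-x86 builds).
    static void spin_until_changed(atomic_int * v, int last) {
        while (atomic_load(v) == last) {
    #if defined(__SSE3__)
            _mm_pause();
    #endif
        }
    }
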
 
@@ -19633,7 +19712,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19633
19712
  struct ggml_tensor * node = cgraph->nodes[node_n];
19634
19713
  if (GGML_OP_HAS_FINALIZE[node->op]) {
19635
19714
  params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
19636
- ggml_compute_forward(&params, node);
19715
+ ggml_compute_forward(&params, node, state);
19637
19716
  }
19638
19717
  ggml_graph_compute_perf_stats_node(node, state->shared);
19639
19718
  }
@@ -19653,17 +19732,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19653
19732
  /* INIT */
19654
19733
  if (GGML_OP_HAS_INIT[node->op]) {
19655
19734
  params.type = GGML_TASK_TYPE_INIT;
19656
- ggml_compute_forward(&params, node);
19735
+ ggml_compute_forward(&params, node, state);
19657
19736
  }
19658
19737
 
19659
19738
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
19660
19739
  // they do something more efficient than spinning (?)
19661
19740
  params.type = GGML_TASK_TYPE_COMPUTE;
19662
- ggml_compute_forward(&params, node);
19741
+ ggml_compute_forward(&params, node, state);
19663
19742
 
19664
19743
  if (GGML_OP_HAS_FINALIZE[node->op]) {
19665
19744
  params.type = GGML_TASK_TYPE_FINALIZE;
19666
- ggml_compute_forward(&params, node);
19745
+ ggml_compute_forward(&params, node, state);
19667
19746
  }
19668
19747
 
19669
19748
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -19702,7 +19781,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19702
19781
 
19703
19782
  if (state->ith < n_tasks) {
19704
19783
  if (GGML_OP_HAS_INIT[node->op]) {
19705
- ggml_compute_forward(&params, node);
19784
+ ggml_compute_forward(&params, node, state);
19706
19785
  }
19707
19786
  }
19708
19787
 
@@ -19723,7 +19802,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19723
19802
 
19724
19803
  if (state->ith < n_tasks) {
19725
19804
  params.type = GGML_TASK_TYPE_COMPUTE;
19726
- ggml_compute_forward(&params, node);
19805
+ ggml_compute_forward(&params, node, state);
19727
19806
  }
19728
19807
 
19729
19808
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -19974,6 +20053,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
19974
20053
  /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
19975
20054
  /*.abort_callback =*/ NULL,
19976
20055
  /*.abort_callback_data =*/ NULL,
20056
+ /*.current_chunk =*/ 0,
19977
20057
  };
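
The shared compute state gains a `current_chunk` field (initialized to 0 above), and `ggml_compute_forward` now receives the per-thread `state` so that `ggml_compute_forward_mul_mat` can reach the shared struct. The mul_mat body is outside this excerpt; presumably the field serves as an atomic counter that hands out matrix chunks to worker threads. A hedged sketch of that work-distribution pattern, with hypothetical names and not the shipped scheduler:

    #include <stdatomic.h>

    // Hypothetical chunked work distribution: each thread repeatedly claims the
    // next chunk index from a shared atomic counter until all chunks are done.
    typedef void (*chunk_fn)(int chunk, void * ctx);

    static void run_chunks(atomic_int * current_chunk, int n_chunks, chunk_fn fn, void * ctx) {
        int chunk = atomic_fetch_add(current_chunk, 1);
        while (chunk < n_chunks) {
            fn(chunk, ctx);
            chunk = atomic_fetch_add(current_chunk, 1);
        }
    }
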
19978
20058
  struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
19979
20059