llama_cpp 0.15.1 → 0.15.2

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -4,7 +4,6 @@
4
4
  #include "ggml-impl.h"
5
5
  #include "ggml-quants.h"
6
6
  #include "ggml.h"
7
- #include "sgemm.h"
8
7
 
9
8
  #if defined(_MSC_VER) || defined(__MINGW32__)
10
9
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,6 +36,10 @@
37
36
  #undef GGML_USE_LLAMAFILE
38
37
  #endif
39
38
 
39
+ #ifdef GGML_USE_LLAMAFILE
40
+ #include "sgemm.h"
41
+ #endif
42
+
40
43
  #if defined(_MSC_VER)
41
44
  // disable "possible loss of data" to avoid hundreds of casts
42
45
  // we should just be careful :)
@@ -109,6 +112,8 @@ typedef void * thread_ret_t;
109
112
 
110
113
  #endif
111
114
 
115
+ typedef pthread_t ggml_thread_t;
116
+
112
117
  #ifdef GGML_USE_CPU_HBM
113
118
  #include <hbwmalloc.h>
114
119
  #endif
@@ -160,9 +165,6 @@ void ggml_print_backtrace(void) {
160
165
  #define GGML_DEBUG 0
161
166
  #define GGML_GELU_FP16
162
167
  #define GGML_GELU_QUICK_FP16
163
- #define GGML_SILU_FP16
164
- // #define GGML_CROSS_ENTROPY_EXP_FP16
165
- // #define GGML_FLASH_ATTN_EXP_FP16
166
168
 
167
169
  #define GGML_SOFT_MAX_UNROLL 4
168
170
  #define GGML_VEC_DOT_UNROLL 2
@@ -313,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
313
315
  // precomputed quick gelu table for f16 (128 KB)
314
316
  static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
315
317
 
316
- // precomputed silu table for f16 (128 KB)
317
- static ggml_fp16_t ggml_table_silu_f16[1 << 16];
318
-
319
- // precomputed exp table for f16 (128 KB)
320
- static ggml_fp16_t ggml_table_exp_f16[1 << 16];
321
-
322
318
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
323
319
  float ggml_table_f32_f16[1 << 16];
324
320
 
@@ -1303,6 +1299,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1303
1299
  #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
1304
1300
  #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
1305
1301
  #define GGML_F16_VEC_FMA GGML_F32x4_FMA
1302
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
1303
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
1306
1304
  #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
1307
1305
  // Use vec_xl, not vec_ld, in case the load address is not aligned.
1308
1306
  #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
@@ -1534,6 +1532,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
1534
1532
  #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
1535
1533
  #endif
1536
1534
 
1535
+ //
1536
+ // ggml context
1537
+ //
1538
+
1539
+ struct ggml_context {
1540
+ size_t mem_size;
1541
+ void* mem_buffer;
1542
+ bool mem_buffer_owned;
1543
+ bool no_alloc;
1544
+ bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
1545
+
1546
+ int n_objects;
1547
+
1548
+ struct ggml_object* objects_begin;
1549
+ struct ggml_object* objects_end;
1550
+
1551
+ struct ggml_scratch scratch;
1552
+ struct ggml_scratch scratch_save;
1553
+ };
1554
+
1555
+ struct ggml_context_container {
1556
+ bool used;
1557
+
1558
+ struct ggml_context context;
1559
+ };
1560
+
1561
+ struct ggml_compute_state_shared {
1562
+ const struct ggml_cgraph* cgraph;
1563
+ const struct ggml_cplan* cplan;
1564
+
1565
+ int64_t perf_node_start_cycles;
1566
+ int64_t perf_node_start_time_us;
1567
+
1568
+ const int n_threads;
1569
+
1570
+ // synchronization primitives
1571
+ atomic_int n_active; // num active threads
1572
+ atomic_int node_n; // active graph node
1573
+ atomic_int node_task; // active graph node task phase
1574
+
1575
+ ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
1576
+ void* abort_callback_data;
1577
+
1578
+ atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1579
+ };
1580
+
1581
+ struct ggml_compute_state {
1582
+ ggml_thread_t thrd;
1583
+ int ith;
1584
+ struct ggml_compute_state_shared* shared;
1585
+ enum ggml_status ec;
1586
+ };
1587
+
1537
1588
  //
1538
1589
  // fundamental operations
1539
1590
  //
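Note: ggml_context and ggml_context_container move up to this point in the file (the later hunk that deletes their old definitions next to ggml_init is the other half of the move), and the compute-state structs are declared this early so that ggml_compute_forward_mul_mat can take a struct ggml_compute_state * argument. The new atomic_int current_chunk field is the shared counter behind the chunked matrix-multiplication scheduler added further down in this diff. A minimal standalone sketch of the shared-state/per-thread split, using plain pthreads and C11 atomics rather than the ggml types:

    #include <pthread.h>
    #include <stdatomic.h>

    struct shared_state { atomic_int current_chunk; int n_threads; };                 // one per graph compute
    struct worker_state { pthread_t thrd; int ith; struct shared_state * shared; };   // one per thread

    static void * worker_main(void * arg) {
        struct worker_state * w = (struct worker_state *) arg;
        // each worker claims work items from the shared atomic counter
        int chunk = atomic_fetch_add(&w->shared->current_chunk, 1);
        (void) chunk;
        return NULL;
    }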
@@ -1949,6 +2000,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
1949
2000
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1950
2001
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1951
2002
  inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
2003
+ inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
1952
2004
  // TODO: optimize performance
1953
2005
  inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
1954
2006
  inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
@@ -2024,52 +2076,291 @@ inline static float ggml_silu_f32(float x) {
2024
2076
  return x/(1.0f + expf(-x));
2025
2077
  }
2026
2078
 
2027
- //inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
2028
- // const uint16_t * i16 = (const uint16_t *) x;
2029
- // for (int i = 0; i < n; ++i) {
2030
- // y[i] = ggml_table_silu_f16[i16[i]];
2031
- // }
2032
- //}
2079
+ #if defined(__ARM_NEON)
2033
2080
 
2034
- #ifdef GGML_SILU_FP16
2035
- inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
2036
- uint16_t t;
2037
- for (int i = 0; i < n; ++i) {
2038
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
2039
- memcpy(&t, &fp16, sizeof(uint16_t));
2040
- y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
2041
- }
2042
- }
2081
+ // adapted from arm limited optimized routine
2082
+ // the maximum error is 1.45358 plus 0.5 ulps
2083
+ // numbers above 88.38 will flush to infinity
2084
+ // numbers beneath -103.97 will flush to zero
2085
+ inline static float32x4_t ggml_v_expf(float32x4_t x) {
2086
+ const float32x4_t r = vdupq_n_f32(0x1.8p23f);
2087
+ const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
2088
+ const float32x4_t n = vsubq_f32(z, r);
2089
+ const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
2090
+ vdupq_n_f32(0x1.7f7d1cp-20f));
2091
+ const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
2092
+ const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
2093
+ const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
2094
+ const float32x4_t u = vmulq_f32(b, b);
2095
+ const float32x4_t j = vfmaq_f32(
2096
+ vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
2097
+ vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
2098
+ vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
2099
+ if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
2100
+ return vfmaq_f32(k, j, k);
2101
+ const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
2102
+ const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
2103
+ const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
2104
+ return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
2105
+ vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
2106
+ }
2107
+
2108
+ // computes silu x/(1+exp(-x)) in single precision vector
2109
+ inline static float32x4_t ggml_v_silu(float32x4_t x) {
2110
+ const float32x4_t one = vdupq_n_f32(1.0f);
2111
+ const float32x4_t zero = vdupq_n_f32(0.0f);
2112
+ const float32x4_t neg_x = vsubq_f32(zero, x);
2113
+ const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
2114
+ const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
2115
+ return vdivq_f32(x, one_plus_exp_neg_x);
2116
+ }
2117
+
2118
+ #elif defined(__AVX512F__) && defined(__AVX512DQ__)
2119
+
2120
+ // adapted from arm limited optimized routine
2121
+ // the maximum error is 1.45358 plus 0.5 ulps
2122
+ // numbers above 88.38 will flush to infinity
2123
+ // numbers beneath -103.97 will flush to zero
2124
+ inline static __m512 ggml_v_expf(__m512 x) {
2125
+ const __m512 r = _mm512_set1_ps(0x1.8p23f);
2126
+ const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
2127
+ const __m512 n = _mm512_sub_ps(z, r);
2128
+ const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
2129
+ _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
2130
+ const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
2131
+ const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
2132
+ const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
2133
+ const __m512 u = _mm512_mul_ps(b, b);
2134
+ const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
2135
+ _mm512_set1_ps(0x1.573e2ep-5f)), u,
2136
+ _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
2137
+ _mm512_set1_ps(0x1.fffdb6p-2f))),
2138
+ u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
2139
+ if (_mm512_kortestz(c, c))
2140
+ return _mm512_fmadd_ps(j, k, k);
2141
+ const __m512i g = _mm512_and_si512(
2142
+ _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
2143
+ _mm512_set1_epi32(0x82000000u));
2144
+ const __m512 s1 =
2145
+ _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
2146
+ const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
2147
+ const __mmask16 d =
2148
+ _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
2149
+ return _mm512_mask_blend_ps(
2150
+ d, _mm512_mask_blend_ps(
2151
+ c, _mm512_fmadd_ps(k, j, k),
2152
+ _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
2153
+ _mm512_mul_ps(s1, s1));
2154
+ }
2155
+
2156
+ // computes silu x/(1+exp(-x)) in single precision vector
2157
+ inline static __m512 ggml_v_silu(__m512 x) {
2158
+ const __m512 one = _mm512_set1_ps(1);
2159
+ const __m512 zero = _mm512_setzero_ps();
2160
+ const __m512 neg_x = _mm512_sub_ps(zero, x);
2161
+ const __m512 exp_neg_x = ggml_v_expf(neg_x);
2162
+ const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
2163
+ return _mm512_div_ps(x, one_plus_exp_neg_x);
2164
+ }
2165
+
2166
+ #elif defined(__AVX2__) && defined(__FMA__)
2167
+
2168
+ // adapted from arm limited optimized routine
2169
+ // the maximum error is 1.45358 plus 0.5 ulps
2170
+ // numbers above 88.38 will flush to infinity
2171
+ // numbers beneath -103.97 will flush to zero
2172
+ inline static __m256 ggml_v_expf(__m256 x) {
2173
+ const __m256 r = _mm256_set1_ps(0x1.8p23f);
2174
+ const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
2175
+ const __m256 n = _mm256_sub_ps(z, r);
2176
+ const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
2177
+ _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
2178
+ const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
2179
+ const __m256 k = _mm256_castsi256_ps(
2180
+ _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
2181
+ const __m256i c = _mm256_castps_si256(
2182
+ _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
2183
+ _mm256_set1_ps(126), _CMP_GT_OQ));
2184
+ const __m256 u = _mm256_mul_ps(b, b);
2185
+ const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
2186
+ _mm256_set1_ps(0x1.573e2ep-5f)), u,
2187
+ _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
2188
+ _mm256_set1_ps(0x1.fffdb6p-2f))),
2189
+ u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
2190
+ if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
2191
+ return _mm256_fmadd_ps(j, k, k);
2192
+ const __m256i g = _mm256_and_si256(
2193
+ _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
2194
+ _mm256_set1_epi32(0x82000000u));
2195
+ const __m256 s1 =
2196
+ _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
2197
+ const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
2198
+ const __m256i d = _mm256_castps_si256(
2199
+ _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
2200
+ _mm256_set1_ps(192), _CMP_GT_OQ));
2201
+ return _mm256_or_ps(
2202
+ _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
2203
+ _mm256_andnot_ps(
2204
+ _mm256_castsi256_ps(d),
2205
+ _mm256_or_ps(
2206
+ _mm256_and_ps(_mm256_castsi256_ps(c),
2207
+ _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
2208
+ _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
2209
+ }
2210
+
2211
+ // computes silu x/(1+exp(-x)) in single precision vector
2212
+ inline static __m256 ggml_v_silu(__m256 x) {
2213
+ const __m256 one = _mm256_set1_ps(1);
2214
+ const __m256 zero = _mm256_setzero_ps();
2215
+ const __m256 neg_x = _mm256_sub_ps(zero, x);
2216
+ const __m256 exp_neg_x = ggml_v_expf(neg_x);
2217
+ const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
2218
+ return _mm256_div_ps(x, one_plus_exp_neg_x);
2219
+ }
2220
+
2221
+ #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
2222
+
2223
+ #if defined(__FMA__)
2224
+ #define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
2225
+ #define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
2043
2226
  #else
2044
- inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
2045
- for (int i = 0; i < n; ++i) {
2227
+ #define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
2228
+ #define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
2229
+ #endif
2230
+
2231
+ // adapted from arm limited optimized routine
2232
+ // the maximum error is 1.45358 plus 0.5 ulps
2233
+ // numbers above 88.38 will flush to infinity
2234
+ // numbers beneath -103.97 will flush to zero
2235
+ inline static __m128 ggml_v_expf(__m128 x) {
2236
+ const __m128 r = _mm_set1_ps(0x1.8p23f);
2237
+ const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
2238
+ const __m128 n = _mm_sub_ps(z, r);
2239
+ const __m128 b =
2240
+ NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
2241
+ const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
2242
+ const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
2243
+ const __m128i c =
2244
+ _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
2245
+ const __m128 u = _mm_mul_ps(b, b);
2246
+ const __m128 j =
2247
+ MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
2248
+ MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
2249
+ u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
2250
+ if (!_mm_movemask_epi8(c))
2251
+ return MADD128(j, k, k);
2252
+ const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
2253
+ _mm_set1_epi32(0x82000000u));
2254
+ const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
2255
+ const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
2256
+ const __m128i d =
2257
+ _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
2258
+ return _mm_or_ps(
2259
+ _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
2260
+ _mm_andnot_ps(_mm_castsi128_ps(d),
2261
+ _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
2262
+ _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
2263
+ }
2264
+
2265
+ // computes silu x/(1+exp(-x)) in single precision vector
2266
+ inline static __m128 ggml_v_silu(__m128 x) {
2267
+ const __m128 one = _mm_set1_ps(1);
2268
+ const __m128 zero = _mm_setzero_ps();
2269
+ const __m128 neg_x = _mm_sub_ps(zero, x);
2270
+ const __m128 exp_neg_x = ggml_v_expf(neg_x);
2271
+ const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
2272
+ return _mm_div_ps(x, one_plus_exp_neg_x);
2273
+ }
2274
+
2275
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__
2276
+
2277
+ static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
2278
+ int i = 0;
2279
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
2280
+ for (; i + 15 < n; i += 16) {
2281
+ _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
2282
+ }
2283
+ #elif defined(__AVX2__) && defined(__FMA__)
2284
+ for (; i + 7 < n; i += 8) {
2285
+ _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
2286
+ }
2287
+ #elif defined(__SSE2__)
2288
+ for (; i + 3 < n; i += 4) {
2289
+ _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
2290
+ }
2291
+ #elif defined(__ARM_NEON)
2292
+ for (; i + 3 < n; i += 4) {
2293
+ vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
2294
+ }
2295
+ #endif
2296
+ for (; i < n; ++i) {
2046
2297
  y[i] = ggml_silu_f32(x[i]);
2047
2298
  }
2048
2299
  }
2300
+
2301
+ static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
2302
+ int i = 0;
2303
+ ggml_float sum = 0;
2304
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
2305
+ for (; i + 15 < n; i += 16) {
2306
+ __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
2307
+ _mm512_set1_ps(max)));
2308
+ _mm512_storeu_ps(y + i, val);
2309
+ sum += (ggml_float)_mm512_reduce_add_ps(val);
2310
+ }
2311
+ #elif defined(__AVX2__) && defined(__FMA__)
2312
+ for (; i + 7 < n; i += 8) {
2313
+ __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
2314
+ _mm256_set1_ps(max)));
2315
+ _mm256_storeu_ps(y + i, val);
2316
+ __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
2317
+ _mm256_castps256_ps128(val));
2318
+ val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
2319
+ val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
2320
+ sum += (ggml_float)_mm_cvtss_f32(val2);
2321
+ }
2322
+ #elif defined(__SSE2__)
2323
+ for (; i + 3 < n; i += 4) {
2324
+ __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
2325
+ _mm_set1_ps(max)));
2326
+ _mm_storeu_ps(y + i, val);
2327
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
2328
+ val = _mm_add_ps(val, _mm_movehl_ps(val, val));
2329
+ val = _mm_add_ss(val, _mm_movehdup_ps(val));
2330
+ #else
2331
+ __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
2332
+ val = _mm_add_ps(val, tmp);
2333
+ tmp = _mm_movehl_ps(tmp, val);
2334
+ val = _mm_add_ss(val, tmp);
2335
+ #endif
2336
+ sum += (ggml_float)_mm_cvtss_f32(val);
2337
+ }
2338
+ #elif defined(__ARM_NEON)
2339
+ for (; i + 3 < n; i += 4) {
2340
+ float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
2341
+ vdupq_n_f32(max)));
2342
+ vst1q_f32(y + i, val);
2343
+ sum += (ggml_float)vaddvq_f32(val);
2344
+ }
2049
2345
  #endif
2346
+ for (; i < n; ++i) {
2347
+ float val = expf(x[i] - max);
2348
+ sum += (ggml_float)val;
2349
+ y[i] = val;
2350
+ }
2351
+ return sum;
2352
+ }
2050
2353
 
2051
2354
  inline static float ggml_silu_backward_f32(float x, float dy) {
2052
2355
  const float s = 1.0f/(1.0f + expf(-x));
2053
2356
  return dy*s*(1.0f + x*(1.0f - s));
2054
2357
  }
2055
2358
 
2056
- #ifdef GGML_SILU_FP16
2057
- inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
2058
- for (int i = 0; i < n; ++i) {
2059
- // we did not use x[i] to compute forward silu but its f16 equivalent
2060
- // take derivative at f16 of x[i]:
2061
- ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
2062
- float usedx = GGML_FP16_TO_FP32(fp16);
2063
- dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
2064
- }
2065
- }
2066
- #else
2067
2359
  inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
2068
2360
  for (int i = 0; i < n; ++i) {
2069
2361
  dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
2070
2362
  }
2071
2363
  }
2072
- #endif
2073
2364
 
2074
2365
  inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
2075
2366
  #ifndef GGML_USE_ACCELERATE
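Note: the GGML_SILU_FP16 lookup tables (ggml_table_silu_f16, ggml_table_exp_f16) are dropped in favour of the per-ISA ggml_v_expf/ggml_v_silu routines above, with ggml_vec_silu_f32 and the new ggml_vec_soft_max_f32 falling back to scalar code for the tail. For reference, this is the scalar math every SIMD path reduces to (a sketch mirroring the fallback loops, not the ggml code itself):

    #include <math.h>

    // SiLU: x * sigmoid(x)
    static float silu_ref(float x) {
        return x / (1.0f + expf(-x));
    }

    // soft-max accumulation: exponentiate max-shifted inputs, store them, return the sum
    static double vec_soft_max_ref(int n, float * y, const float * x, float max) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            const float val = expf(x[i] - max);   // subtracting the row max keeps expf in range
            y[i] = val;
            sum += (double) val;
        }
        return sum;
    }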
@@ -2185,7 +2476,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
2185
2476
  "SOFT_MAX_BACK",
2186
2477
  "ROPE",
2187
2478
  "ROPE_BACK",
2188
- "ALIBI",
2189
2479
  "CLAMP",
2190
2480
  "CONV_TRANSPOSE_1D",
2191
2481
  "IM2COL",
@@ -2227,7 +2517,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
2227
2517
  "CROSS_ENTROPY_LOSS_BACK",
2228
2518
  };
2229
2519
 
2230
- static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
2520
+ static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
2231
2521
 
2232
2522
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
2233
2523
  "none",
@@ -2276,7 +2566,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
2276
2566
  "soft_max_back(x)",
2277
2567
  "rope(x)",
2278
2568
  "rope_back(x)",
2279
- "alibi(x)",
2280
2569
  "clamp(x)",
2281
2570
  "conv_transpose_1d(x)",
2282
2571
  "im2col(x)",
@@ -2318,7 +2607,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
2318
2607
  "cross_entropy_loss_back(x,y)",
2319
2608
  };
2320
2609
 
2321
- static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
2610
+ static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
2322
2611
 
2323
2612
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
2324
2613
 
@@ -2331,6 +2620,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
2331
2620
  "TANH",
2332
2621
  "ELU",
2333
2622
  "RELU",
2623
+ "SIGMOID",
2334
2624
  "GELU",
2335
2625
  "GELU_QUICK",
2336
2626
  "SILU",
@@ -2338,7 +2628,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
2338
2628
  "HARDSIGMOID",
2339
2629
  };
2340
2630
 
2341
- static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
2631
+ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
2342
2632
 
2343
2633
 
2344
2634
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2380,32 +2670,6 @@ static void ggml_setup_op_has_task_pass(void) {
2380
2670
  }
2381
2671
  }
2382
2672
 
2383
- //
2384
- // ggml context
2385
- //
2386
-
2387
- struct ggml_context {
2388
- size_t mem_size;
2389
- void * mem_buffer;
2390
- bool mem_buffer_owned;
2391
- bool no_alloc;
2392
- bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
2393
-
2394
- int n_objects;
2395
-
2396
- struct ggml_object * objects_begin;
2397
- struct ggml_object * objects_end;
2398
-
2399
- struct ggml_scratch scratch;
2400
- struct ggml_scratch scratch_save;
2401
- };
2402
-
2403
- struct ggml_context_container {
2404
- bool used;
2405
-
2406
- struct ggml_context context;
2407
- };
2408
-
2409
2673
  //
2410
2674
  // NUMA support
2411
2675
  //
@@ -2819,6 +3083,16 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
2819
3083
  (t0->ne[3] == t1->ne[3] );
2820
3084
  }
2821
3085
 
3086
+ bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
3087
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
3088
+
3089
+ return
3090
+ (t0->nb[0] == t1->nb[0] ) &&
3091
+ (t0->nb[1] == t1->nb[1] ) &&
3092
+ (t0->nb[2] == t1->nb[2] ) &&
3093
+ (t0->nb[3] == t1->nb[3] );
3094
+ }
3095
+
2822
3096
  // check if t1 can be represented as a repeatition of t0
2823
3097
  static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
2824
3098
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -2878,8 +3152,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2878
3152
  float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
2879
3153
  ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
2880
3154
  ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
2881
- ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
2882
- ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
2883
3155
  }
2884
3156
 
2885
3157
  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -3163,6 +3435,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
3163
3435
 
3164
3436
  struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
3165
3437
 
3438
+ #ifdef __clang__
3439
+ // temporary until ggml_tensor::backend is removed
3440
+ #pragma clang diagnostic push
3441
+ #pragma clang diagnostic ignored "-Wdeprecated-declarations"
3442
+ #endif
3443
+
3166
3444
  *result = (struct ggml_tensor) {
3167
3445
  /*.type =*/ type,
3168
3446
  /*.backend =*/ GGML_BACKEND_TYPE_CPU,
@@ -3185,6 +3463,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
3185
3463
  /*.padding =*/ { 0 },
3186
3464
  };
3187
3465
 
3466
+ #ifdef __clang__
3467
+ #pragma clang diagnostic pop
3468
+ #endif
3469
+
3188
3470
  // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
3189
3471
  //ggml_assert_aligned(result->data);
3190
3472
 
@@ -4563,6 +4845,20 @@ struct ggml_tensor * ggml_leaky_relu(
4563
4845
  return result;
4564
4846
  }
4565
4847
 
4848
+ // ggml_sigmoid
4849
+
4850
+ struct ggml_tensor * ggml_sigmoid(
4851
+ struct ggml_context * ctx,
4852
+ struct ggml_tensor * a) {
4853
+ return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
4854
+ }
4855
+
4856
+ struct ggml_tensor * ggml_sigmoid_inplace(
4857
+ struct ggml_context * ctx,
4858
+ struct ggml_tensor * a) {
4859
+ return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
4860
+ }
4861
+
4566
4862
  // ggml_gelu
4567
4863
 
4568
4864
  struct ggml_tensor * ggml_gelu(
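Note: the hunk above adds ggml_sigmoid/ggml_sigmoid_inplace on top of the new GGML_UNARY_OP_SIGMOID. A hypothetical usage sketch, assuming the usual ggml context/graph helpers (ggml_init, ggml_new_graph, ggml_graph_compute_with_ctx):

    #include "ggml.h"

    static void sigmoid_demo(void) {
        struct ggml_init_params ip = { /*.mem_size   =*/ 16*1024*1024,
                                       /*.mem_buffer =*/ NULL,
                                       /*.no_alloc   =*/ false };
        struct ggml_context * ctx = ggml_init(ip);
        struct ggml_tensor  * x   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor  * y   = ggml_sigmoid(ctx, x);      // y[i] = 1 / (1 + exp(-x[i]))
        struct ggml_cgraph  * gf  = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        // ... fill x->data, then:
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);
        ggml_free(ctx);
    }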
@@ -5646,7 +5942,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
5646
5942
  struct ggml_context * ctx,
5647
5943
  struct ggml_tensor * a,
5648
5944
  struct ggml_tensor * mask,
5649
- struct ggml_tensor * pos,
5650
5945
  float scale,
5651
5946
  float max_bias,
5652
5947
  bool inplace) {
@@ -5660,18 +5955,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
5660
5955
  GGML_ASSERT(mask->ne[1] >= a->ne[1]);
5661
5956
  }
5662
5957
 
5663
- if (pos) {
5664
- GGML_ASSERT(ggml_is_vector(pos));
5665
- GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
5666
- GGML_ASSERT(pos->ne[0] == a->ne[0]);
5667
- }
5668
-
5669
- if (pos && mask) {
5670
- GGML_ASSERT(pos->type == mask->type);
5671
- }
5672
-
5673
5958
  if (max_bias > 0.0f) {
5674
- GGML_ASSERT(pos);
5959
+ GGML_ASSERT(mask);
5675
5960
  }
5676
5961
 
5677
5962
  bool is_node = false;
@@ -5689,7 +5974,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
5689
5974
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5690
5975
  result->src[0] = a;
5691
5976
  result->src[1] = mask;
5692
- result->src[2] = pos;
5693
5977
 
5694
5978
  return result;
5695
5979
  }
@@ -5697,23 +5981,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
5697
5981
  struct ggml_tensor * ggml_soft_max(
5698
5982
  struct ggml_context * ctx,
5699
5983
  struct ggml_tensor * a) {
5700
- return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
5984
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
5701
5985
  }
5702
5986
 
5703
5987
  struct ggml_tensor * ggml_soft_max_inplace(
5704
5988
  struct ggml_context * ctx,
5705
5989
  struct ggml_tensor * a) {
5706
- return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
5990
+ return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
5707
5991
  }
5708
5992
 
5709
5993
  struct ggml_tensor * ggml_soft_max_ext(
5710
5994
  struct ggml_context * ctx,
5711
5995
  struct ggml_tensor * a,
5712
5996
  struct ggml_tensor * mask,
5713
- struct ggml_tensor * pos,
5714
5997
  float scale,
5715
5998
  float max_bias) {
5716
- return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
5999
+ return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
5717
6000
  }
5718
6001
 
5719
6002
  // ggml_soft_max_back
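Note: ggml_soft_max_impl/ggml_soft_max_ext lose the separate pos tensor; together with the removal of GGML_OP_ALIBI elsewhere in this diff, ALiBi is now driven purely by mask plus max_bias. A sketch of the call-site change (kq, kq_mask, n_embd_head and max_bias are hypothetical names):

    // before this release: ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, scale, max_bias);
    // from this release on: the positional bias is derived from the mask and max_bias alone
    struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, kq_mask,
                                                   1.0f/sqrtf((float) n_embd_head),   // scale; needs <math.h>
                                                   max_bias);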
@@ -5928,37 +6211,6 @@ struct ggml_tensor * ggml_rope_back(
5928
6211
  return result;
5929
6212
  }
5930
6213
 
5931
- // ggml_alibi
5932
-
5933
- struct ggml_tensor * ggml_alibi(
5934
- struct ggml_context * ctx,
5935
- struct ggml_tensor * a,
5936
- int n_past,
5937
- int n_head,
5938
- float bias_max) {
5939
- GGML_ASSERT(n_past >= 0);
5940
- bool is_node = false;
5941
-
5942
- if (a->grad) {
5943
- GGML_ASSERT(false); // TODO: implement backward
5944
- is_node = true;
5945
- }
5946
-
5947
- // TODO: when implement backward, fix this:
5948
- //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5949
- struct ggml_tensor * result = ggml_view_tensor(ctx, a);
5950
-
5951
- int32_t op_params[3] = { n_past, n_head };
5952
- memcpy(op_params + 2, &bias_max, sizeof(float));
5953
- ggml_set_op_params(result, op_params, sizeof(op_params));
5954
-
5955
- result->op = GGML_OP_ALIBI;
5956
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5957
- result->src[0] = a;
5958
-
5959
- return result;
5960
- }
5961
-
5962
6214
  // ggml_clamp
5963
6215
 
5964
6216
  struct ggml_tensor * ggml_clamp(
@@ -6308,7 +6560,10 @@ struct ggml_tensor * ggml_pool_2d(
6308
6560
  static struct ggml_tensor * ggml_upscale_impl(
6309
6561
  struct ggml_context * ctx,
6310
6562
  struct ggml_tensor * a,
6311
- int scale_factor) {
6563
+ int ne0,
6564
+ int ne1,
6565
+ int ne2,
6566
+ int ne3) {
6312
6567
  bool is_node = false;
6313
6568
 
6314
6569
  if (a->grad) {
@@ -6316,19 +6571,45 @@ static struct ggml_tensor * ggml_upscale_impl(
6316
6571
  is_node = true;
6317
6572
  }
6318
6573
 
6574
+ GGML_ASSERT(a->ne[0] <= ne0);
6575
+ GGML_ASSERT(a->ne[1] <= ne1);
6576
+ GGML_ASSERT(a->ne[2] <= ne2);
6577
+ GGML_ASSERT(a->ne[3] <= ne3);
6578
+
6319
6579
  struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
6320
- a->ne[0] * scale_factor,
6321
- a->ne[1] * scale_factor,
6322
- a->ne[2], a->ne[3]);
6580
+ ne0,
6581
+ ne1,
6582
+ ne2,
6583
+ ne3
6584
+ );
6323
6585
 
6324
6586
  result->op = GGML_OP_UPSCALE;
6325
- result->op_params[0] = scale_factor;
6587
+
6326
6588
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6327
6589
  result->src[0] = a;
6328
6590
 
6329
6591
  return result;
6330
6592
  }
6331
6593
 
6594
+ struct ggml_tensor * ggml_upscale(
6595
+ struct ggml_context * ctx,
6596
+ struct ggml_tensor * a,
6597
+ int scale_factor) {
6598
+ return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
6599
+ }
6600
+
6601
+ struct ggml_tensor * ggml_upscale_ext(
6602
+ struct ggml_context * ctx,
6603
+ struct ggml_tensor * a,
6604
+ int ne0,
6605
+ int ne1,
6606
+ int ne2,
6607
+ int ne3) {
6608
+ return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
6609
+ }
6610
+
6611
+ // ggml_pad
6612
+
6332
6613
  struct ggml_tensor * ggml_pad(
6333
6614
  struct ggml_context * ctx,
6334
6615
  struct ggml_tensor * a,
@@ -6353,12 +6634,7 @@ struct ggml_tensor * ggml_pad(
6353
6634
  return result;
6354
6635
  }
6355
6636
 
6356
- struct ggml_tensor * ggml_upscale(
6357
- struct ggml_context * ctx,
6358
- struct ggml_tensor * a,
6359
- int scale_factor) {
6360
- return ggml_upscale_impl(ctx, a, scale_factor);
6361
- }
6637
+ // ggml_arange
6362
6638
 
6363
6639
  struct ggml_tensor * ggml_arange(
6364
6640
  struct ggml_context * ctx,
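Note: ggml_upscale_impl now takes an explicit target shape instead of a single integer factor; ggml_upscale keeps the old behaviour as a wrapper and the new ggml_upscale_ext exposes the general form. The two call shapes side by side (ctx and a are hypothetical):

    // uniform 2x upscale of the first two dims, same result as the old API
    struct ggml_tensor * u0 = ggml_upscale(ctx, a, 2);

    // arbitrary per-dimension targets; each must be >= the source dim (asserted in ggml_upscale_impl)
    struct ggml_tensor * u1 = ggml_upscale_ext(ctx, a, a->ne[0]*2, a->ne[1]*3, a->ne[2], a->ne[3]);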
@@ -6380,6 +6656,8 @@ struct ggml_tensor * ggml_arange(
6380
6656
  return result;
6381
6657
  }
6382
6658
 
6659
+ // ggml_timestep_embedding
6660
+
6383
6661
  struct ggml_tensor * ggml_timestep_embedding(
6384
6662
  struct ggml_context * ctx,
6385
6663
  struct ggml_tensor * timesteps,
@@ -6486,9 +6764,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
6486
6764
  struct ggml_tensor * k,
6487
6765
  struct ggml_tensor * v,
6488
6766
  struct ggml_tensor * mask,
6489
- float scale) {
6767
+ float scale,
6768
+ float max_bias) {
6490
6769
  GGML_ASSERT(ggml_can_mul_mat(k, q));
6491
6770
  // TODO: check if vT can be multiplied by (k*qT)
6771
+
6492
6772
  if (mask) {
6493
6773
  GGML_ASSERT(ggml_is_contiguous(mask));
6494
6774
  GGML_ASSERT(mask->ne[2] == 1);
@@ -6498,6 +6778,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
6498
6778
  //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
6499
6779
  }
6500
6780
 
6781
+ if (max_bias > 0.0f) {
6782
+ GGML_ASSERT(mask);
6783
+ }
6784
+
6501
6785
  bool is_node = false;
6502
6786
 
6503
6787
  if (q->grad || k->grad || v->grad) {
@@ -6508,7 +6792,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
6508
6792
  int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
6509
6793
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
6510
6794
 
6511
- float params[] = { scale };
6795
+ float params[] = { scale, max_bias };
6512
6796
  ggml_set_op_params(result, params, sizeof(params));
6513
6797
 
6514
6798
  result->op = GGML_OP_FLASH_ATTN_EXT;
@@ -6528,7 +6812,7 @@ void ggml_flash_attn_ext_set_prec(
6528
6812
 
6529
6813
  const int32_t prec_i32 = (int32_t) prec;
6530
6814
 
6531
- ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
6815
+ ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
6532
6816
  }
6533
6817
 
6534
6818
  // ggml_flash_ff
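Note: ggml_flash_attn_ext now stores two floats in op_params ({ scale, max_bias }), which is why ggml_flash_attn_ext_set_prec writes the precision at index 2 instead of 1, and max_bias > 0 now requires a mask, mirroring soft_max. A sketch of reading the parameters back from a GGML_OP_FLASH_ATTN_EXT tensor t (hypothetical variable; needs <string.h>):

    float scale    = 0.0f;
    float max_bias = 0.0f;
    memcpy(&scale,    (const char *) t->op_params + 0*sizeof(float), sizeof(float));
    memcpy(&max_bias, (const char *) t->op_params + 1*sizeof(float), sizeof(float));
    const int32_t prec = ((const int32_t *) t->op_params)[2];   // set via ggml_flash_attn_ext_set_prec()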
@@ -10892,6 +11176,52 @@ static void ggml_compute_forward_relu(
10892
11176
  }
10893
11177
  }
10894
11178
 
11179
+ // ggml_compute_forward_sigmoid
11180
+
11181
+ static void ggml_compute_forward_sigmoid_f32(
11182
+ const struct ggml_compute_params * params,
11183
+ struct ggml_tensor * dst) {
11184
+
11185
+ const struct ggml_tensor * src0 = dst->src[0];
11186
+
11187
+ assert(params->ith == 0);
11188
+ assert(ggml_are_same_shape(src0, dst));
11189
+
11190
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
11191
+ return;
11192
+ }
11193
+
11194
+ const int n = ggml_nrows(src0);
11195
+ const int nc = src0->ne[0];
11196
+
11197
+ assert(dst->nb[0] == sizeof(float));
11198
+ assert(src0->nb[0] == sizeof(float));
11199
+
11200
+ for (int i = 0; i < n; i++) {
11201
+ ggml_vec_sigmoid_f32(nc,
11202
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
11203
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
11204
+ }
11205
+ }
11206
+
11207
+ static void ggml_compute_forward_sigmoid(
11208
+ const struct ggml_compute_params * params,
11209
+ struct ggml_tensor * dst) {
11210
+
11211
+ const struct ggml_tensor * src0 = dst->src[0];
11212
+
11213
+ switch (src0->type) {
11214
+ case GGML_TYPE_F32:
11215
+ {
11216
+ ggml_compute_forward_sigmoid_f32(params, dst);
11217
+ } break;
11218
+ default:
11219
+ {
11220
+ GGML_ASSERT(false);
11221
+ } break;
11222
+ }
11223
+ }
11224
+
10895
11225
  // ggml_compute_forward_gelu
10896
11226
 
10897
11227
  static void ggml_compute_forward_gelu_f32(
@@ -11742,48 +12072,139 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
11742
12072
  }
11743
12073
  #endif
11744
12074
 
11745
- static void ggml_compute_forward_mul_mat(
11746
- const struct ggml_compute_params * params,
11747
- struct ggml_tensor * dst) {
12075
+ static void ggml_compute_forward_mul_mat_one_chunk(
12076
+ const struct ggml_compute_params * params,
12077
+ struct ggml_tensor * dst,
12078
+ const int64_t num_rows_per_vec_dot,
12079
+ const int64_t ir0_start,
12080
+ const int64_t ir0_end,
12081
+ const int64_t ir1_start,
12082
+ const int64_t ir1_end) {
11748
12083
 
11749
12084
  const struct ggml_tensor * src0 = dst->src[0];
11750
12085
  const struct ggml_tensor * src1 = dst->src[1];
11751
12086
 
11752
- int64_t t0 = ggml_perf_time_us();
11753
- UNUSED(t0);
11754
-
11755
12087
  GGML_TENSOR_BINARY_OP_LOCALS
11756
12088
 
11757
- const int ith = params->ith;
11758
- const int nth = params->nth;
11759
-
11760
12089
  const enum ggml_type type = src0->type;
11761
12090
 
11762
12091
  const bool src1_cont = ggml_is_contiguous(src1);
11763
12092
 
11764
- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
11765
- enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
11766
- ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
11767
- int64_t const vec_dot_num_rows = type_traits[type].nrows;
12093
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
12094
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
11768
12095
 
11769
- GGML_ASSERT(ne0 == ne01);
11770
- GGML_ASSERT(ne1 == ne11);
11771
- GGML_ASSERT(ne2 == ne12);
11772
- GGML_ASSERT(ne3 == ne13);
12096
+ // broadcast factors
12097
+ const int64_t r2 = ne12 / ne02;
12098
+ const int64_t r3 = ne13 / ne03;
11773
12099
 
11774
- // we don't support permuted src0 or src1
11775
- GGML_ASSERT(nb00 == ggml_type_size(type));
11776
- GGML_ASSERT(nb10 == ggml_type_size(src1->type));
12100
+ //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
11777
12101
 
11778
- // dst cannot be transposed or permuted
11779
- GGML_ASSERT(nb0 == sizeof(float));
11780
- GGML_ASSERT(nb0 <= nb1);
11781
- GGML_ASSERT(nb1 <= nb2);
11782
- GGML_ASSERT(nb2 <= nb3);
12102
+ // threads with no work simply yield (not sure if it helps)
12103
+ if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
12104
+ return;
12105
+ }
12106
+
12107
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
12108
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
12109
+
12110
+ assert(ne12 % ne02 == 0);
12111
+ assert(ne13 % ne03 == 0);
12112
+
12113
+ // block-tiling attempt
12114
+ const int64_t blck_0 = 16;
12115
+ const int64_t blck_1 = 16;
12116
+
12117
+ const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
12118
+
12119
+ // attempt to reduce false-sharing (does not seem to make a difference)
12120
+ // 16 * 2, accounting for mmla kernels
12121
+ float tmp[32];
12122
+
12123
+ for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
12124
+ for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
12125
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
12126
+ const int64_t i13 = (ir1 / (ne12 * ne1));
12127
+ const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
12128
+ const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
12129
+
12130
+ // broadcast src0 into src1
12131
+ const int64_t i03 = i13 / r3;
12132
+ const int64_t i02 = i12 / r2;
12133
+
12134
+ const int64_t i1 = i11;
12135
+ const int64_t i2 = i12;
12136
+ const int64_t i3 = i13;
12137
+
12138
+ const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
12139
+
12140
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
12141
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
12142
+ // the original src1 data pointer, so we should index using the indices directly
12143
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
12144
+ const char * src1_col = (const char*)wdata +
12145
+ (src1_cont || src1->type != vec_dot_type
12146
+ ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
12147
+ : (i11 * nb11 + i12 * nb12 + i13 * nb13));
12148
+ float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
12149
+
12150
+ //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
12151
+ // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
12152
+ //}
12153
+
12154
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
12155
+ vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
12156
+ }
12157
+
12158
+ for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
12159
+ memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
12160
+ }
12161
+ }
12162
+ }
12163
+ }
12164
+ }
12165
+
12166
+ static void ggml_compute_forward_mul_mat(
12167
+ const struct ggml_compute_params * params,
12168
+ struct ggml_tensor * dst,
12169
+ struct ggml_compute_state * state) {
12170
+
12171
+ const struct ggml_tensor * src0 = dst->src[0];
12172
+ const struct ggml_tensor * src1 = dst->src[1];
12173
+
12174
+ int64_t t0 = ggml_perf_time_us();
12175
+ UNUSED(t0);
12176
+
12177
+ GGML_TENSOR_BINARY_OP_LOCALS
12178
+
12179
+ const int ith = params->ith;
12180
+ const int nth = params->nth;
12181
+
12182
+ const enum ggml_type type = src0->type;
12183
+
12184
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
12185
+ ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
12186
+ int64_t const vec_dot_num_rows = type_traits[type].nrows;
12187
+
12188
+ GGML_ASSERT(ne0 == ne01);
12189
+ GGML_ASSERT(ne1 == ne11);
12190
+ GGML_ASSERT(ne2 == ne12);
12191
+ GGML_ASSERT(ne3 == ne13);
12192
+
12193
+ // we don't support permuted src0 or src1
12194
+ GGML_ASSERT(nb00 == ggml_type_size(type));
12195
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
12196
+
12197
+ // dst cannot be transposed or permuted
12198
+ GGML_ASSERT(nb0 == sizeof(float));
12199
+ GGML_ASSERT(nb0 <= nb1);
12200
+ GGML_ASSERT(nb1 <= nb2);
12201
+ GGML_ASSERT(nb2 <= nb3);
11783
12202
 
11784
12203
  // broadcast factors
11785
- const int64_t r2 = ne12/ne02;
11786
- const int64_t r3 = ne13/ne03;
12204
+ const int64_t r2 = ne12 / ne02;
12205
+ const int64_t r3 = ne13 / ne03;
12206
+ UNUSED(r2);
12207
+ UNUSED(r3);
11787
12208
 
11788
12209
  // nb01 >= nb00 - src0 is not transposed
11789
12210
  // compute by src0 rows
@@ -11865,6 +12286,8 @@ static void ggml_compute_forward_mul_mat(
11865
12286
  #endif
11866
12287
 
11867
12288
  #if GGML_USE_LLAMAFILE
12289
+ const bool src1_cont = ggml_is_contiguous(src1);
12290
+
11868
12291
  if (src1_cont) {
11869
12292
  for (int64_t i13 = 0; i13 < ne13; i13++)
11870
12293
  for (int64_t i12 = 0; i12 < ne12; i12++)
@@ -11890,6 +12313,8 @@ UseGgmlGemm1:;
11890
12313
  if (ith != 0) {
11891
12314
  return;
11892
12315
  }
12316
+ // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
12317
+ atomic_store(&state->shared->current_chunk, nth);
11893
12318
  if (src1->type != vec_dot_type) {
11894
12319
  char * wdata = params->wdata;
11895
12320
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11914,11 +12339,11 @@ UseGgmlGemm1:;
11914
12339
  return;
11915
12340
  }
11916
12341
 
11917
- const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
11918
- const size_t row_size = ggml_row_size(vec_dot_type, ne10);
11919
-
11920
12342
  #if GGML_USE_LLAMAFILE
11921
12343
  if (src1->type != vec_dot_type) {
12344
+ const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
12345
+ const size_t row_size = ggml_row_size(vec_dot_type, ne10);
12346
+
11922
12347
  for (int64_t i13 = 0; i13 < ne13; i13++)
11923
12348
  for (int64_t i12 = 0; i12 < ne12; i12++)
11924
12349
  if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -11939,98 +12364,87 @@ UseGgmlGemm1:;
11939
12364
  UseGgmlGemm2:;
11940
12365
  #endif
11941
12366
 
11942
- const int64_t nr0 = ne01; // src0 rows
11943
- const int64_t nr1 = ne1*ne12*ne13; // src1 rows
11944
-
11945
- //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
11946
-
11947
- // distribute the thread work across the inner or outer loop based on which one is larger
11948
-
11949
- const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
11950
- const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
11951
-
11952
- const int64_t ith0 = ith % nth0;
11953
- const int64_t ith1 = ith / nth0;
11954
-
11955
- const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
11956
- const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
11957
-
11958
- const int64_t ir010 = dr0*ith0;
11959
- const int64_t ir011 = MIN(ir010 + dr0, nr0);
11960
-
11961
- const int64_t ir110 = dr1*ith1;
11962
- const int64_t ir111 = MIN(ir110 + dr1, nr1);
11963
-
11964
- //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
11965
-
11966
- // threads with no work simply yield (not sure if it helps)
11967
- if (ir010 >= ir011 || ir110 >= ir111) {
11968
- sched_yield();
11969
- return;
11970
- }
12367
+ #ifdef GGML_PERF
12368
+ int chunks_executed = 0;
12369
+ UNUSED(chunks_executed);
12370
+ #endif
11971
12371
 
11972
- assert(ne12 % ne02 == 0);
11973
- assert(ne13 % ne03 == 0);
12372
+ // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
12373
+ const int64_t nr0 = ne0;
11974
12374
 
11975
- // block-tiling attempt
11976
- const int64_t blck_0 = 16;
11977
- const int64_t blck_1 = 16;
12375
+ // This is the size of the rest of the dimensions of the result
12376
+ const int64_t nr1 = ne1 * ne2 * ne3;
11978
12377
 
11979
12378
  // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
11980
- int64_t nrc = vec_dot_num_rows;
12379
+ int64_t num_rows_per_vec_dot = vec_dot_num_rows;
11981
12380
  // TODO: currently the mmla kernels support only even numbered rows/cols.
11982
12381
  // this check can be removed once they are extended to support odd numbered rows/cols too
11983
12382
  if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
11984
- nrc = 1;
12383
+ num_rows_per_vec_dot = 1;
11985
12384
  }
11986
12385
 
11987
- const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
12386
+ // Now select a reasonable chunk size.
12387
+ int chunk_size = 16;
11988
12388
 
11989
- // attempt to reduce false-sharing (does not seem to make a difference)
11990
- // 16 * 2, accounting for mmla kernels
11991
- float tmp[32];
12389
+ // We need to step up the size if it's small
12390
+ if (nr0 == 1 || nr1 == 1) {
12391
+ chunk_size = 64;
12392
+ }
11992
12393
 
11993
- for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
11994
- for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
11995
- for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
11996
- const int64_t i13 = (ir1/(ne12*ne1));
11997
- const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
11998
- const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
12394
+ // distribute the work across the inner or outer loop based on which one is larger
12395
+ // The number of chunks in the 0/1 dim.
12396
+ // CEIL(nr0/chunk_size)
12397
+ int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
12398
+ int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
11999
12399
 
12000
- // broadcast src0 into src1
12001
- const int64_t i03 = i13/r3;
12002
- const int64_t i02 = i12/r2;
12400
+ // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
12401
+ // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
12402
+ // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
12403
+ if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
12404
+ // distribute the thread work across the inner or outer loop based on which one is larger
12405
+ nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
12406
+ nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
12407
+ }
12003
12408
 
12004
- const int64_t i1 = i11;
12005
- const int64_t i2 = i12;
12006
- const int64_t i3 = i13;
12409
+ // The number of elements in each chunk
12410
+ const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
12411
+ const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
12007
12412
 
12008
- const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
12413
+ //if (ith == 0)
12414
+ // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
12009
12415
 
12010
- // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
12011
- // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
12012
- // the original src1 data pointer, so we should index using the indices directly
12013
- // TODO: this is a bit of a hack, we should probably have a better way to handle this
12014
- const char * src1_col = (const char *) wdata +
12015
- (src1_cont || src1->type != vec_dot_type
12016
- ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
12017
- : (i11*nb11 + i12*nb12 + i13*nb13));
12018
- float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
12416
+ // The first chunk comes from our thread_id, the rest will get auto-assigned.
12417
+ int current_chunk = ith;
12019
12418
 
12020
- //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
12021
- // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
12022
- //}
12419
+ while (current_chunk < nchunk0 * nchunk1) {
12420
+ const int64_t ith0 = current_chunk % nchunk0;
12421
+ const int64_t ith1 = current_chunk / nchunk0;
12023
12422
 
12024
- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
12025
- vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
12026
- }
12423
+ const int64_t ir0_start = dr0 * ith0;
12424
+ const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
12027
12425
 
12028
- for (int cn = 0; cn < nrc; ++cn) {
12029
- memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
12030
- }
12031
- }
12426
+ const int64_t ir1_start = dr1 * ith1;
12427
+ const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
12428
+
12429
+ ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
12430
+
12431
+ #ifdef GGML_PERF
12432
+ chunks_executed++;
12433
+ #endif
12434
+
12435
+ if (nth >= nchunk0 * nchunk1) {
12436
+ break;
12032
12437
  }
12438
+
12439
+ current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
12033
12440
  }
12441
+
12442
+ #ifdef GGML_PERF
12443
+ // These numbers are useful when trying to measure how well the threading scheduling works.
12444
+ //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
12445
+ //float time = (ggml_perf_time_us() - t0);
12446
+ //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
12447
+ #endif
12034
12448
  }
12035
12449
 
12036
12450
  // ggml_compute_forward_mul_mat_id
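Note: the rewritten mul_mat splits the result into chunks of rows and lets threads claim them from the shared counter that the INIT pass seeds to nth (atomic_store(&state->shared->current_chunk, nth) above). A minimal standalone sketch of that work-claiming loop; process_chunk, n_chunks and n_threads are stand-ins, not ggml symbols:

    #include <stdatomic.h>

    static atomic_int next_chunk;                      // seeded to n_threads before compute starts

    static void process_chunk(int c) { (void) c; }     // hypothetical per-chunk kernel

    static void worker(int ith, int n_threads, int n_chunks) {
        for (int c = ith; c < n_chunks; ) {            // first chunk = thread id, like ggml's current_chunk = ith
            process_chunk(c);
            if (n_threads >= n_chunks) {
                break;                                 // at most one chunk per thread: nothing left to claim
            }
            c = atomic_fetch_add(&next_chunk, 1);      // claim the next unprocessed chunk
        }
    }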
@@ -13333,7 +13747,6 @@ static void ggml_compute_forward_soft_max_f32(
13333
13747
 
13334
13748
  const struct ggml_tensor * src0 = dst->src[0];
13335
13749
  const struct ggml_tensor * src1 = dst->src[1];
13336
- const struct ggml_tensor * src2 = dst->src[2];
13337
13750
 
13338
13751
  assert(ggml_is_contiguous(dst));
13339
13752
  assert(ggml_are_same_shape(src0, dst));
@@ -13359,8 +13772,8 @@ static void ggml_compute_forward_soft_max_f32(
13359
13772
 
13360
13773
  // TODO: is this supposed to be ceil instead of floor?
13361
13774
  // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
13362
- const uint32_t n_head_kv = ne02;
13363
- const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
13775
+ const uint32_t n_head = ne02;
13776
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
13364
13777
 
13365
13778
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
13366
13779
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@@ -13377,13 +13790,13 @@ static void ggml_compute_forward_soft_max_f32(
13377
13790
 
13378
13791
  float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
13379
13792
 
13380
- // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
13381
- ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
13382
- float * pos_f32 = src2 ? (float *) src2->data : src0->data;
13383
-
13384
- const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
13793
+ const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
13385
13794
 
13386
13795
  for (int i1 = ir0; i1 < ir1; i1++) {
13796
+ // ALiBi
13797
+ const uint32_t h = (i1/ne01)%ne02; // head
13798
+ const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
13799
+
13387
13800
  float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
13388
13801
  float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
13389
13802
 
@@ -13396,27 +13809,11 @@ static void ggml_compute_forward_soft_max_f32(
13396
13809
  if (mp_f32) {
13397
13810
  if (use_f16) {
13398
13811
  for (int i = 0; i < nc; ++i) {
13399
- wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
13400
- }
13401
- } else {
13402
- for (int i = 0; i < nc; ++i) {
13403
- wp[i] += mp_f32[i];
13404
- }
13405
- }
13406
- }
13407
-
13408
- // ALiBi bias
13409
- if (max_bias > 0.0f) {
13410
- const uint32_t h = (i1/ne01)%ne02; // head
13411
- const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
13412
-
13413
- if (use_f16) {
13414
- for (int i = 0; i < nc; ++i) {
13415
- wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
13812
+ wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
13416
13813
  }
13417
13814
  } else {
13418
13815
  for (int i = 0; i < nc; ++i) {
13419
- wp[i] += slope*pos_f32[i];
13816
+ wp[i] += slope*mp_f32[i];
13420
13817
  }
13421
13818
  }
13422
13819
  }
@@ -13431,22 +13828,7 @@ static void ggml_compute_forward_soft_max_f32(
13431
13828
  float max = -INFINITY;
13432
13829
  ggml_vec_max_f32(nc, &max, wp);
13433
13830
 
13434
- ggml_float sum = 0.0;
13435
-
13436
- uint16_t scvt;
13437
- for (int i = 0; i < nc; i++) {
13438
- if (wp[i] == -INFINITY) {
13439
- dp[i] = 0.0f;
13440
- } else {
13441
- // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
13442
- ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
13443
- memcpy(&scvt, &s, sizeof(scvt));
13444
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
13445
- sum += (ggml_float)val;
13446
- dp[i] = val;
13447
- }
13448
- }
13449
-
13831
+ ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);
13450
13832
  assert(sum > 0.0);
13451
13833
 
13452
13834
  sum = 1.0/sum;
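Note: with GGML_OP_ALIBI gone, soft_max folds the per-head ALiBi slope into the mask term (wp[i] += slope*mp[i] above) and uses the vectorized ggml_vec_soft_max_f32 for the exponentiation. The slope computation as a standalone helper, restating the formulas from the hunk above (sketch only):

    #include <math.h>
    #include <stdint.h>

    static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
        if (max_bias <= 0.0f) {
            return 1.0f;                               // ALiBi disabled: mask is applied unscaled
        }
        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, h + 1)
                               : powf(m1, 2*(h - n_head_log2) + 1);
    }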
@@ -13578,178 +13960,6 @@ static void ggml_compute_forward_soft_max_back(
13578
13960
  }
13579
13961
  }
13580
13962
 
13581
- // ggml_compute_forward_alibi
13582
-
13583
- static void ggml_compute_forward_alibi_f32(
13584
- const struct ggml_compute_params * params,
13585
- struct ggml_tensor * dst) {
13586
-
13587
- const struct ggml_tensor * src0 = dst->src[0];
13588
-
13589
- assert(params->ith == 0);
13590
-
13591
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13592
- return;
13593
- }
13594
-
13595
- //const int n_past = ((int32_t *) dst->op_params)[0];
13596
- const int n_head = ((int32_t *) dst->op_params)[1];
13597
- float max_bias;
13598
- memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13599
-
13600
- const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13601
- const int64_t ne1 = src0->ne[1]; // seq_len_without_past
13602
- const int64_t ne2 = src0->ne[2]; // n_head -> this is k
13603
- //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
13604
-
13605
- const int64_t n = ggml_nrows(src0);
13606
- const int64_t ne2_ne3 = n/ne1; // ne2*ne3
13607
-
13608
- const size_t nb0 = src0->nb[0];
13609
- const size_t nb1 = src0->nb[1];
13610
- const size_t nb2 = src0->nb[2];
13611
- //const int nb3 = src0->nb[3];
13612
-
13613
- GGML_ASSERT(nb0 == sizeof(float));
13614
- GGML_ASSERT(n_head == ne2);
13615
-
13616
- // add alibi to src0 (KQ_scaled)
13617
- const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
13618
-
13619
- const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13620
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13621
-
13622
- for (int64_t k = 0; k < ne2_ne3; k++) {
13623
- // TODO: k*nb2 or k*nb3
13624
- float m_k;
13625
-
13626
- if (k < n_heads_log2_floor) {
13627
- m_k = powf(m0, k + 1);
13628
- } else {
13629
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
13630
- }
13631
-
13632
- for (int64_t i = 0; i < ne0; i++) {
13633
- for (int64_t j = 0; j < ne1; j++) {
13634
- float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13635
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13636
- pdst[0] = i * m_k + src[0];
13637
- }
13638
- }
13639
- }
13640
- }
13641
-
13642
- static void ggml_compute_forward_alibi_f16(
13643
- const struct ggml_compute_params * params,
13644
- struct ggml_tensor * dst) {
13645
-
13646
- const struct ggml_tensor * src0 = dst->src[0];
13647
-
13648
- assert(params->ith == 0);
13649
-
13650
- if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13651
- return;
13652
- }
13653
-
13654
- //const int n_past = ((int32_t *) dst->op_params)[0];
13655
- const int n_head = ((int32_t *) dst->op_params)[1];
13656
- float max_bias;
13657
- memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
13658
-
13659
- const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
13660
- const int ne1 = src0->ne[1]; // seq_len_without_past
13661
- const int ne2 = src0->ne[2]; // n_head -> this is k
13662
- //const int ne3 = src0->ne[3]; // 1 -> bsz
13663
-
13664
- const int n = ggml_nrows(src0);
13665
- const int ne2_ne3 = n/ne1; // ne2*ne3
13666
-
13667
- const int nb0 = src0->nb[0];
13668
- const int nb1 = src0->nb[1];
13669
- const int nb2 = src0->nb[2];
13670
- //const int nb3 = src0->nb[3];
13671
-
13672
- GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
13673
- //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
13674
- GGML_ASSERT(n_head == ne2);
13675
-
13676
- // add alibi to src0 (KQ_scaled)
13677
- const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
13678
-
13679
- const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
13680
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
13681
-
13682
- for (int k = 0; k < ne2_ne3; k++) {
13683
- // TODO: k*nb2 or k*nb3
13684
- float m_k;
13685
-
13686
- if (k < n_heads_log2_floor) {
13687
- m_k = powf(m0, k + 1);
13688
- } else {
13689
- m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
13690
- }
13691
-
13692
- for (int i = 0; i < ne0; i++) {
13693
- for (int j = 0; j < ne1; j++) {
13694
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
13695
- float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
13696
-
13697
- // we return F32
13698
- pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
13699
- }
13700
- }
13701
- }
13702
- }
13703
-
13704
- static void ggml_compute_forward_alibi(
13705
- const struct ggml_compute_params * params,
13706
- struct ggml_tensor * dst) {
13707
-
13708
- const struct ggml_tensor * src0 = dst->src[0];
13709
-
13710
- switch (src0->type) {
13711
- case GGML_TYPE_F16:
13712
- {
13713
- ggml_compute_forward_alibi_f16(params, dst);
13714
- } break;
13715
- case GGML_TYPE_F32:
13716
- {
13717
- ggml_compute_forward_alibi_f32(params, dst);
13718
- } break;
13719
- case GGML_TYPE_BF16:
13720
- case GGML_TYPE_Q4_0:
13721
- case GGML_TYPE_Q4_1:
13722
- case GGML_TYPE_Q5_0:
13723
- case GGML_TYPE_Q5_1:
13724
- case GGML_TYPE_Q8_0:
13725
- case GGML_TYPE_Q8_1:
13726
- case GGML_TYPE_Q2_K:
13727
- case GGML_TYPE_Q3_K:
13728
- case GGML_TYPE_Q4_K:
13729
- case GGML_TYPE_Q5_K:
13730
- case GGML_TYPE_Q6_K:
13731
- case GGML_TYPE_IQ2_XXS:
13732
- case GGML_TYPE_IQ2_XS:
13733
- case GGML_TYPE_IQ3_XXS:
13734
- case GGML_TYPE_IQ1_S:
13735
- case GGML_TYPE_IQ1_M:
13736
- case GGML_TYPE_IQ4_NL:
13737
- case GGML_TYPE_IQ4_XS:
13738
- case GGML_TYPE_IQ3_S:
13739
- case GGML_TYPE_IQ2_S:
13740
- case GGML_TYPE_Q8_K:
13741
- case GGML_TYPE_I8:
13742
- case GGML_TYPE_I16:
13743
- case GGML_TYPE_I32:
13744
- case GGML_TYPE_I64:
13745
- case GGML_TYPE_F64:
13746
- case GGML_TYPE_COUNT:
13747
- {
13748
- GGML_ASSERT(false);
13749
- } break;
13750
- }
13751
- }
13752
-
13753
13963
  // ggml_compute_forward_clamp
13754
13964
 
13755
13965
  static void ggml_compute_forward_clamp_f32(
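The dedicated GGML_OP_ALIBI kernels above are removed in this version; the per-head ALiBi slope is instead folded into soft_max (the slope*mp term in the first hunk) and into flash_attn_ext further down in this diff. Both formulations use the same closed form, shown here as a standalone helper for reference only (the function name is illustrative, not ggml API):

    #include <math.h>
    #include <stdint.h>

    // ALiBi slope for head h, matching both the removed ggml_compute_forward_alibi_*
    // kernels (per-head m_k) and the expressions now inlined into soft_max / flash_attn_ext.
    static float alibi_slope(const float max_bias, const uint32_t n_head, const uint32_t h) {
        if (max_bias <= 0.0f) {
            return 1.0f; // no bias requested; the inlined code short-circuits the same way
        }
        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
    }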
@@ -14972,25 +15182,28 @@ static void ggml_compute_forward_upscale_f32(
14972
15182
  return;
14973
15183
  }
14974
15184
 
14975
- GGML_ASSERT(src0->nb[0] == sizeof(float));
15185
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
14976
15186
 
14977
15187
  const int ith = params->ith;
14978
15188
  const int nth = params->nth;
14979
15189
 
14980
15190
  GGML_TENSOR_UNARY_OP_LOCALS
14981
15191
 
14982
- const int scale_factor = dst->op_params[0];
15192
+ const float sf0 = (float)ne0/src0->ne[0];
15193
+ const float sf1 = (float)ne1/src0->ne[1];
15194
+ const float sf2 = (float)ne2/src0->ne[2];
15195
+ const float sf3 = (float)ne3/src0->ne[3];
14983
15196
 
14984
15197
  // TODO: optimize
14985
15198
 
14986
15199
  for (int64_t i3 = 0; i3 < ne3; i3++) {
14987
- const int64_t i03 = i3;
15200
+ const int64_t i03 = i3 / sf3;
14988
15201
  for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
14989
- const int64_t i02 = i2;
15202
+ const int64_t i02 = i2 / sf2;
14990
15203
  for (int64_t i1 = 0; i1 < ne1; i1++) {
14991
- const int64_t i01 = i1 / scale_factor;
15204
+ const int64_t i01 = i1 / sf1;
14992
15205
  for (int64_t i0 = 0; i0 < ne0; i0++) {
14993
- const int64_t i00 = i0 / scale_factor;
15206
+ const int64_t i00 = i0 / sf0;
14994
15207
 
14995
15208
  const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
14996
15209
  float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
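GGML_OP_UPSCALE switches from a single integer scale_factor to four independent factors derived from the destination shape (sf0..sf3 = dst ne / src ne), so per-axis and non-integer scaling become expressible; source indices are still picked by truncating nearest-neighbor division. A small illustrative helper (not ggml API) showing the mapping:

    #include <stdint.h>

    // Nearest-neighbor source index for one dimension, as in the loop above.
    // Example: src_n = 4, dst_n = 6 gives sf = 1.5f, and dst indices 0..5
    // map to src indices 0, 0, 1, 2, 2, 3.
    static int64_t upscale_src_index(const int64_t dst_i, const int64_t dst_n, const int64_t src_n) {
        const float sf = (float) dst_n / src_n; // same per-dimension factor as sf0..sf3
        return (int64_t) (dst_i / sf);          // truncating division, as in i00 = i0 / sf0
    }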
@@ -15020,6 +15233,7 @@ static void ggml_compute_forward_upscale(
15020
15233
  }
15021
15234
  }
15022
15235
 
15236
+
15023
15237
  // ggml_compute_forward_pad
15024
15238
 
15025
15239
  static void ggml_compute_forward_pad_f32(
@@ -15373,37 +15587,7 @@ static void ggml_compute_forward_flash_attn_f32(
15373
15587
  vvexpf(S, S, &Mup);
15374
15588
  ggml_vec_sum_f32(Mup, &sum, S);
15375
15589
  #else
15376
- uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
15377
- ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
15378
-
15379
- for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
15380
- if (i >= masked_begin) {
15381
- break;
15382
- }
15383
- float * SS = S + i;
15384
-
15385
- for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
15386
- if (i + j >= masked_begin) {
15387
- break;
15388
- } else if (SS[j] == -INFINITY) {
15389
- SS[j] = 0.0f;
15390
- } else {
15391
- #ifndef GGML_FLASH_ATTN_EXP_FP16
15392
- const float val = expf(SS[j] - max);
15393
- #else
15394
- ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
15395
- memcpy(&scvt[j], &s, sizeof(uint16_t));
15396
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
15397
- #endif
15398
- sump[j] += (ggml_float)val;
15399
- SS[j] = val;
15400
- }
15401
- }
15402
- }
15403
-
15404
- for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
15405
- sum += sump[i];
15406
- }
15590
+ sum = ggml_vec_soft_max_f32(Mup, S, S, max);
15407
15591
  #endif
15408
15592
  }
15409
15593
 
@@ -15585,28 +15769,7 @@ static void ggml_compute_forward_flash_attn_f16(
15585
15769
  vvexpf(S, S, &Mup);
15586
15770
  ggml_vec_sum_f32(Mup, &sum, S);
15587
15771
  #else
15588
- uint16_t scvt[GGML_SOFT_MAX_UNROLL];
15589
- ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
15590
-
15591
- for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
15592
- float * SS = S + i;
15593
-
15594
- for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
15595
- if (SS[j] == -INFINITY) {
15596
- SS[j] = 0.0f;
15597
- } else {
15598
- ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
15599
- memcpy(&scvt[j], &s, sizeof(uint16_t));
15600
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
15601
- sump[j] += (ggml_float)val;
15602
- SS[j] = val;
15603
- }
15604
- }
15605
- }
15606
-
15607
- for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
15608
- sum += sump[i];
15609
- }
15772
+ sum = ggml_vec_soft_max_f32(Mup, S, S, max);
15610
15773
  #endif
15611
15774
  }
15612
15775
 
@@ -15763,8 +15926,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
15763
15926
  const int ir0 = dr*ith;
15764
15927
  const int ir1 = MIN(ir0 + dr, nr);
15765
15928
 
15766
- float scale = 1.0f;
15767
- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
15929
+ float scale = 1.0f;
15930
+ float max_bias = 0.0f;
15931
+
15932
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
15933
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
15934
+
15935
+ const uint32_t n_head = neq2;
15936
+ const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
15937
+
15938
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
15939
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
15768
15940
 
15769
15941
  // loop over n_batch and n_head
15770
15942
  for (int ir = ir0; ir < ir1; ++ir) {
@@ -15773,6 +15945,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
15773
15945
  const int iq2 = (ir - iq3*neq2*neq1)/neq1;
15774
15946
  const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
15775
15947
 
15948
+ const uint32_t h = iq2; // head
15949
+ const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
15950
+
15776
15951
  float S = 0.0f;
15777
15952
  float M = -INFINITY;
15778
15953
 
@@ -15796,7 +15971,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
15796
15971
  // loop over n_kv and n_head_kv
15797
15972
  // ref: https://arxiv.org/pdf/2112.05682.pdf
15798
15973
  for (int64_t ic = 0; ic < nek1; ++ic) {
15799
- const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
15974
+ const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
15800
15975
  if (mv == -INFINITY) {
15801
15976
  continue;
15802
15977
  }
@@ -15867,7 +16042,7 @@ static void ggml_compute_forward_flash_attn_ext(
15867
16042
  const struct ggml_tensor * v,
15868
16043
  const struct ggml_tensor * mask,
15869
16044
  struct ggml_tensor * dst) {
15870
- switch (dst->op_params[1]) {
16045
+ switch (dst->op_params[2]) {
15871
16046
  case GGML_PREC_DEFAULT:
15872
16047
  case GGML_PREC_F32:
15873
16048
  {
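Because max_bias is appended to the flash_attn_ext parameters, the layout of dst->op_params shifts: scale stays at slot 0, max_bias takes slot 1, and the precision flag moves to slot 2, which is why the switch above now reads op_params[2]. A self-contained illustration of that layout (struct and function names are illustrative, not ggml API; only the slot assignments come from this diff):

    #include <stdint.h>
    #include <string.h>

    struct fa_ext_params { float scale; float max_bias; int32_t prec; };

    // op_params is int32_t storage; the floats are recovered bit-for-bit with memcpy,
    // mirroring the reads in ggml_compute_forward_flash_attn_ext_f16 above.
    static struct fa_ext_params read_fa_ext_params(const int32_t op_params[3]) {
        struct fa_ext_params p;
        memcpy(&p.scale,    &op_params[0], sizeof(float)); // slot 0: float scale
        memcpy(&p.max_bias, &op_params[1], sizeof(float)); // slot 1: float max_bias (new)
        p.prec = op_params[2];                             // slot 2: precision (was slot 1)
        return p;
    }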
@@ -16221,38 +16396,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
16221
16396
  vvexpf(SM, SM, &Mup);
16222
16397
  ggml_vec_sum_f32(Mup, &sum, SM);
16223
16398
  #else
16224
- uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
16225
- ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
16226
-
16227
- for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
16228
- if (i >= masked_begin) {
16229
- break;
16230
- }
16231
- float * SR = S + i;
16232
- float * SW = SM + i;
16233
-
16234
- for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
16235
- if (i + j >= masked_begin) {
16236
- break;
16237
- } else if (SR[j] == -INFINITY) {
16238
- SW[j] = 0.0f;
16239
- } else {
16240
- #ifndef GGML_FLASH_ATTN_EXP_FP16
16241
- const float val = expf(SR[j] - max);
16242
- #else
16243
- ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
16244
- memcpy(&scvt[j], &s, sizeof(uint16_t));
16245
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
16246
- #endif
16247
- sump[j] += (ggml_float)val;
16248
- SW[j] = val;
16249
- }
16250
- }
16251
- }
16252
-
16253
- for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
16254
- sum += sump[i];
16255
- }
16399
+ sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
16256
16400
  #endif
16257
16401
  }
16258
16402
 
@@ -16834,6 +16978,10 @@ static void ggml_compute_forward_unary(
16834
16978
  {
16835
16979
  ggml_compute_forward_relu(params, dst);
16836
16980
  } break;
16981
+ case GGML_UNARY_OP_SIGMOID:
16982
+ {
16983
+ ggml_compute_forward_sigmoid(params, dst);
16984
+ } break;
16837
16985
  case GGML_UNARY_OP_GELU:
16838
16986
  {
16839
16987
  ggml_compute_forward_gelu(params, dst);
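GGML_UNARY_OP_SIGMOID is wired into the forward unary dispatch here; the kernel itself, ggml_compute_forward_sigmoid, is added elsewhere in this diff and is not shown in this section. For orientation, a scalar sketch of the element-wise operation it presumably applies (reference only, not the ggml implementation):

    #include <math.h>

    // y[i] = 1 / (1 + exp(-x[i])) over one row of nc floats.
    static void sigmoid_row_ref(const int nc, float * y, const float * x) {
        for (int i = 0; i < nc; ++i) {
            y[i] = 1.0f / (1.0f + expf(-x[i]));
        }
    }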
@@ -17274,35 +17422,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
17274
17422
  assert(!isnan(s1[i]));
17275
17423
  }
17276
17424
  #endif
17277
- // soft_max
17278
- ggml_float sum = 0.0;
17279
- {
17280
- float max = -INFINITY;
17281
- ggml_vec_max_f32(nc, &max, s0);
17282
17425
 
17283
- uint16_t scvt; UNUSED(scvt);
17284
- for (int i = 0; i < nc; i++) {
17285
- if (s0[i] == -INFINITY) {
17286
- st[i] = 0.0f;
17287
- } else {
17288
- #ifndef GGML_CROSS_ENTROPY_EXP_FP16
17289
- const float s = s0[i] - max;
17290
- const float val = expf(s);
17291
- #else
17292
- ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
17293
- memcpy(&scvt, &s, sizeof(scvt));
17294
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
17295
- #endif
17296
- sum += (ggml_float)val;
17297
- st[i] = val;
17298
- }
17299
- }
17426
+ // soft_max
17427
+ float max = -INFINITY;
17428
+ ggml_vec_max_f32(nc, &max, s0);
17429
+ ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max);
17430
+ assert(sum > 0.0);
17431
+ sum = (1.0 - eps) / sum;
17300
17432
 
17301
- assert(sum > 0.0);
17302
- // sum = 1.0/sum;
17303
- }
17304
17433
  // avoid log(0) by rescaling from [0..1] to [eps..1]
17305
- sum = (1.0 - eps) / sum;
17306
17434
  ggml_vec_scale_f32(nc, st, sum);
17307
17435
  ggml_vec_add1_f32(nc, st, st, eps);
17308
17436
  ggml_vec_log_f32(nc, st, st);
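The rewritten block computes softmax(s0) into st with ggml_vec_soft_max_f32 and folds the eps rescaling into the reciprocal sum, so the scale/add1/log sequence above evaluates log(eps + (1 - eps) * softmax(s0)[i]). A per-element reference of that composition (illustrative helper, not ggml API):

    #include <math.h>

    // What scale(sum) -> add1(eps) -> log produces per element, given a softmax
    // probability p in [0, 1]; eps > 0 keeps the log argument strictly positive even
    // for masked -INFINITY inputs that the soft_max helper maps to p == 0.
    static float log_softmax_eps_ref(const float p, const float eps) {
        return logf(eps + (1.0f - eps) * p);
    }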
@@ -17392,32 +17520,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
17392
17520
  #endif
17393
17521
 
17394
17522
  // soft_max
17395
- ggml_float sum = 0.0;
17396
- {
17397
- float max = -INFINITY;
17398
- ggml_vec_max_f32(nc, &max, s0);
17399
-
17400
- uint16_t scvt; UNUSED(scvt);
17401
- for (int i = 0; i < nc; i++) {
17402
- if (s0[i] == -INFINITY) {
17403
- ds0[i] = 0.0f;
17404
- } else {
17405
- #ifndef GGML_CROSS_ENTROPY_EXP_FP16
17406
- const float s = s0[i] - max;
17407
- const float val = expf(s);
17408
- #else
17409
- ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
17410
- memcpy(&scvt, &s, sizeof(scvt));
17411
- const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
17412
- #endif
17413
- sum += (ggml_float)val;
17414
- ds0[i] = val;
17415
- }
17416
- }
17417
-
17418
- assert(sum > 0.0);
17419
- sum = (1.0 - eps)/sum;
17420
- }
17523
+ float max = -INFINITY;
17524
+ ggml_vec_max_f32(nc, &max, s0);
17525
+ ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
17526
+ assert(sum > 0.0);
17527
+ sum = (1.0 - eps) / sum;
17421
17528
 
17422
17529
  // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
17423
17530
  ggml_vec_scale_f32(nc, ds0, sum);
@@ -17454,7 +17561,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
17454
17561
 
17455
17562
  /////////////////////////////////
17456
17563
 
17457
- static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
17564
+ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
17458
17565
  GGML_ASSERT(params);
17459
17566
 
17460
17567
  if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
@@ -17552,7 +17659,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
17552
17659
  } break;
17553
17660
  case GGML_OP_MUL_MAT:
17554
17661
  {
17555
- ggml_compute_forward_mul_mat(params, tensor);
17662
+ ggml_compute_forward_mul_mat(params, tensor, state);
17556
17663
  } break;
17557
17664
  case GGML_OP_MUL_MAT_ID:
17558
17665
  {
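ggml_compute_forward now carries the per-worker ggml_compute_state into ggml_compute_forward_mul_mat, and the shared compute state gains a current_chunk counter (initialized near the end of this diff). The mul_mat body is outside this section, so the exact usage is an assumption, but the combination points at dynamic chunk distribution: workers atomically claim the next chunk index rather than relying only on a fixed ith/nth split. A hedged sketch of that pattern; everything except the idea of a shared atomic counter is illustrative:

    #include <stdatomic.h>

    // Workers pull chunk indices from a shared atomic counter until all chunks are claimed.
    static void worker_loop(atomic_int * current_chunk, const int nchunks,
                            void (*process_chunk)(int chunk)) {
        for (;;) {
            const int chunk = atomic_fetch_add(current_chunk, 1); // claim the next chunk
            if (chunk >= nchunks) {
                break; // nothing left to do
            }
            process_chunk(chunk);
        }
    }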
@@ -17630,10 +17737,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
17630
17737
  {
17631
17738
  ggml_compute_forward_rope_back(params, tensor);
17632
17739
  } break;
17633
- case GGML_OP_ALIBI:
17634
- {
17635
- ggml_compute_forward_alibi(params, tensor);
17636
- } break;
17637
17740
  case GGML_OP_CLAMP:
17638
17741
  {
17639
17742
  ggml_compute_forward_clamp(params, tensor);
@@ -18652,10 +18755,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18652
18755
  zero_table);
18653
18756
  }
18654
18757
  } break;
18655
- case GGML_OP_ALIBI:
18656
- {
18657
- GGML_ASSERT(false); // TODO: not implemented
18658
- } break;
18659
18758
  case GGML_OP_CLAMP:
18660
18759
  {
18661
18760
  GGML_ASSERT(false); // TODO: not implemented
@@ -18826,6 +18925,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18826
18925
  zero_table);
18827
18926
  }
18828
18927
  } break;
18928
+ case GGML_UNARY_OP_SIGMOID:
18929
+ {
18930
+ GGML_ASSERT(false); // TODO: not implemented
18931
+ } break;
18829
18932
  case GGML_UNARY_OP_GELU:
18830
18933
  {
18831
18934
  GGML_ASSERT(false); // TODO: not implemented
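The backward pass for the new GGML_UNARY_OP_SIGMOID is left unimplemented in this hunk (GGML_ASSERT(false)). For reference, the gradient such a backward pass would need is d/dx sigmoid(x) = s * (1 - s) with s = sigmoid(x); a scalar sketch, not ggml code:

    #include <math.h>

    // Chain rule for sigmoid: given input x and upstream gradient dy, return dL/dx.
    static float sigmoid_backward_ref(const float x, const float dy) {
        const float s = 1.0f / (1.0f + expf(-x));
        return dy * s * (1.0f - s);
    }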
@@ -19172,8 +19275,6 @@ typedef int ggml_lock_t;
19172
19275
 
19173
19276
  #define GGML_LOCK_INITIALIZER 0
19174
19277
 
19175
- typedef pthread_t ggml_thread_t;
19176
-
19177
19278
  #define ggml_thread_create pthread_create
19178
19279
  #define ggml_thread_join pthread_join
19179
19280
 
@@ -19199,8 +19300,6 @@ typedef int ggml_lock_t;
19199
19300
 
19200
19301
  #define GGML_LOCK_INITIALIZER 0
19201
19302
 
19202
- typedef pthread_t ggml_thread_t;
19203
-
19204
19303
  #define ggml_thread_create pthread_create
19205
19304
  #define ggml_thread_join pthread_join
19206
19305
 
@@ -19280,31 +19379,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
19280
19379
  static void clear_numa_thread_affinity(void) {}
19281
19380
  #endif
19282
19381
 
19283
- struct ggml_compute_state_shared {
19284
- const struct ggml_cgraph * cgraph;
19285
- const struct ggml_cplan * cplan;
19286
-
19287
- int64_t perf_node_start_cycles;
19288
- int64_t perf_node_start_time_us;
19289
-
19290
- const int n_threads;
19291
-
19292
- // synchronization primitives
19293
- atomic_int n_active; // num active threads
19294
- atomic_int node_n; // active graph node
19295
- atomic_int node_task; // active graph node task phase
19296
-
19297
- ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
19298
- void * abort_callback_data;
19299
- };
19300
-
19301
- struct ggml_compute_state {
19302
- ggml_thread_t thrd;
19303
- int ith;
19304
- struct ggml_compute_state_shared * shared;
19305
- enum ggml_status ec;
19306
- };
19307
-
19308
19382
  static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
19309
19383
  int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
19310
19384
  int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
@@ -19355,6 +19429,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
19355
19429
  case GGML_UNARY_OP_TANH:
19356
19430
  case GGML_UNARY_OP_ELU:
19357
19431
  case GGML_UNARY_OP_RELU:
19432
+ case GGML_UNARY_OP_SIGMOID:
19358
19433
  case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
19359
19434
  case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
19360
19435
  {
@@ -19428,10 +19503,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
19428
19503
  {
19429
19504
  n_tasks = n_threads;
19430
19505
  } break;
19431
- case GGML_OP_ALIBI:
19432
- {
19433
- n_tasks = 1; //TODO
19434
- } break;
19435
19506
  case GGML_OP_CLAMP:
19436
19507
  {
19437
19508
  n_tasks = 1; //TODO
@@ -19580,6 +19651,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
19580
19651
 
19581
19652
  * node_n = atomic_load(&state->shared->node_n);
19582
19653
  if (* node_n != last_node_n) break;
19654
+ #if defined(__SSE3__)
19655
+ // Tell the processor we're spinning. It's a processor hint for spinlocks.
19656
+ _mm_pause();
19657
+ #endif
19583
19658
  }
19584
19659
  }
19585
19660
 
@@ -19594,6 +19669,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
19594
19669
 
19595
19670
  * task_phase = atomic_load(&state->shared->node_task);
19596
19671
  if (* task_phase != last_task_phase) break;
19672
+ #if defined(__SSE3__)
19673
+ // Tell the processor we're spinning. It's a processor hint for spinlocks.
19674
+ _mm_pause();
19675
+ #endif
19597
19676
  }
19598
19677
  }
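Both spin-wait loops now issue _mm_pause() when __SSE3__ is defined; the PAUSE hint reduces power draw and memory-order speculation penalties while a thread polls the shared atomics. A minimal standalone loop in the same style (the helper name is illustrative):

    #include <stdatomic.h>
    #if defined(__SSE3__)
    #include <immintrin.h>
    #endif

    // Poll an atomic until it changes from the last observed value, pausing between reads.
    static int spin_wait_for_change(atomic_int * v, const int last_seen) {
        for (;;) {
            const int cur = atomic_load(v);
            if (cur != last_seen) {
                return cur;
            }
    #if defined(__SSE3__)
            _mm_pause(); // processor hint for spin loops
    #endif
        }
    }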
19599
19678
 
@@ -19633,7 +19712,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19633
19712
  struct ggml_tensor * node = cgraph->nodes[node_n];
19634
19713
  if (GGML_OP_HAS_FINALIZE[node->op]) {
19635
19714
  params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
19636
- ggml_compute_forward(&params, node);
19715
+ ggml_compute_forward(&params, node, state);
19637
19716
  }
19638
19717
  ggml_graph_compute_perf_stats_node(node, state->shared);
19639
19718
  }
@@ -19653,17 +19732,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19653
19732
  /* INIT */
19654
19733
  if (GGML_OP_HAS_INIT[node->op]) {
19655
19734
  params.type = GGML_TASK_TYPE_INIT;
19656
- ggml_compute_forward(&params, node);
19735
+ ggml_compute_forward(&params, node, state);
19657
19736
  }
19658
19737
 
19659
19738
  // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
19660
19739
  // they do something more efficient than spinning (?)
19661
19740
  params.type = GGML_TASK_TYPE_COMPUTE;
19662
- ggml_compute_forward(&params, node);
19741
+ ggml_compute_forward(&params, node, state);
19663
19742
 
19664
19743
  if (GGML_OP_HAS_FINALIZE[node->op]) {
19665
19744
  params.type = GGML_TASK_TYPE_FINALIZE;
19666
- ggml_compute_forward(&params, node);
19745
+ ggml_compute_forward(&params, node, state);
19667
19746
  }
19668
19747
 
19669
19748
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -19702,7 +19781,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19702
19781
 
19703
19782
  if (state->ith < n_tasks) {
19704
19783
  if (GGML_OP_HAS_INIT[node->op]) {
19705
- ggml_compute_forward(&params, node);
19784
+ ggml_compute_forward(&params, node, state);
19706
19785
  }
19707
19786
  }
19708
19787
 
@@ -19723,7 +19802,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
19723
19802
 
19724
19803
  if (state->ith < n_tasks) {
19725
19804
  params.type = GGML_TASK_TYPE_COMPUTE;
19726
- ggml_compute_forward(&params, node);
19805
+ ggml_compute_forward(&params, node, state);
19727
19806
  }
19728
19807
 
19729
19808
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -19974,6 +20053,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
19974
20053
  /*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
19975
20054
  /*.abort_callback =*/ NULL,
19976
20055
  /*.abort_callback_data =*/ NULL,
20056
+ /*.current_chunk; =*/ 0,
19977
20057
  };
19978
20058
  struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
19979
20059