llama_cpp 0.2.1 → 0.3.0

@@ -1,5 +1,5 @@
1
- // Defines CLOCK_MONOTONIC on Linux
2
- #define _GNU_SOURCE
1
+ #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
2
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
3
3
 
4
4
  #include "ggml.h"
5
5
 
@@ -24,6 +24,7 @@
24
24
  #include <stdio.h>
25
25
  #include <float.h>
26
26
  #include <limits.h>
27
+ #include <stdarg.h>
27
28
 
28
29
  #ifdef GGML_USE_METAL
29
30
  #include <unistd.h>
@@ -35,6 +36,12 @@
35
36
  #define static_assert(cond, msg) struct global_scope_noop_trick
36
37
  #endif
37
38
 
39
+ #if defined(_MSC_VER)
40
+ // disable "possible loss of data" to avoid hundreds of casts
41
+ // we should just be careful :)
42
+ #pragma warning(disable: 4244 4267)
43
+ #endif
44
+
38
45
  #if defined(_WIN32)
39
46
 
40
47
  #include <windows.h>
@@ -84,6 +91,11 @@ static int sched_yield (void) {
84
91
  #include <stdatomic.h>
85
92
 
86
93
  typedef void* thread_ret_t;
94
+
95
+ #include <sys/types.h>
96
+ #include <sys/stat.h>
97
+ #include <unistd.h>
98
+
87
99
  #endif
88
100
 
89
101
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -106,11 +118,36 @@ typedef void* thread_ret_t;
106
118
  /*#define GGML_PERF*/
107
119
  #define GGML_DEBUG 0
108
120
  #define GGML_GELU_FP16
121
+ #define GGML_GELU_QUICK_FP16
109
122
  #define GGML_SILU_FP16
110
123
 
111
124
  #define GGML_SOFT_MAX_UNROLL 4
112
125
  #define GGML_VEC_DOT_UNROLL 2
113
126
 
127
+ //
128
+ // logging
129
+ //
130
+
131
+ #if (GGML_DEBUG >= 1)
132
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
133
+ #else
134
+ #define GGML_PRINT_DEBUG(...)
135
+ #endif
136
+
137
+ #if (GGML_DEBUG >= 5)
138
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
139
+ #else
140
+ #define GGML_PRINT_DEBUG_5(...)
141
+ #endif
142
+
143
+ #if (GGML_DEBUG >= 10)
144
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
145
+ #else
146
+ #define GGML_PRINT_DEBUG_10(...)
147
+ #endif
148
+
149
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
150
+
114
151
  #ifdef GGML_USE_ACCELERATE
115
152
  // uncomment to use vDSP for soft max computation
116
153
  // note: not sure if it is actually faster
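
The new GGML_PRINT_DEBUG* macros above give ggml leveled debug output that compiles away entirely unless GGML_DEBUG is raised at build time. A minimal standalone sketch of the same pattern, with illustrative names (MY_DEBUG and MY_PRINT_DEBUG are not part of ggml):

    #include <stdio.h>

    #define MY_DEBUG 1  /* hypothetical build-time verbosity level */

    #if (MY_DEBUG >= 1)
    #define MY_PRINT_DEBUG(...) printf(__VA_ARGS__)
    #else
    #define MY_PRINT_DEBUG(...)  /* expands to nothing: zero runtime cost */
    #endif

    int main(void) {
        MY_PRINT_DEBUG("loaded %d tensors\n", 42);  /* printed only when MY_DEBUG >= 1 */
        return 0;
    }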
@@ -136,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
136
201
  #endif
137
202
  if (result != 0) {
138
203
  // Handle allocation failure
204
+ const char *error_desc = "unknown allocation error";
205
+ switch (result) {
206
+ case EINVAL:
207
+ error_desc = "invalid alignment value";
208
+ break;
209
+ case ENOMEM:
210
+ error_desc = "insufficient memory";
211
+ break;
212
+ }
213
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
214
+ __func__, error_desc, size/(1024.0*1024.0));
139
215
  return NULL;
140
216
  }
141
217
  return aligned_memory;
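
The hunk above makes ggml_aligned_malloc report why posix_memalign failed instead of returning NULL silently. A hedged sketch of the same pattern outside of ggml (the function name and message format here are illustrative):

    #define _POSIX_C_SOURCE 200112L
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void * aligned_alloc_or_report(size_t align, size_t size) {
        void * mem = NULL;
        const int result = posix_memalign(&mem, align, size);
        if (result != 0) {
            const char * desc = "unknown allocation error";
            switch (result) {
                case EINVAL: desc = "invalid alignment value"; break;
                case ENOMEM: desc = "insufficient memory";     break;
            }
            fprintf(stderr, "%s: %s (attempted to allocate %6.2f MB)\n",
                    __func__, desc, size/(1024.0*1024.0));
            return NULL;
        }
        return mem;
    }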
@@ -334,6 +410,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
334
410
  // precomputed gelu table for f16 (128 KB)
335
411
  static ggml_fp16_t table_gelu_f16[1 << 16];
336
412
 
413
+ // precomputed quick gelu table for f16 (128 KB)
414
+ static ggml_fp16_t table_gelu_quick_f16[1 << 16];
415
+
337
416
  // precomputed silu table for f16 (128 KB)
338
417
  static ggml_fp16_t table_silu_f16[1 << 16];
339
418
 
@@ -409,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
409
488
  }
410
489
  }
411
490
 
412
-
413
491
  //
414
492
  // timing
415
493
  //
@@ -472,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
472
550
  #define ggml_perf_cycles_per_ms() 0
473
551
  #endif
474
552
 
553
+
475
554
  //
476
555
  // cache line
477
556
  //
@@ -1671,14 +1750,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1671
1750
  #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
1672
1751
  #define GGML_F32x4_REDUCE(res, x) \
1673
1752
  { \
1674
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1675
- x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
1753
+ int offset = GGML_F32_ARR >> 1; \
1754
+ for (int i = 0; i < offset; ++i) { \
1755
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1676
1756
  } \
1677
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1678
- x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
1757
+ offset >>= 1; \
1758
+ for (int i = 0; i < offset; ++i) { \
1759
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1679
1760
  } \
1680
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1681
- x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
1761
+ offset >>= 1; \
1762
+ for (int i = 0; i < offset; ++i) { \
1763
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1682
1764
  } \
1683
1765
  res = GGML_F32x4_REDUCE_ONE(x[0]); \
1684
1766
  }
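
This and the following *_REDUCE hunks replace the strided pairwise sums with a halving (tree) reduction: each pass folds the upper half of the register array into the lower half until only x[0] remains, which is equivalent but simpler and extends cleanly to longer register arrays. A scalar sketch of the same idea (plain C, no SIMD):

    #include <stdio.h>

    /* halve-and-add reduction; n must be a power of two */
    static float reduce_halving(float * x, int n) {
        for (int offset = n >> 1; offset > 0; offset >>= 1) {
            for (int i = 0; i < offset; ++i) {
                x[i] += x[offset + i];  /* fold upper half into lower half */
            }
        }
        return x[0];
    }

    int main(void) {
        float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        printf("%f\n", reduce_halving(v, 8));  /* prints 36.000000 */
        return 0;
    }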
@@ -1709,14 +1791,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1709
1791
  #define GGML_F16x8_MUL vmulq_f16
1710
1792
  #define GGML_F16x8_REDUCE(res, x) \
1711
1793
  { \
1712
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
1713
- x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
1794
+ int offset = GGML_F16_ARR >> 1; \
1795
+ for (int i = 0; i < offset; ++i) { \
1796
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1714
1797
  } \
1715
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
1716
- x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
1798
+ offset >>= 1; \
1799
+ for (int i = 0; i < offset; ++i) { \
1800
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1717
1801
  } \
1718
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
1719
- x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
1802
+ offset >>= 1; \
1803
+ for (int i = 0; i < offset; ++i) { \
1804
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1720
1805
  } \
1721
1806
  const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
1722
1807
  const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1783,14 +1868,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1783
1868
  #define GGML_F32x8_MUL _mm256_mul_ps
1784
1869
  #define GGML_F32x8_REDUCE(res, x) \
1785
1870
  { \
1786
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1787
- x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
1871
+ int offset = GGML_F32_ARR >> 1; \
1872
+ for (int i = 0; i < offset; ++i) { \
1873
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1788
1874
  } \
1789
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1790
- x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
1875
+ offset >>= 1; \
1876
+ for (int i = 0; i < offset; ++i) { \
1877
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1791
1878
  } \
1792
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1793
- x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
1879
+ offset >>= 1; \
1880
+ for (int i = 0; i < offset; ++i) { \
1881
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1794
1882
  } \
1795
1883
  const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
1796
1884
  _mm256_extractf128_ps(x[0], 1)); \
@@ -1880,14 +1968,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1880
1968
  #define GGML_F32x4_MUL vec_mul
1881
1969
  #define GGML_F32x4_REDUCE(res, x) \
1882
1970
  { \
1883
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1884
- x[2*i] = vec_add(x[2*i], x[2*i+1]); \
1971
+ int offset = GGML_F32_ARR >> 1; \
1972
+ for (int i = 0; i < offset; ++i) { \
1973
+ x[i] = vec_add(x[i], x[offset+i]); \
1885
1974
  } \
1886
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1887
- x[4*i] = vec_add(x[4*i], x[4*i+2]); \
1975
+ offset >>= 1; \
1976
+ for (int i = 0; i < offset; ++i) { \
1977
+ x[i] = vec_add(x[i], x[offset+i]); \
1888
1978
  } \
1889
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1890
- x[8*i] = vec_add(x[8*i], x[8*i+4]); \
1979
+ offset >>= 1; \
1980
+ for (int i = 0; i < offset; ++i) { \
1981
+ x[i] = vec_add(x[i], x[offset+i]); \
1891
1982
  } \
1892
1983
  res = vec_extract(x[0], 0) + \
1893
1984
  vec_extract(x[0], 1) + \
@@ -1943,14 +2034,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1943
2034
  #define GGML_F32x4_MUL wasm_f32x4_mul
1944
2035
  #define GGML_F32x4_REDUCE(res, x) \
1945
2036
  { \
1946
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1947
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
2037
+ int offset = GGML_F32_ARR >> 1; \
2038
+ for (int i = 0; i < offset; ++i) { \
2039
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1948
2040
  } \
1949
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1950
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
2041
+ offset >>= 1; \
2042
+ for (int i = 0; i < offset; ++i) { \
2043
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1951
2044
  } \
1952
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1953
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
2045
+ offset >>= 1; \
2046
+ for (int i = 0; i < offset; ++i) { \
2047
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1954
2048
  } \
1955
2049
  res = wasm_f32x4_extract_lane(x[0], 0) + \
1956
2050
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2005,14 +2099,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
2005
2099
  #define GGML_F16x4_MUL wasm_f32x4_mul
2006
2100
  #define GGML_F16x4_REDUCE(res, x) \
2007
2101
  { \
2008
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
2009
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
2102
+ int offset = GGML_F16_ARR >> 1; \
2103
+ for (int i = 0; i < offset; ++i) { \
2104
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2010
2105
  } \
2011
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
2012
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
2106
+ offset >>= 1; \
2107
+ for (int i = 0; i < offset; ++i) { \
2108
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2013
2109
  } \
2014
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
2015
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
2110
+ offset >>= 1; \
2111
+ for (int i = 0; i < offset; ++i) { \
2112
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2016
2113
  } \
2017
2114
  res = wasm_f32x4_extract_lane(x[0], 0) + \
2018
2115
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2054,14 +2151,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
2054
2151
  #define GGML_F32x4_MUL _mm_mul_ps
2055
2152
  #define GGML_F32x4_REDUCE(res, x) \
2056
2153
  { \
2057
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
2058
- x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
2154
+ int offset = GGML_F32_ARR >> 1; \
2155
+ for (int i = 0; i < offset; ++i) { \
2156
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2059
2157
  } \
2060
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
2061
- x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
2158
+ offset >>= 1; \
2159
+ for (int i = 0; i < offset; ++i) { \
2160
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2062
2161
  } \
2063
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
2064
- x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
2162
+ offset >>= 1; \
2163
+ for (int i = 0; i < offset; ++i) { \
2164
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2065
2165
  } \
2066
2166
  const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
2067
2167
  res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
@@ -3350,6 +3450,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
3350
3450
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
3351
3451
 
3352
3452
  static const float GELU_COEF_A = 0.044715f;
3453
+ static const float GELU_QUICK_COEF = -1.702f;
3353
3454
  static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
3354
3455
 
3355
3456
  inline static float ggml_gelu_f32(float x) {
@@ -3380,6 +3481,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
3380
3481
  }
3381
3482
  #endif
3382
3483
 
3484
+ inline static float ggml_gelu_quick_f32(float x) {
3485
+ return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
3486
+ }
3487
+
3488
+ //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
3489
+ // const uint16_t * i16 = (const uint16_t *) x;
3490
+ // for (int i = 0; i < n; ++i) {
3491
+ // y[i] = table_gelu_quick_f16[i16[i]];
3492
+ // }
3493
+ //}
3494
+
3495
+ #ifdef GGML_GELU_QUICK_FP16
3496
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
3497
+ uint16_t t;
3498
+ for (int i = 0; i < n; ++i) {
3499
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
3500
+ memcpy(&t, &fp16, sizeof(uint16_t));
3501
+ y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
3502
+ }
3503
+ }
3504
+ #else
3505
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
3506
+ for (int i = 0; i < n; ++i) {
3507
+ y[i] = ggml_gelu_quick_f32(x[i]);
3508
+ }
3509
+ }
3510
+ #endif
3511
+
3383
3512
  // Sigmoid Linear Unit (SiLU) function
3384
3513
  inline static float ggml_silu_f32(float x) {
3385
3514
  return x/(1.0f + expf(-x));
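
ggml_gelu_quick_f32 above implements the sigmoid approximation of GELU, x * sigmoid(1.702 * x); with GGML_GELU_QUICK_FP16 defined, the vector path instead looks the result up in the 64K-entry table_gelu_quick_f16 indexed by the fp16 bit pattern, mirroring what GELU and SiLU already do. A standalone sketch comparing the quick form against an erf-based reference (these helper names are illustrative, not the ggml internals):

    #include <math.h>
    #include <stdio.h>

    /* quick GELU: x * sigmoid(1.702 * x) */
    static float gelu_quick(float x) {
        return x * (1.0f / (1.0f + expf(-1.702f * x)));
    }

    /* reference GELU via the error function, for comparison */
    static float gelu_ref(float x) {
        return 0.5f * x * (1.0f + erff(x / sqrtf(2.0f)));
    }

    int main(void) {
        for (float x = -2.0f; x <= 2.01f; x += 1.0f) {
            printf("x=% .1f  quick=% .4f  ref=% .4f\n", x, gelu_quick(x), gelu_ref(x));
        }
        return 0;
    }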
@@ -3469,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
3469
3598
  *s = 1.f/(*s);
3470
3599
  }
3471
3600
 
3472
- //
3473
- // logging
3474
- //
3475
-
3476
- #if (GGML_DEBUG >= 1)
3477
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
3478
- #else
3479
- #define GGML_PRINT_DEBUG(...)
3480
- #endif
3481
-
3482
- #if (GGML_DEBUG >= 5)
3483
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
3484
- #else
3485
- #define GGML_PRINT_DEBUG_5(...)
3486
- #endif
3487
-
3488
- #if (GGML_DEBUG >= 10)
3489
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
3490
- #else
3491
- #define GGML_PRINT_DEBUG_10(...)
3492
- #endif
3493
-
3494
- #define GGML_PRINT(...) printf(__VA_ARGS__)
3495
-
3496
3601
  //
3497
3602
  // data types
3498
3603
  //
@@ -3610,6 +3715,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3610
3715
  "STEP",
3611
3716
  "RELU",
3612
3717
  "GELU",
3718
+ "GELU_QUICK",
3613
3719
  "SILU",
3614
3720
  "SILU_BACK",
3615
3721
  "NORM",
@@ -3638,21 +3744,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3638
3744
  "ROPE_BACK",
3639
3745
  "ALIBI",
3640
3746
  "CLAMP",
3641
- "CONV_1D_1S",
3642
- "CONV_1D_2S",
3747
+ "CONV_1D_S1_PH",
3748
+ "CONV_1D_S2_PH",
3749
+ "CONV_2D_SK_P0",
3643
3750
 
3644
3751
  "FLASH_ATTN",
3645
3752
  "FLASH_FF",
3646
3753
  "FLASH_ATTN_BACK",
3754
+ "WIN_PART",
3755
+ "WIN_UNPART",
3647
3756
 
3648
3757
  "MAP_UNARY",
3649
3758
  "MAP_BINARY",
3650
3759
 
3760
+ "MAP_CUSTOM1",
3761
+ "MAP_CUSTOM2",
3762
+ "MAP_CUSTOM3",
3763
+
3651
3764
  "CROSS_ENTROPY_LOSS",
3652
3765
  "CROSS_ENTROPY_LOSS_BACK",
3653
3766
  };
3654
3767
 
3655
- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
3768
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3656
3769
 
3657
3770
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3658
3771
  "none",
@@ -3678,6 +3791,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3678
3791
  "step(x)",
3679
3792
  "relu(x)",
3680
3793
  "gelu(x)",
3794
+ "gelu_quick(x)",
3681
3795
  "silu(x)",
3682
3796
  "silu_back(x)",
3683
3797
  "norm(x)",
@@ -3706,21 +3820,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3706
3820
  "rope_back(x)",
3707
3821
  "alibi(x)",
3708
3822
  "clamp(x)",
3709
- "conv_1d_1s(x)",
3710
- "conv_1d_2s(x)",
3823
+ "conv_1d_s1_ph(x)",
3824
+ "conv_1d_s2_ph(x)",
3825
+ "conv_2d_sk_p0(x)",
3711
3826
 
3712
3827
  "flash_attn(x)",
3713
3828
  "flash_ff(x)",
3714
3829
  "flash_attn_back(x)",
3830
+ "win_part(x)",
3831
+ "win_unpart(x)",
3715
3832
 
3716
3833
  "f(x)",
3717
3834
  "f(x,y)",
3718
3835
 
3836
+ "custom(x)",
3837
+ "custom(x,y)",
3838
+ "custom(x,y,z)",
3839
+
3719
3840
  "cross_entropy_loss(x,y)",
3720
3841
  "cross_entropy_loss_back(x,y)",
3721
3842
  };
3722
3843
 
3723
- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
3844
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3724
3845
 
3725
3846
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3726
3847
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3751,12 +3872,31 @@ struct ggml_context_container {
3751
3872
  struct ggml_context context;
3752
3873
  };
3753
3874
 
3875
+ //
3876
+ // NUMA support
3877
+ //
3878
+
3879
+ #define GGML_NUMA_MAX_NODES 8
3880
+ #define GGML_NUMA_MAX_CPUS 512
3881
+
3882
+ struct ggml_numa_node {
3883
+ uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
3884
+ uint32_t n_cpus;
3885
+ };
3886
+
3887
+ struct ggml_numa_nodes {
3888
+ struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
3889
+ uint32_t n_nodes;
3890
+ uint32_t total_cpus; // hardware threads on system
3891
+ };
3892
+
3754
3893
  //
3755
3894
  // ggml state
3756
3895
  //
3757
3896
 
3758
3897
  struct ggml_state {
3759
3898
  struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
3899
+ struct ggml_numa_nodes numa;
3760
3900
  };
3761
3901
 
3762
3902
  // global state
@@ -3781,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
3781
3921
  atomic_fetch_sub(&g_state_barrier, 1);
3782
3922
  }
3783
3923
 
3924
+ void ggml_numa_init(void) {
3925
+ if (g_state.numa.n_nodes > 0) {
3926
+ fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
3927
+
3928
+ return;
3929
+ }
3930
+
3931
+ #ifdef __linux__
3932
+ struct stat st;
3933
+ char path[256];
3934
+ int rv;
3935
+
3936
+ // enumerate nodes
3937
+ while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
3938
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
3939
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3940
+ if (stat(path, &st) != 0) { break; }
3941
+ ++g_state.numa.n_nodes;
3942
+ }
3943
+
3944
+ // enumerate CPUs
3945
+ while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
3946
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
3947
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3948
+ if (stat(path, &st) != 0) { break; }
3949
+ ++g_state.numa.total_cpus;
3950
+ }
3951
+
3952
+ GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
3953
+
3954
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
3955
+ g_state.numa.n_nodes = 0;
3956
+ return;
3957
+ }
3958
+
3959
+ for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
3960
+ struct ggml_numa_node * node = &g_state.numa.nodes[n];
3961
+ GGML_PRINT_DEBUG("CPUs on node %u:", n);
3962
+ node->n_cpus = 0;
3963
+ for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
3964
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
3965
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3966
+ if (stat(path, &st) == 0) {
3967
+ node->cpus[node->n_cpus++] = c;
3968
+ GGML_PRINT_DEBUG(" %u", c);
3969
+ }
3970
+ }
3971
+ GGML_PRINT_DEBUG("\n");
3972
+ }
3973
+
3974
+ if (ggml_is_numa()) {
3975
+ FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
3976
+ if (fptr != NULL) {
3977
+ char buf[42];
3978
+ if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
3979
+ GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
3980
+ }
3981
+ fclose(fptr);
3982
+ }
3983
+ }
3984
+ #else
3985
+ // TODO
3986
+ #endif
3987
+ }
3988
+
3989
+ bool ggml_is_numa(void) {
3990
+ return g_state.numa.n_nodes > 1;
3991
+ }
3992
+
3784
3993
  ////////////////////////////////////////////////////////////////////////////////
3785
3994
 
3786
3995
  void ggml_print_object(const struct ggml_object * obj) {
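
ggml_numa_init() above discovers the topology on Linux by stat()-ing /sys/devices/system/node/node%u and /sys/devices/system/cpu/cpu%u, and ggml_is_numa() then reports whether more than one node was found (llama.cpp calls this when its new --numa option is set). A hedged usage sketch; call it once, early, before spawning worker threads:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        ggml_numa_init();  // enumerate NUMA nodes and CPUs (no-op outside Linux)
        if (ggml_is_numa()) {
            printf("NUMA system detected: consider disabling kernel NUMA balancing\n");
        }
        return 0;
    }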
@@ -4011,7 +4220,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4011
4220
  // initialize time system (required on Windows)
4012
4221
  ggml_time_init();
4013
4222
 
4014
- // initialize GELU, SILU and EXP F32 tables
4223
+ // initialize GELU, Quick GELU, SILU and EXP F32 tables
4015
4224
  {
4016
4225
  const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
4017
4226
 
@@ -4021,13 +4230,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4021
4230
  memcpy(&ii, &ui, sizeof(ii));
4022
4231
  const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
4023
4232
  table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
4233
+ table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
4024
4234
  table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
4025
4235
  table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
4026
4236
  }
4027
4237
 
4028
4238
  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
4029
4239
 
4030
- GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4240
+ GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4031
4241
  }
4032
4242
 
4033
4243
  // initialize g_state
@@ -4036,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4036
4246
 
4037
4247
  g_state = (struct ggml_state) {
4038
4248
  /*.contexts =*/ { { 0 } },
4249
+ /*.numa =*/ {
4250
+ .n_nodes = 0,
4251
+ .total_cpus = 0,
4252
+ },
4039
4253
  };
4040
4254
 
4041
4255
  for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -4148,14 +4362,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4148
4362
  ctx->no_alloc = no_alloc;
4149
4363
  }
4150
4364
 
4151
- void * ggml_get_mem_buffer(struct ggml_context * ctx) {
4365
+ void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
4152
4366
  return ctx->mem_buffer;
4153
4367
  }
4154
4368
 
4155
- size_t ggml_get_mem_size(struct ggml_context * ctx) {
4369
+ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
4156
4370
  return ctx->mem_size;
4157
4371
  }
4158
4372
 
4373
+ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4374
+ size_t max_size = 0;
4375
+
4376
+ struct ggml_object * obj = ctx->objects_begin;
4377
+
4378
+ while (obj != NULL) {
4379
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4380
+
4381
+ const size_t size = ggml_nbytes(tensor);
4382
+
4383
+ if (max_size < size) {
4384
+ max_size = size;
4385
+ }
4386
+
4387
+ obj = obj->next;
4388
+ }
4389
+
4390
+ return max_size;
4391
+ }
4392
+
4159
4393
  // IMPORTANT:
4160
4394
  // when creating "opt" tensors, always save and load the scratch buffer
4161
4395
  // this is an error prone process, but it is necessary to support inplace
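
ggml_get_max_tensor_size() above walks the context's object list and returns the byte size of the largest tensor, which is useful when sizing scratch or backend buffers up front. A hedged usage sketch (the memory size and tensor shapes are illustrative):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);    //  4096 bytes
        ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);  // 16384 bytes

        printf("largest tensor: %zu bytes\n", ggml_get_max_tensor_size(ctx));
        ggml_free(ctx);
        return 0;
    }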
@@ -4639,15 +4873,25 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
4639
4873
  return tensor->name;
4640
4874
  }
4641
4875
 
4642
- void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4876
+ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4643
4877
  strncpy(tensor->name, name, sizeof(tensor->name));
4644
4878
  tensor->name[sizeof(tensor->name) - 1] = '\0';
4879
+ return tensor;
4880
+ }
4881
+
4882
+ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
4883
+ va_list args;
4884
+ va_start(args, fmt);
4885
+ vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
4886
+ va_end(args);
4887
+ return tensor;
4645
4888
  }
4646
4889
 
4647
4890
  struct ggml_tensor * ggml_view_tensor(
4648
4891
  struct ggml_context * ctx,
4649
4892
  const struct ggml_tensor * src) {
4650
4893
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
4894
+ ggml_format_name(result, "%s (view)", src->name);
4651
4895
 
4652
4896
  result->nb[0] = src->nb[0];
4653
4897
  result->nb[1] = src->nb[1];
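
ggml_set_name() now returns its tensor so naming can be chained, and the new printf-style ggml_format_name() is what the view/reshape/permute builders below use to derive names like "w (view)" or "w (reshaped)". A short hedged sketch (the helper and the naming scheme are illustrative):

    #include "ggml.h"

    // name a freshly created weight inline while building a model
    static struct ggml_tensor * make_weight(struct ggml_context * ctx, int layer, int64_t n) {
        struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
        return ggml_format_name(w, "layers.%d.w", layer);  // e.g. "layers.3.w"
    }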
@@ -5420,6 +5664,40 @@ struct ggml_tensor * ggml_gelu_inplace(
5420
5664
  return ggml_gelu_impl(ctx, a, true);
5421
5665
  }
5422
5666
 
5667
+ // ggml_gelu_quick
5668
+
5669
+ struct ggml_tensor * ggml_gelu_quick_impl(
5670
+ struct ggml_context * ctx,
5671
+ struct ggml_tensor * a,
5672
+ bool inplace) {
5673
+ bool is_node = false;
5674
+
5675
+ if (!inplace && (a->grad)) {
5676
+ is_node = true;
5677
+ }
5678
+
5679
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5680
+
5681
+ result->op = GGML_OP_GELU_QUICK;
5682
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5683
+ result->src0 = a;
5684
+ result->src1 = NULL;
5685
+
5686
+ return result;
5687
+ }
5688
+
5689
+ struct ggml_tensor * ggml_gelu_quick(
5690
+ struct ggml_context * ctx,
5691
+ struct ggml_tensor * a) {
5692
+ return ggml_gelu_quick_impl(ctx, a, false);
5693
+ }
5694
+
5695
+ struct ggml_tensor * ggml_gelu_quick_inplace(
5696
+ struct ggml_context * ctx,
5697
+ struct ggml_tensor * a) {
5698
+ return ggml_gelu_quick_impl(ctx, a, true);
5699
+ }
5700
+
5423
5701
  // ggml_silu
5424
5702
 
5425
5703
  struct ggml_tensor * ggml_silu_impl(
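
ggml_gelu_quick()/ggml_gelu_quick_inplace() above expose the activation as a graph op (GGML_OP_GELU_QUICK); no backward pass for it is added in this diff. A hedged end-to-end sketch; the one-shot ggml_build_forward/ggml_graph_compute calls reflect my understanding of this ggml version's API, and the sizes are illustrative:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(x, 1.0f);

        struct ggml_tensor * y = ggml_gelu_quick(ctx, x);

        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_graph_compute(ctx, &gf);
        // y->data now holds gelu_quick(1.0f) in every element

        ggml_free(ctx);
        return 0;
    }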
@@ -5775,6 +6053,11 @@ struct ggml_tensor * ggml_cpy_impl(
5775
6053
 
5776
6054
  // make a view of the destination
5777
6055
  struct ggml_tensor * result = ggml_view_tensor(ctx, b);
6056
+ if (strlen(b->name) > 0) {
6057
+ ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
6058
+ } else {
6059
+ ggml_format_name(result, "%s (copy)", a->name);
6060
+ }
5778
6061
 
5779
6062
  result->op = GGML_OP_CPY;
5780
6063
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5811,6 +6094,7 @@ struct ggml_tensor * ggml_cont_impl(
5811
6094
  }
5812
6095
 
5813
6096
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6097
+ ggml_format_name(result, "%s (cont)", a->name);
5814
6098
 
5815
6099
  result->op = GGML_OP_CONT;
5816
6100
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5854,6 +6138,7 @@ struct ggml_tensor * ggml_reshape(
5854
6138
  }
5855
6139
 
5856
6140
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6141
+ ggml_format_name(result, "%s (reshaped)", a->name);
5857
6142
 
5858
6143
  result->op = GGML_OP_RESHAPE;
5859
6144
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5878,6 +6163,7 @@ struct ggml_tensor * ggml_reshape_1d(
5878
6163
 
5879
6164
  const int64_t ne[1] = { ne0 };
5880
6165
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6166
+ ggml_format_name(result, "%s (reshaped)", a->name);
5881
6167
 
5882
6168
  result->op = GGML_OP_RESHAPE;
5883
6169
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5903,6 +6189,7 @@ struct ggml_tensor * ggml_reshape_2d(
5903
6189
 
5904
6190
  const int64_t ne[2] = { ne0, ne1 };
5905
6191
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6192
+ ggml_format_name(result, "%s (reshaped)", a->name);
5906
6193
 
5907
6194
  result->op = GGML_OP_RESHAPE;
5908
6195
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5929,6 +6216,7 @@ struct ggml_tensor * ggml_reshape_3d(
5929
6216
 
5930
6217
  const int64_t ne[3] = { ne0, ne1, ne2 };
5931
6218
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6219
+ ggml_format_name(result, "%s (reshaped)", a->name);
5932
6220
 
5933
6221
  result->op = GGML_OP_RESHAPE;
5934
6222
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5957,6 +6245,7 @@ struct ggml_tensor * ggml_reshape_4d(
5957
6245
 
5958
6246
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
5959
6247
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6248
+ ggml_format_name(result, "%s (reshaped)", a->name);
5960
6249
 
5961
6250
  result->op = GGML_OP_RESHAPE;
5962
6251
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5981,10 +6270,12 @@ struct ggml_tensor * ggml_view_1d(
5981
6270
  }
5982
6271
 
5983
6272
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6273
+ ggml_format_name(result, "%s (view)", a->name);
5984
6274
 
5985
6275
  ggml_scratch_save(ctx);
5986
6276
 
5987
6277
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6278
+ ggml_set_name(offs, "offset");
5988
6279
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
5989
6280
 
5990
6281
  ggml_scratch_load(ctx);
@@ -6017,10 +6308,12 @@ struct ggml_tensor * ggml_view_2d(
6017
6308
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6018
6309
 
6019
6310
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6311
+ ggml_format_name(result, "%s (view)", a->name);
6020
6312
 
6021
6313
  ggml_scratch_save(ctx);
6022
6314
 
6023
6315
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6316
+ ggml_set_name(offs, "offset");
6024
6317
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6025
6318
 
6026
6319
  ggml_scratch_load(ctx);
@@ -6059,10 +6352,12 @@ struct ggml_tensor * ggml_view_3d(
6059
6352
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6060
6353
 
6061
6354
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6355
+ ggml_format_name(result, "%s (view)", a->name);
6062
6356
 
6063
6357
  ggml_scratch_save(ctx);
6064
6358
 
6065
6359
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6360
+ ggml_set_name(offs, "offset");
6066
6361
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6067
6362
 
6068
6363
  ggml_scratch_load(ctx);
@@ -6103,10 +6398,12 @@ struct ggml_tensor * ggml_view_4d(
6103
6398
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6104
6399
 
6105
6400
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6401
+ ggml_format_name(result, "%s (view)", a->name);
6106
6402
 
6107
6403
  ggml_scratch_save(ctx);
6108
6404
 
6109
6405
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6406
+ ggml_set_name(offs, "offset");
6110
6407
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6111
6408
 
6112
6409
  ggml_scratch_load(ctx);
@@ -6152,6 +6449,7 @@ struct ggml_tensor * ggml_permute(
6152
6449
  }
6153
6450
 
6154
6451
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6452
+ ggml_format_name(result, "%s (permuted)", a->name);
6155
6453
 
6156
6454
  int ne[GGML_MAX_DIMS];
6157
6455
  int nb[GGML_MAX_DIMS];
@@ -6211,6 +6509,7 @@ struct ggml_tensor * ggml_transpose(
6211
6509
  }
6212
6510
 
6213
6511
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6512
+ ggml_format_name(result, "%s (transposed)", a->name);
6214
6513
 
6215
6514
  result->ne[0] = a->ne[1];
6216
6515
  result->ne[1] = a->ne[0];
@@ -6479,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
6479
6778
  int n_past,
6480
6779
  int n_dims,
6481
6780
  int mode,
6781
+ int n_ctx,
6482
6782
  bool inplace) {
6483
6783
  GGML_ASSERT(n_past >= 0);
6484
6784
  bool is_node = false;
@@ -6491,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
6491
6791
 
6492
6792
  ggml_scratch_save(ctx);
6493
6793
 
6494
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6794
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6495
6795
 
6496
6796
  ((int32_t *) b->data)[0] = n_past;
6497
6797
  ((int32_t *) b->data)[1] = n_dims;
6498
6798
  ((int32_t *) b->data)[2] = mode;
6799
+ ((int32_t *) b->data)[3] = n_ctx;
6499
6800
 
6500
6801
  ggml_scratch_load(ctx);
6501
6802
 
@@ -6512,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
6512
6813
  struct ggml_tensor * a,
6513
6814
  int n_past,
6514
6815
  int n_dims,
6515
- int mode) {
6516
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
6816
+ int mode,
6817
+ int n_ctx) {
6818
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
6517
6819
  }
6518
6820
 
6519
6821
  struct ggml_tensor * ggml_rope_inplace(
@@ -6521,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
6521
6823
  struct ggml_tensor * a,
6522
6824
  int n_past,
6523
6825
  int n_dims,
6524
- int mode) {
6525
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
6826
+ int mode,
6827
+ int n_ctx) {
6828
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
6526
6829
  }
6527
6830
 
6528
6831
  // ggml_rope_back
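
ggml_rope()/ggml_rope_inplace() gain an n_ctx parameter; all four integers now travel in a 4-element I32 tensor that the compute kernels read back, and n_ctx is only consulted by the new GLM mode (mode & 4) further down. A hedged call sketch with illustrative shapes:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        const int head_dim = 64, n_head = 8, n_tokens = 4;
        const int n_past = 0, mode = 0, n_ctx = 512;

        struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_head, n_tokens);
        struct ggml_tensor * q_rot = ggml_rope_inplace(ctx, q, n_past, head_dim, mode, n_ctx);
        (void) q_rot;  // node is built here; it would be evaluated as part of a graph

        ggml_free(ctx);
        return 0;
    }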
@@ -6619,7 +6922,7 @@ struct ggml_tensor * ggml_clamp(
6619
6922
 
6620
6923
  ggml_scratch_save(ctx);
6621
6924
 
6622
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6925
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
6623
6926
 
6624
6927
  ((float *) b->data)[0] = min;
6625
6928
  ((float *) b->data)[1] = max;
@@ -6634,9 +6937,9 @@ struct ggml_tensor * ggml_clamp(
6634
6937
  return result;
6635
6938
  }
6636
6939
 
6637
- // ggml_conv_1d_1s
6940
+ // ggml_conv_1d_s1_ph
6638
6941
 
6639
- struct ggml_tensor * ggml_conv_1d_1s(
6942
+ struct ggml_tensor * ggml_conv_1d_s1_ph(
6640
6943
  struct ggml_context * ctx,
6641
6944
  struct ggml_tensor * a,
6642
6945
  struct ggml_tensor * b) {
@@ -6653,7 +6956,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
6653
6956
  const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
6654
6957
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6655
6958
 
6656
- result->op = GGML_OP_CONV_1D_1S;
6959
+ result->op = GGML_OP_CONV_1D_S1_PH;
6657
6960
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6658
6961
  result->src0 = a;
6659
6962
  result->src1 = b;
@@ -6661,9 +6964,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
6661
6964
  return result;
6662
6965
  }
6663
6966
 
6664
- // ggml_conv_1d_2s
6967
+ // ggml_conv_1d_s2_ph
6665
6968
 
6666
- struct ggml_tensor * ggml_conv_1d_2s(
6969
+ struct ggml_tensor * ggml_conv_1d_s2_ph(
6667
6970
  struct ggml_context * ctx,
6668
6971
  struct ggml_tensor * a,
6669
6972
  struct ggml_tensor * b) {
@@ -6680,7 +6983,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
6680
6983
  const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
6681
6984
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6682
6985
 
6683
- result->op = GGML_OP_CONV_1D_2S;
6986
+ result->op = GGML_OP_CONV_1D_S2_PH;
6987
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6988
+ result->src0 = a;
6989
+ result->src1 = b;
6990
+
6991
+ return result;
6992
+ }
6993
+
6994
+ // ggml_conv_2d_sk_p0
6995
+
6996
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
6997
+ struct ggml_context * ctx,
6998
+ struct ggml_tensor * a,
6999
+ struct ggml_tensor * b) {
7000
+ GGML_ASSERT(b->ne[3] == 1);
7001
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
7002
+ GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
7003
+ GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
7004
+ bool is_node = false;
7005
+
7006
+ if (a->grad || b->grad) {
7007
+ GGML_ASSERT(false); // TODO: implement backward
7008
+ is_node = true;
7009
+ }
7010
+
7011
+ const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
7012
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7013
+
7014
+ result->op = GGML_OP_CONV_2D_SK_P0;
6684
7015
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6685
7016
  result->src0 = a;
6686
7017
  result->src1 = b;
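
ggml_conv_2d_sk_p0() above is a 2D convolution whose stride equals the kernel size with no padding, so a kernel a of shape [KW, KH, C, OC] must tile the input b of shape [W, H, C, 1] exactly (that is what the asserts enforce) and the result is [W/KW, H/KH, OC, 1]. A small worked example of the shape arithmetic; the ViT-style patch-embedding numbers are illustrative:

    #include <stdio.h>

    int main(void) {
        const long a_ne[4] = { 16, 16, 3, 768 };    // kernel [KW, KH, C, OC]
        const long b_ne[4] = { 1024, 1024, 3, 1 };  // image  [W,  H,  C, 1]
        // ggml_conv_2d_sk_p0 result shape: [W/KW, H/KH, OC, 1]
        printf("result: [%ld, %ld, %ld, 1]\n",
               b_ne[0]/a_ne[0], b_ne[1]/a_ne[1], a_ne[3]);  // [64, 64, 768, 1]
        return 0;
    }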
@@ -6814,45 +7145,133 @@ struct ggml_tensor * ggml_flash_attn_back(
6814
7145
  return result;
6815
7146
  }
6816
7147
 
7148
+ // ggml_win_part
6817
7149
 
6818
- // ggml_map_unary
7150
+ struct ggml_tensor * ggml_win_part(
7151
+ struct ggml_context * ctx,
7152
+ struct ggml_tensor * a,
7153
+ int w) {
7154
+ GGML_ASSERT(a->ne[3] == 1);
7155
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
6819
7156
 
6820
- struct ggml_tensor * ggml_map_unary_impl_f32(
6821
- struct ggml_context * ctx,
6822
- struct ggml_tensor * a,
6823
- const ggml_unary_op_f32_t fun,
6824
- bool inplace) {
6825
7157
  bool is_node = false;
6826
7158
 
6827
- if (!inplace && a->grad) {
7159
+ if (a->grad) {
7160
+ GGML_ASSERT(false); // TODO: implement backward
6828
7161
  is_node = true;
6829
7162
  }
6830
7163
 
6831
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
6832
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
6833
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7164
+ // padding
7165
+ const int px = (w - a->ne[1]%w)%w;
7166
+ const int py = (w - a->ne[2]%w)%w;
6834
7167
 
6835
- result->op = GGML_OP_MAP_UNARY;
7168
+ const int npx = (px + a->ne[1])/w;
7169
+ const int npy = (py + a->ne[2])/w;
7170
+ const int np = npx*npy;
7171
+
7172
+ const int64_t ne[4] = { a->ne[0], w, w, np, };
7173
+
7174
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7175
+
7176
+ ggml_scratch_save(ctx);
7177
+
7178
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7179
+
7180
+ ((int32_t *) b->data)[0] = npx;
7181
+ ((int32_t *) b->data)[1] = npy;
7182
+ ((int32_t *) b->data)[2] = w;
7183
+
7184
+ ggml_scratch_load(ctx);
7185
+
7186
+ result->op = GGML_OP_WIN_PART;
6836
7187
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6837
7188
  result->src0 = a;
6838
- result->opt[0] = addr_tensor;
7189
+ result->src1 = NULL;
7190
+ result->opt[0] = b;
6839
7191
 
6840
7192
  return result;
6841
7193
  }
6842
7194
 
6843
- struct ggml_tensor * ggml_map_unary_f32(
6844
- struct ggml_context * ctx,
6845
- struct ggml_tensor * a,
6846
- const ggml_unary_op_f32_t fun) {
6847
- return ggml_map_unary_impl_f32(ctx, a, fun, false);
6848
- }
7195
+ // ggml_win_unpart
6849
7196
 
6850
- struct ggml_tensor * ggml_map_unary_inplace_f32(
6851
- struct ggml_context * ctx,
6852
- struct ggml_tensor * a,
6853
- const ggml_unary_op_f32_t fun) {
6854
- return ggml_map_unary_impl_f32(ctx, a, fun, true);
6855
- }
7197
+ struct ggml_tensor * ggml_win_unpart(
7198
+ struct ggml_context * ctx,
7199
+ struct ggml_tensor * a,
7200
+ int w0,
7201
+ int h0,
7202
+ int w) {
7203
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
7204
+
7205
+ bool is_node = false;
7206
+
7207
+ if (a->grad) {
7208
+ GGML_ASSERT(false); // TODO: implement backward
7209
+ is_node = true;
7210
+ }
7211
+
7212
+ const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7213
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7214
+
7215
+ ggml_scratch_save(ctx);
7216
+
7217
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7218
+
7219
+ ((int32_t *) b->data)[0] = w;
7220
+
7221
+ ggml_scratch_load(ctx);
7222
+
7223
+ result->op = GGML_OP_WIN_UNPART;
7224
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7225
+ result->src0 = a;
7226
+ result->src1 = NULL;
7227
+ result->opt[0] = b;
7228
+
7229
+ return result;
7230
+ }
7231
+
7232
+ // ggml_map_unary
7233
+
7234
+ struct ggml_tensor * ggml_map_unary_impl_f32(
7235
+ struct ggml_context * ctx,
7236
+ struct ggml_tensor * a,
7237
+ const ggml_unary_op_f32_t fun,
7238
+ bool inplace) {
7239
+ bool is_node = false;
7240
+
7241
+ if (!inplace && a->grad) {
7242
+ is_node = true;
7243
+ }
7244
+
7245
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7246
+
7247
+ ggml_scratch_save(ctx);
7248
+
7249
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7250
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7251
+
7252
+ ggml_scratch_load(ctx);
7253
+
7254
+ result->op = GGML_OP_MAP_UNARY;
7255
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7256
+ result->src0 = a;
7257
+ result->opt[0] = addr_tensor;
7258
+
7259
+ return result;
7260
+ }
7261
+
7262
+ struct ggml_tensor * ggml_map_unary_f32(
7263
+ struct ggml_context * ctx,
7264
+ struct ggml_tensor * a,
7265
+ const ggml_unary_op_f32_t fun) {
7266
+ return ggml_map_unary_impl_f32(ctx, a, fun, false);
7267
+ }
7268
+
7269
+ struct ggml_tensor * ggml_map_unary_inplace_f32(
7270
+ struct ggml_context * ctx,
7271
+ struct ggml_tensor * a,
7272
+ const ggml_unary_op_f32_t fun) {
7273
+ return ggml_map_unary_impl_f32(ctx, a, fun, true);
7274
+ }
6856
7275
 
6857
7276
  // ggml_map_binary
6858
7277
 
@@ -6870,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
6870
7289
  is_node = true;
6871
7290
  }
6872
7291
 
7292
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7293
+
7294
+ ggml_scratch_save(ctx);
7295
+
6873
7296
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
6874
7297
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
6875
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7298
+
7299
+ ggml_scratch_load(ctx);
6876
7300
 
6877
7301
  result->op = GGML_OP_MAP_BINARY;
6878
7302
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6899,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
6899
7323
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
6900
7324
  }
6901
7325
 
7326
+ // ggml_map_custom1
7327
+
7328
+ struct ggml_tensor * ggml_map_custom1_impl_f32(
7329
+ struct ggml_context * ctx,
7330
+ struct ggml_tensor * a,
7331
+ const ggml_custom1_op_f32_t fun,
7332
+ bool inplace) {
7333
+ bool is_node = false;
7334
+
7335
+ if (!inplace && a->grad) {
7336
+ is_node = true;
7337
+ }
7338
+
7339
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7340
+
7341
+ ggml_scratch_save(ctx);
7342
+
7343
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7344
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7345
+
7346
+ ggml_scratch_load(ctx);
7347
+
7348
+ result->op = GGML_OP_MAP_CUSTOM1;
7349
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7350
+ result->src0 = a;
7351
+ result->opt[0] = addr_tensor;
7352
+
7353
+ return result;
7354
+ }
7355
+
7356
+ struct ggml_tensor * ggml_map_custom1_f32(
7357
+ struct ggml_context * ctx,
7358
+ struct ggml_tensor * a,
7359
+ const ggml_custom1_op_f32_t fun) {
7360
+ return ggml_map_custom1_impl_f32(ctx, a, fun, false);
7361
+ }
7362
+
7363
+ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7364
+ struct ggml_context * ctx,
7365
+ struct ggml_tensor * a,
7366
+ const ggml_custom1_op_f32_t fun) {
7367
+ return ggml_map_custom1_impl_f32(ctx, a, fun, true);
7368
+ }
7369
+
7370
+ // ggml_map_custom2
7371
+
7372
+ struct ggml_tensor * ggml_map_custom2_impl_f32(
7373
+ struct ggml_context * ctx,
7374
+ struct ggml_tensor * a,
7375
+ struct ggml_tensor * b,
7376
+ const ggml_custom2_op_f32_t fun,
7377
+ bool inplace) {
7378
+ bool is_node = false;
7379
+
7380
+ if (!inplace && (a->grad || b->grad)) {
7381
+ is_node = true;
7382
+ }
7383
+
7384
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7385
+
7386
+ ggml_scratch_save(ctx);
7387
+
7388
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7389
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7390
+
7391
+ ggml_scratch_load(ctx);
7392
+
7393
+ result->op = GGML_OP_MAP_CUSTOM2;
7394
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7395
+ result->src0 = a;
7396
+ result->src1 = b;
7397
+ result->opt[0] = addr_tensor;
7398
+
7399
+ return result;
7400
+ }
7401
+
7402
+ struct ggml_tensor * ggml_map_custom2_f32(
7403
+ struct ggml_context * ctx,
7404
+ struct ggml_tensor * a,
7405
+ struct ggml_tensor * b,
7406
+ const ggml_custom2_op_f32_t fun) {
7407
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
7408
+ }
7409
+
7410
+ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7411
+ struct ggml_context * ctx,
7412
+ struct ggml_tensor * a,
7413
+ struct ggml_tensor * b,
7414
+ const ggml_custom2_op_f32_t fun) {
7415
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
7416
+ }
7417
+
7418
+ // ggml_map_custom3
7419
+
7420
+ struct ggml_tensor * ggml_map_custom3_impl_f32(
7421
+ struct ggml_context * ctx,
7422
+ struct ggml_tensor * a,
7423
+ struct ggml_tensor * b,
7424
+ struct ggml_tensor * c,
7425
+ const ggml_custom3_op_f32_t fun,
7426
+ bool inplace) {
7427
+ bool is_node = false;
7428
+
7429
+ if (!inplace && (a->grad || b->grad || c->grad)) {
7430
+ is_node = true;
7431
+ }
7432
+
7433
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7434
+
7435
+ ggml_scratch_save(ctx);
7436
+
7437
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7438
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7439
+
7440
+ ggml_scratch_load(ctx);
7441
+
7442
+ result->op = GGML_OP_MAP_CUSTOM3;
7443
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7444
+ result->src0 = a;
7445
+ result->src1 = b;
7446
+ result->opt[0] = addr_tensor;
7447
+ result->opt[1] = c;
7448
+
7449
+ return result;
7450
+ }
7451
+
7452
+ struct ggml_tensor * ggml_map_custom3_f32(
7453
+ struct ggml_context * ctx,
7454
+ struct ggml_tensor * a,
7455
+ struct ggml_tensor * b,
7456
+ struct ggml_tensor * c,
7457
+ const ggml_custom3_op_f32_t fun) {
7458
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
7459
+ }
7460
+
7461
+ struct ggml_tensor * ggml_map_custom3_inplace_f32(
7462
+ struct ggml_context * ctx,
7463
+ struct ggml_tensor * a,
7464
+ struct ggml_tensor * b,
7465
+ struct ggml_tensor * c,
7466
+ const ggml_custom3_op_f32_t fun) {
7467
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
7468
+ }
7469
+
6902
7470
  // ggml_cross_entropy_loss
6903
7471
 
6904
7472
  struct ggml_tensor * ggml_cross_entropy_loss(
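
The map_custom1/2/3 builders above let arbitrary user callbacks run as graph nodes over f32 tensors; as with map_unary/map_binary, the function pointer is stashed in a small I32 tensor in opt[0], now wrapped in ggml_scratch_save/load so it never lands inside a scratch buffer. A hedged sketch of a custom1 callback, assuming the signature void (*)(struct ggml_tensor * dst, const struct ggml_tensor * src) and contiguous tensors:

    #include <math.h>
    #include "ggml.h"

    // custom elementwise op: dst = exp(src); naive, assumes contiguous f32 data
    static void my_exp_f32(struct ggml_tensor * dst, const struct ggml_tensor * src) {
        const int64_t n = ggml_nelements(dst);
        float * d = (float *) dst->data;
        const float * s = (const float *) src->data;
        for (int64_t i = 0; i < n; ++i) {
            d[i] = expf(s[i]);
        }
    }

    // usage while building a graph:
    //   struct ggml_tensor * y = ggml_map_custom1_f32(ctx, x, my_exp_f32);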
@@ -7892,7 +8460,7 @@ static void ggml_compute_forward_add_q_f32(
7892
8460
 
7893
8461
  void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
7894
8462
  float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
7895
- void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
8463
+ void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
7896
8464
 
7897
8465
  assert(ne00 % 32 == 0);
7898
8466
 
@@ -9453,8 +10021,65 @@ static void ggml_compute_forward_gelu(
9453
10021
  GGML_ASSERT(false);
9454
10022
  } break;
9455
10023
  }
10024
+ }
10025
+
10026
+ // ggml_compute_forward_gelu_quick
10027
+
10028
+ static void ggml_compute_forward_gelu_quick_f32(
10029
+ const struct ggml_compute_params * params,
10030
+ const struct ggml_tensor * src0,
10031
+ struct ggml_tensor * dst) {
10032
+ GGML_ASSERT(ggml_is_contiguous(src0));
10033
+ GGML_ASSERT(ggml_is_contiguous(dst));
10034
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
10035
+
10036
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10037
+ return;
10038
+ }
10039
+
10040
+ const int ith = params->ith;
10041
+ const int nth = params->nth;
10042
+
10043
+ const int nc = src0->ne[0];
10044
+ const int nr = ggml_nrows(src0);
10045
+
10046
+ // rows per thread
10047
+ const int dr = (nr + nth - 1)/nth;
10048
+
10049
+ // row range for this thread
10050
+ const int ir0 = dr*ith;
10051
+ const int ir1 = MIN(ir0 + dr, nr);
10052
+
10053
+ for (int i1 = ir0; i1 < ir1; i1++) {
10054
+ ggml_vec_gelu_quick_f32(nc,
10055
+ (float *) ((char *) dst->data + i1*( dst->nb[1])),
10056
+ (float *) ((char *) src0->data + i1*(src0->nb[1])));
10057
+
10058
+ #ifndef NDEBUG
10059
+ for (int k = 0; k < nc; k++) {
10060
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
10061
+ UNUSED(x);
10062
+ assert(!isnan(x));
10063
+ assert(!isinf(x));
10064
+ }
10065
+ #endif
10066
+ }
10067
+ }
9456
10068
 
9457
- //printf("XXXXXXXX gelu\n");
10069
+ static void ggml_compute_forward_gelu_quick(
10070
+ const struct ggml_compute_params * params,
10071
+ const struct ggml_tensor * src0,
10072
+ struct ggml_tensor * dst) {
10073
+ switch (src0->type) {
10074
+ case GGML_TYPE_F32:
10075
+ {
10076
+ ggml_compute_forward_gelu_quick_f32(params, src0, dst);
10077
+ } break;
10078
+ default:
10079
+ {
10080
+ GGML_ASSERT(false);
10081
+ } break;
10082
+ }
9458
10083
  }
9459
10084
 
9460
10085
  // ggml_compute_forward_silu
@@ -10852,7 +11477,7 @@ static void ggml_compute_forward_set_f32(
10852
11477
  const int im2 = (ne12 == 0 ? 0 : ne12-1);
10853
11478
  const int im3 = (ne13 == 0 ? 0 : ne13-1);
10854
11479
 
10855
- GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst));
11480
+ GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
10856
11481
 
10857
11482
  GGML_ASSERT(nb10 == sizeof(float));
10858
11483
 
@@ -11573,8 +12198,9 @@ static void ggml_compute_forward_alibi_f32(
11573
12198
  const struct ggml_tensor * src1,
11574
12199
  struct ggml_tensor * dst) {
11575
12200
  assert(params->ith == 0);
11576
- assert(src1->type == GGML_TYPE_I32);
11577
- assert(ggml_nelements(src1) == 3);
12201
+
12202
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
12203
+ GGML_ASSERT(ggml_nelements(src1) == 3);
11578
12204
 
11579
12205
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11580
12206
  return;
@@ -11637,8 +12263,9 @@ static void ggml_compute_forward_alibi_f16(
11637
12263
  const struct ggml_tensor * src1,
11638
12264
  struct ggml_tensor * dst) {
11639
12265
  assert(params->ith == 0);
11640
- assert(src1->type == GGML_TYPE_I32);
11641
- assert(ggml_nelements(src1) == 3);
12266
+
12267
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
12268
+ GGML_ASSERT(ggml_nelements(src1) == 3);
11642
12269
 
11643
12270
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11644
12271
  return;
@@ -11740,15 +12367,16 @@ static void ggml_compute_forward_clamp_f32(
11740
12367
  const struct ggml_tensor * src1,
11741
12368
  struct ggml_tensor * dst) {
11742
12369
  assert(params->ith == 0);
11743
- assert(src1->type == GGML_TYPE_I32);
11744
- assert(ggml_nelements(src1) == 2);
12370
+
12371
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
12372
+ GGML_ASSERT(ggml_nelements(src1) == 2);
11745
12373
 
11746
12374
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11747
12375
  return;
11748
12376
  }
11749
12377
 
11750
- const int min = ((float *) src1->data)[0];
11751
- const int max = ((float *) src1->data)[1];
12378
+ const float min = ((float *) src1->data)[0];
12379
+ const float max = ((float *) src1->data)[1];
11752
12380
 
11753
12381
  const int ith = params->ith;
11754
12382
  const int nth = params->nth;
@@ -11816,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
11816
12444
  const struct ggml_tensor * src1,
11817
12445
  struct ggml_tensor * dst) {
11818
12446
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
11819
- GGML_ASSERT(ggml_nelements(src1) == 3);
12447
+ GGML_ASSERT(ggml_nelements(src1) == 4);
11820
12448
 
11821
12449
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11822
12450
  return;
@@ -11825,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
11825
12453
  const int n_past = ((int32_t *) src1->data)[0];
11826
12454
  const int n_dims = ((int32_t *) src1->data)[1];
11827
12455
  const int mode = ((int32_t *) src1->data)[2];
12456
+ const int n_ctx = ((int32_t *) src1->data)[3];
11828
12457
 
11829
12458
  assert(n_past >= 0);
11830
12459
 
@@ -11869,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
11869
12498
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
11870
12499
 
11871
12500
  const bool is_neox = mode & 2;
12501
+ const bool is_glm = mode & 4;
11872
12502
 
11873
12503
  for (int64_t i3 = 0; i3 < ne3; i3++) {
11874
12504
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -11879,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
11879
12509
 
11880
12510
  float theta = (float)p;
11881
12511
 
11882
- if (!is_neox) {
12512
+ if (is_glm) {
12513
+ theta = MIN(p, n_ctx - 2);
12514
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12515
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12516
+ const float cos_theta = cosf(theta);
12517
+ const float sin_theta = sinf(theta);
12518
+ const float cos_block_theta = cosf(block_theta);
12519
+ const float sin_block_theta = sinf(block_theta);
12520
+
12521
+ theta *= theta_scale;
12522
+ block_theta *= theta_scale;
12523
+
12524
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12525
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12526
+
12527
+ const float x0 = src[0];
12528
+ const float x1 = src[n_dims/2];
12529
+ const float x2 = src[n_dims];
12530
+ const float x3 = src[n_dims/2*3];
12531
+
12532
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
12533
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
12534
+ dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
12535
+ dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
12536
+ }
12537
+ } else if (!is_neox) {
11883
12538
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11884
12539
  const float cos_theta = cosf(theta);
11885
12540
  const float sin_theta = sinf(theta);
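
With mode & 4 set (the new GLM/ChatGLM path), every step rotates two pairs at once: (x[0], x[n_dims/2]) by an angle derived from the position capped at n_ctx - 2, and (x[n_dims], x[3*n_dims/2]) by the residual block position, both scaled down by theta_scale each iteration. The core operation is the usual 2D rotation; a scalar sketch (plain C, no ggml types):

    #include <math.h>

    // rotate the pair (*x0, *x1) by angle theta, as each RoPE step above does
    static void rotate_pair(float * x0, float * x1, float theta) {
        const float c = cosf(theta), s = sinf(theta);
        const float a = *x0, b = *x1;
        *x0 = a*c - b*s;
        *x1 = a*s + b*c;
    }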
@@ -11929,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
11929
12584
  const struct ggml_tensor * src1,
11930
12585
  struct ggml_tensor * dst) {
11931
12586
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
11932
- GGML_ASSERT(ggml_nelements(src1) == 3);
12587
+ GGML_ASSERT(ggml_nelements(src1) == 4);
11933
12588
 
11934
12589
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11935
12590
  return;
@@ -11938,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
11938
12593
  const int n_past = ((int32_t *) src1->data)[0];
11939
12594
  const int n_dims = ((int32_t *) src1->data)[1];
11940
12595
  const int mode = ((int32_t *) src1->data)[2];
12596
+ const int n_ctx = ((int32_t *) src1->data)[3];
11941
12597
 
11942
12598
  assert(n_past >= 0);
11943
12599
 
@@ -11982,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
11982
12638
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
11983
12639
 
11984
12640
  const bool is_neox = mode & 2;
12641
+ const bool is_glm = mode & 4;
11985
12642
 
11986
12643
  for (int64_t i3 = 0; i3 < ne3; i3++) {
11987
12644
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -11992,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
11992
12649
 
11993
12650
  float theta = (float)p;
11994
12651
 
11995
- if (!is_neox) {
12652
+ if (is_glm) {
12653
+ theta = MIN(p, n_ctx - 2);
12654
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12655
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12656
+ const float cos_theta = cosf(theta);
12657
+ const float sin_theta = sinf(theta);
12658
+ const float cos_block_theta = cosf(block_theta);
12659
+ const float sin_block_theta = sinf(block_theta);
12660
+
12661
+ theta *= theta_scale;
12662
+ block_theta *= theta_scale;
12663
+
12664
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12665
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12666
+
12667
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
12668
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
12669
+ const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
12670
+ const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
12671
+
12672
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12673
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12674
+ dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
12675
+ dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
12676
+ }
12677
+ } if (!is_neox) {
11996
12678
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11997
12679
  const float cos_theta = cosf(theta);
11998
12680
  const float sin_theta = sinf(theta);
@@ -12306,9 +12988,9 @@ static void ggml_compute_forward_rope_back(
12306
12988
  }
12307
12989
  }
12308
12990
 
12309
- // ggml_compute_forward_conv_1d_1s
12991
+ // ggml_compute_forward_conv_1d_s1_ph
12310
12992
 
12311
- static void ggml_compute_forward_conv_1d_1s_f16_f32(
12993
+ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
12312
12994
  const struct ggml_compute_params * params,
12313
12995
  const struct ggml_tensor * src0,
12314
12996
  const struct ggml_tensor * src1,
@@ -12428,7 +13110,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
12428
13110
  }
12429
13111
  }
12430
13112
 
12431
- static void ggml_compute_forward_conv_1d_1s_f32(
13113
+ static void ggml_compute_forward_conv_1d_s1_ph_f32(
12432
13114
  const struct ggml_compute_params * params,
12433
13115
  const struct ggml_tensor * src0,
12434
13116
  const struct ggml_tensor * src1,
@@ -12548,7 +13230,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
12548
13230
  }
12549
13231
  }
12550
13232
 
12551
- static void ggml_compute_forward_conv_1d_1s(
13233
+ static void ggml_compute_forward_conv_1d_s1_ph(
12552
13234
  const struct ggml_compute_params * params,
12553
13235
  const struct ggml_tensor * src0,
12554
13236
  const struct ggml_tensor * src1,
@@ -12556,11 +13238,11 @@ static void ggml_compute_forward_conv_1d_1s(
12556
13238
  switch (src0->type) {
12557
13239
  case GGML_TYPE_F16:
12558
13240
  {
12559
- ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst);
13241
+ ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
12560
13242
  } break;
12561
13243
  case GGML_TYPE_F32:
12562
13244
  {
12563
- ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst);
13245
+ ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
12564
13246
  } break;
12565
13247
  default:
12566
13248
  {
@@ -12569,9 +13251,9 @@ static void ggml_compute_forward_conv_1d_1s(
12569
13251
  }
12570
13252
  }
12571
13253
 
12572
- // ggml_compute_forward_conv_1d_2s
13254
+ // ggml_compute_forward_conv_1d_s2_ph
12573
13255
 
12574
- static void ggml_compute_forward_conv_1d_2s_f16_f32(
13256
+ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
12575
13257
  const struct ggml_compute_params * params,
12576
13258
  const struct ggml_tensor * src0,
12577
13259
  const struct ggml_tensor * src1,
@@ -12691,7 +13373,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
12691
13373
  }
12692
13374
  }
12693
13375
 
12694
- static void ggml_compute_forward_conv_1d_2s_f32(
13376
+ static void ggml_compute_forward_conv_1d_s2_ph_f32(
12695
13377
  const struct ggml_compute_params * params,
12696
13378
  const struct ggml_tensor * src0,
12697
13379
  const struct ggml_tensor * src1,
@@ -12811,7 +13493,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
12811
13493
  }
12812
13494
  }
12813
13495
 
12814
- static void ggml_compute_forward_conv_1d_2s(
13496
+ static void ggml_compute_forward_conv_1d_s2_ph(
12815
13497
  const struct ggml_compute_params * params,
12816
13498
  const struct ggml_tensor * src0,
12817
13499
  const struct ggml_tensor * src1,
@@ -12819,11 +13501,11 @@ static void ggml_compute_forward_conv_1d_2s(
12819
13501
  switch (src0->type) {
12820
13502
  case GGML_TYPE_F16:
12821
13503
  {
12822
- ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst);
13504
+ ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
12823
13505
  } break;
12824
13506
  case GGML_TYPE_F32:
12825
13507
  {
12826
- ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst);
13508
+ ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
12827
13509
  } break;
12828
13510
  default:
12829
13511
  {
@@ -12832,18 +13514,154 @@ static void ggml_compute_forward_conv_1d_2s(
12832
13514
  }
12833
13515
  }
12834
13516
 
12835
- // ggml_compute_forward_flash_attn
13517
+ // ggml_compute_forward_conv_2d_sk_p0
12836
13518
 
12837
- static void ggml_compute_forward_flash_attn_f32(
13519
+ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
12838
13520
  const struct ggml_compute_params * params,
12839
- const struct ggml_tensor * q,
12840
- const struct ggml_tensor * k,
12841
- const struct ggml_tensor * v,
12842
- const bool masked,
12843
- struct ggml_tensor * dst) {
12844
- int64_t t0 = ggml_perf_time_us();
12845
- UNUSED(t0);
12846
-
13521
+ const struct ggml_tensor * src0,
13522
+ const struct ggml_tensor * src1,
13523
+ struct ggml_tensor * dst) {
13524
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
13525
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
13526
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
13527
+
13528
+ int64_t t0 = ggml_perf_time_us();
13529
+ UNUSED(t0);
13530
+
13531
+ const int ne00 = src0->ne[0];
13532
+ const int ne01 = src0->ne[1];
13533
+ const int ne02 = src0->ne[2];
13534
+ //const int ne03 = src0->ne[3];
13535
+
13536
+ const int ne10 = src1->ne[0];
13537
+ //const int ne11 = src1->ne[1];
13538
+ const int ne12 = src1->ne[2];
13539
+ //const int ne13 = src1->ne[3];
13540
+
13541
+ const int ne0 = dst->ne[0];
13542
+ const int ne1 = dst->ne[1];
13543
+ const int ne2 = dst->ne[2];
13544
+ //const int ne3 = dst->ne[3];
13545
+ //const int ne = ne0*ne1*ne2*ne3;
13546
+
13547
+ const int nb00 = src0->nb[0];
13548
+ //const int nb01 = src0->nb[1];
13549
+ //const int nb02 = src0->nb[2];
13550
+ const int nb03 = src0->nb[3];
13551
+
13552
+ const int nb10 = src1->nb[0];
13553
+ //const int nb11 = src1->nb[1];
13554
+ const int nb12 = src1->nb[2];
13555
+ //const int nb13 = src1->nb[3];
13556
+
13557
+ //const int nb0 = dst->nb[0];
13558
+ //const int nb1 = dst->nb[1];
13559
+ const int nb2 = dst->nb[2];
13560
+ //const int nb3 = dst->nb[3];
13561
+
13562
+ const int ith = params->ith;
13563
+ const int nth = params->nth;
13564
+
13565
+ const int nk0 = ne00;
13566
+ const int nk1 = ne01;
13567
+
13568
+ // size of the convolution row - the kernel size unrolled across all channels
13569
+ const int ew0 = nk0*nk1*ne02;
13570
+
13571
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13572
+ GGML_ASSERT(nb10 == sizeof(float));
13573
+
13574
+ if (params->type == GGML_TASK_INIT) {
13575
+ // TODO: fix this memset (wsize is overestimated)
13576
+ memset(params->wdata, 0, params->wsize);
13577
+
13578
+ // prepare source data (src1)
13579
+ {
13580
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13581
+
13582
+ for (int i12 = 0; i12 < ne12; i12++) {
13583
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
13584
+ ggml_fp16_t * dst_data = wdata;
13585
+
13586
+ for (int i1 = 0; i1 < ne1; i1++) {
13587
+ for (int i0 = 0; i0 < ne0; i0++) {
13588
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
13589
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
13590
+ dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
13591
+ GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
13592
+ }
13593
+ }
13594
+ }
13595
+ }
13596
+ }
13597
+ }
13598
+
13599
+ return;
13600
+ }
13601
+
13602
+ if (params->type == GGML_TASK_FINALIZE) {
13603
+ return;
13604
+ }
13605
+
13606
+ // total patches in dst
13607
+ const int np = ne2;
13608
+
13609
+ // patches per thread
13610
+ const int dp = (np + nth - 1)/nth;
13611
+
13612
+ // patch range for this thread
13613
+ const int ip0 = dp*ith;
13614
+ const int ip1 = MIN(ip0 + dp, np);
13615
+
13616
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13617
+
13618
+ for (int i2 = ip0; i2 < ip1; i2++) {
13619
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
13620
+
13621
+ for (int i1 = 0; i1 < ne1; ++i1) {
13622
+ for (int i0 = 0; i0 < ne0; ++i0) {
13623
+ ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
13624
+ (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
13625
+ (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
13626
+ }
13627
+ }
13628
+ }
13629
+ }
13630
+
13631
+ static void ggml_compute_forward_conv_2d_sk_p0(
13632
+ const struct ggml_compute_params * params,
13633
+ const struct ggml_tensor * src0,
13634
+ const struct ggml_tensor * src1,
13635
+ struct ggml_tensor * dst) {
13636
+ switch (src0->type) {
13637
+ case GGML_TYPE_F16:
13638
+ {
13639
+ ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
13640
+ } break;
13641
+ case GGML_TYPE_F32:
13642
+ {
13643
+ //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
13644
+ GGML_ASSERT(false);
13645
+ } break;
13646
+ default:
13647
+ {
13648
+ GGML_ASSERT(false);
13649
+ } break;
13650
+ }
13651
+ }
13652
+
13653
+ // ggml_compute_forward_flash_attn
13654
+
13655
+ static void ggml_compute_forward_flash_attn_f32(
13656
+ const struct ggml_compute_params * params,
13657
+ const struct ggml_tensor * q,
13658
+ const struct ggml_tensor * k,
13659
+ const struct ggml_tensor * v,
13660
+ const bool masked,
13661
+ struct ggml_tensor * dst) {
13662
+ int64_t t0 = ggml_perf_time_us();
13663
+ UNUSED(t0);
13664
+
12847
13665
  const int64_t neq0 = q->ne[0];
12848
13666
  const int64_t neq1 = q->ne[1];
12849
13667
  const int64_t neq2 = q->ne[2];
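ggml_compute_forward_conv_2d_sk_p0_f16_f32 is new: a 2D convolution whose stride equals the kernel size with no padding ("sk_p0"), i.e. non-overlapping patches. The INIT pass unrolls every patch of src1 into a row of ew0 = nk0*nk1*ne02 fp16 values in the work buffer, and the compute pass then needs one ggml_vec_dot_f16 per output element; the flash-attention code that follows in this hunk is only displaced, not changed. A rough sketch of the shape arithmetic under that reading, with hypothetical sizes:

    // Shape sketch for a stride == kernel, zero-padding 2D convolution (patch
    // embedding style).  All sizes are hypothetical; only the relation
    // "output = input / kernel" follows from the indexing in the INIT pass.
    #include <stdio.h>

    int main(void) {
        const int W = 1024, H = 1024, C = 3;   // src1: hypothetical input image
        const int KW = 16, KH = 16;            // src0: kernel spatial size
        const int OW = W / KW, OH = H / KH;    // stride == kernel, padding 0
        const int ew0 = KW * KH * C;           // one unrolled patch row in wdata

        printf("output: %d x %d patches, %d fp16 values per unrolled patch row\n",
               OW, OH, ew0);
        return 0;
    }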
@@ -13926,6 +14744,145 @@ static void ggml_compute_forward_flash_attn_back(
13926
14744
  }
13927
14745
  }
13928
14746
 
14747
+ // ggml_compute_forward_win_part
14748
+
14749
+ static void ggml_compute_forward_win_part_f32(
14750
+ const struct ggml_compute_params * params,
14751
+ const struct ggml_tensor * src0,
14752
+ const struct ggml_tensor * opt0,
14753
+ struct ggml_tensor * dst) {
14754
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14755
+ return;
14756
+ }
14757
+
14758
+ const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
14759
+ const int64_t ne01 = src0->ne[1];
14760
+ const int64_t ne02 = src0->ne[2];
14761
+ const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
14762
+
14763
+ const int64_t ne0 = dst->ne[0];
14764
+ const int64_t ne1 = dst->ne[1];
14765
+ const int64_t ne2 = dst->ne[2];
14766
+ const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
14767
+
14768
+ const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14769
+ const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14770
+ const int32_t w = ((const int32_t *)(opt0->data))[2];
14771
+
14772
+ assert(ne00 == ne0);
14773
+ assert(ne3 == nep0*nep1);
14774
+
14775
+ // TODO: optimize / multi-thread
14776
+ for (int py = 0; py < nep1; ++py) {
14777
+ for (int px = 0; px < nep0; ++px) {
14778
+ const int64_t i3 = py*nep0 + px;
14779
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14780
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14781
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14782
+ const int64_t i02 = py*w + i2;
14783
+ const int64_t i01 = px*w + i1;
14784
+ const int64_t i00 = i0;
14785
+
14786
+ const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
14787
+ const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
14788
+
14789
+ if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
14790
+ ((float *) dst->data)[i] = 0.0f;
14791
+ } else {
14792
+ ((float *) dst->data)[i] = ((float *) src0->data)[j];
14793
+ }
14794
+ }
14795
+ }
14796
+ }
14797
+ }
14798
+ }
14799
+ }
14800
+
14801
+ static void ggml_compute_forward_win_part(
14802
+ const struct ggml_compute_params * params,
14803
+ const struct ggml_tensor * src0,
14804
+ const struct ggml_tensor * opt0,
14805
+ struct ggml_tensor * dst) {
14806
+ switch (src0->type) {
14807
+ case GGML_TYPE_F32:
14808
+ {
14809
+ ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
14810
+ } break;
14811
+ default:
14812
+ {
14813
+ GGML_ASSERT(false);
14814
+ } break;
14815
+ }
14816
+ }
14817
+
14818
+ // ggml_compute_forward_win_unpart
14819
+
14820
+ static void ggml_compute_forward_win_unpart_f32(
14821
+ const struct ggml_compute_params * params,
14822
+ const struct ggml_tensor * src0,
14823
+ const struct ggml_tensor * opt0,
14824
+ struct ggml_tensor * dst) {
14825
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14826
+ return;
14827
+ }
14828
+
14829
+ const int64_t ne00 = src0->ne[0];
14830
+ const int64_t ne01 = src0->ne[1];
14831
+ const int64_t ne02 = src0->ne[2];
14832
+ //const int64_t ne03 = src0->ne[3];
14833
+
14834
+ const int64_t ne0 = dst->ne[0];
14835
+ const int64_t ne1 = dst->ne[1];
14836
+ const int64_t ne2 = dst->ne[2];
14837
+
14838
+ const int32_t w = ((const int32_t *)(opt0->data))[0];
14839
+
14840
+ // padding
14841
+ const int px = (w - ne1%w)%w;
14842
+ //const int py = (w - ne2%w)%w;
14843
+
14844
+ const int npx = (px + ne1)/w;
14845
+ //const int npy = (py + ne2)/w;
14846
+
14847
+ assert(ne0 == ne00);
14848
+
14849
+ // TODO: optimize / multi-thread
14850
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14851
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14852
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14853
+ const int ip2 = i2/w;
14854
+ const int ip1 = i1/w;
14855
+
14856
+ const int64_t i02 = i2%w;
14857
+ const int64_t i01 = i1%w;
14858
+ const int64_t i00 = i0;
14859
+
14860
+ const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
14861
+ const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
14862
+
14863
+ ((float *) dst->data)[j] = ((float *) src0->data)[i];
14864
+ }
14865
+ }
14866
+ }
14867
+ }
14868
+
14869
+ static void ggml_compute_forward_win_unpart(
14870
+ const struct ggml_compute_params * params,
14871
+ const struct ggml_tensor * src0,
14872
+ const struct ggml_tensor * opt0,
14873
+ struct ggml_tensor * dst) {
14874
+ switch (src0->type) {
14875
+ case GGML_TYPE_F32:
14876
+ {
14877
+ ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14878
+ } break;
14879
+ default:
14880
+ {
14881
+ GGML_ASSERT(false);
14882
+ } break;
14883
+ }
14884
+ }
14885
+
13929
14886
  // ggml_compute_forward_map_unary
13930
14887
 
13931
14888
  static void ggml_compute_forward_map_unary_f32(
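GGML_OP_WIN_PART / GGML_OP_WIN_UNPART are new: they split the two spatial dimensions of a tensor into w x w windows (zero-padding the ragged edges) and merge them back, the layout used by windowed attention in vision encoders such as SAM's. The forward pass is essentially the index mapping restated below; the sizes are hypothetical and the loop only counts how many destination cells land in the padded border.

    // Index-mapping sketch for win_part: element (i0, i1, i2) of window (px, py)
    // reads position (i0, px*w + i1, py*w + i2) of the source, or 0.0f when that
    // position falls in the padded border.  The sizes below are hypothetical.
    #include <stdio.h>

    int main(void) {
        const int w = 7;                       // window size (opt0[2] in the op)
        const int ne01 = 16, ne02 = 16;        // source spatial dims (channels unchanged)
        const int nep0 = (ne01 + w - 1) / w;   // windows along each padded axis
        const int nep1 = (ne02 + w - 1) / w;

        int padded = 0;
        for (int py = 0; py < nep1; ++py)
            for (int px = 0; px < nep0; ++px)
                for (int i2 = 0; i2 < w; ++i2)
                    for (int i1 = 0; i1 < w; ++i1)
                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01)
                            padded++;          // these destinations are written as 0.0f

        printf("%d x %d windows of %d x %d, %d zero-padded cells per channel\n",
               nep0, nep1, w, w, padded);
        return 0;
    }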
@@ -14019,6 +14976,114 @@ static void ggml_compute_forward_map_binary(
14019
14976
  }
14020
14977
  }
14021
14978
 
14979
+ // ggml_compute_forward_map_custom1
14980
+
14981
+ static void ggml_compute_forward_map_custom1_f32(
14982
+ const struct ggml_compute_params * params,
14983
+ const struct ggml_tensor * a,
14984
+ struct ggml_tensor * dst,
14985
+ const ggml_custom1_op_f32_t fun) {
14986
+ assert(params->ith == 0);
14987
+
14988
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14989
+ return;
14990
+ }
14991
+
14992
+ fun(dst, a);
14993
+ }
14994
+
14995
+
14996
+ static void ggml_compute_forward_map_custom1(
14997
+ const struct ggml_compute_params * params,
14998
+ const struct ggml_tensor * a,
14999
+ struct ggml_tensor * dst,
15000
+ const ggml_custom1_op_f32_t fun) {
15001
+ switch (a->type) {
15002
+ case GGML_TYPE_F32:
15003
+ {
15004
+ ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
15005
+ } break;
15006
+ default:
15007
+ {
15008
+ GGML_ASSERT(false);
15009
+ } break;
15010
+ }
15011
+ }
15012
+
15013
+ // ggml_compute_forward_map_custom2
15014
+
15015
+ static void ggml_compute_forward_map_custom2_f32(
15016
+ const struct ggml_compute_params * params,
15017
+ const struct ggml_tensor * a,
15018
+ const struct ggml_tensor * b,
15019
+ struct ggml_tensor * dst,
15020
+ const ggml_custom2_op_f32_t fun) {
15021
+ assert(params->ith == 0);
15022
+
15023
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15024
+ return;
15025
+ }
15026
+
15027
+ fun(dst, a, b);
15028
+ }
15029
+
15030
+
15031
+ static void ggml_compute_forward_map_custom2(
15032
+ const struct ggml_compute_params * params,
15033
+ const struct ggml_tensor * a,
15034
+ const struct ggml_tensor * b,
15035
+ struct ggml_tensor * dst,
15036
+ const ggml_custom2_op_f32_t fun) {
15037
+ switch (a->type) {
15038
+ case GGML_TYPE_F32:
15039
+ {
15040
+ ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
15041
+ } break;
15042
+ default:
15043
+ {
15044
+ GGML_ASSERT(false);
15045
+ } break;
15046
+ }
15047
+ }
15048
+
15049
+ // ggml_compute_forward_map_custom3
15050
+
15051
+ static void ggml_compute_forward_map_custom3_f32(
15052
+ const struct ggml_compute_params * params,
15053
+ const struct ggml_tensor * a,
15054
+ const struct ggml_tensor * b,
15055
+ const struct ggml_tensor * c,
15056
+ struct ggml_tensor * dst,
15057
+ const ggml_custom3_op_f32_t fun) {
15058
+ assert(params->ith == 0);
15059
+
15060
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15061
+ return;
15062
+ }
15063
+
15064
+ fun(dst, a, b, c);
15065
+ }
15066
+
15067
+
15068
+ static void ggml_compute_forward_map_custom3(
15069
+ const struct ggml_compute_params * params,
15070
+ const struct ggml_tensor * a,
15071
+ const struct ggml_tensor * b,
15072
+ const struct ggml_tensor * c,
15073
+ struct ggml_tensor * dst,
15074
+ const ggml_custom3_op_f32_t fun) {
15075
+ switch (a->type) {
15076
+ case GGML_TYPE_F32:
15077
+ {
15078
+ ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
15079
+ } break;
15080
+ default:
15081
+ {
15082
+ GGML_ASSERT(false);
15083
+ } break;
15084
+ }
15085
+ }
15086
+
14022
15087
  // ggml_compute_forward_cross_entropy_loss
14023
15088
 
14024
15089
  static void ggml_compute_forward_cross_entropy_loss_f32(
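GGML_OP_MAP_CUSTOM1/2/3 extend the existing map_unary / map_binary mechanism: a user-supplied function pointer stored in opt[0] is invoked once, as a single task, with the destination tensor and one, two or three F32 inputs (fun(dst, a), fun(dst, a, b), fun(dst, a, b, c) above). The sketch below only mirrors that call shape with a self-contained stand-in type; toy_tensor and my_abs_op are not ggml symbols, and the public builder that registers such a callback is not shown in this hunk.

    // Hypothetical custom1 callback shape: the op calls fun(dst, a) once on the
    // whole tensors.  Everything here is a stand-in, NOT the real ggml API.
    #include <math.h>
    #include <stdio.h>

    struct toy_tensor { float * data; int n; };   // stand-in, not struct ggml_tensor

    static void my_abs_op(struct toy_tensor * dst, const struct toy_tensor * a) {
        for (int i = 0; i < a->n; ++i) {
            dst->data[i] = fabsf(a->data[i]);     // element-wise, runs as one task
        }
    }

    int main(void) {
        float in[3] = { -1.5f, 2.0f, -0.25f }, out[3];
        struct toy_tensor a = { in, 3 }, dst = { out, 3 };
        my_abs_op(&dst, &a);
        printf("%g %g %g\n", out[0], out[1], out[2]);
        return 0;
    }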
@@ -14309,7 +15374,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14309
15374
  if (skip_cpu) {
14310
15375
  return;
14311
15376
  }
14312
- GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
15377
+ GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
14313
15378
  GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
14314
15379
  #endif // GGML_USE_CUBLAS
14315
15380
 
@@ -14398,6 +15463,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14398
15463
  {
14399
15464
  ggml_compute_forward_gelu(params, tensor->src0, tensor);
14400
15465
  } break;
15466
+ case GGML_OP_GELU_QUICK:
15467
+ {
15468
+ ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
15469
+ } break;
14401
15470
  case GGML_OP_SILU:
14402
15471
  {
14403
15472
  ggml_compute_forward_silu(params, tensor->src0, tensor);
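GGML_OP_GELU_QUICK gets its forward dispatch here (the GGML_GELU_QUICK_FP16 table flag appears near the top of the diff). "Quick" GELU is usually the sigmoid approximation x * sigmoid(1.702 * x); the exact constant ggml uses is not visible in this hunk, so the reference sketch below is an assumption.

    // Reference sketch of the "quick" GELU approximation, assuming the common
    // x * sigmoid(1.702 * x) form; the precise ggml constant is not shown here.
    #include <math.h>
    #include <stdio.h>

    static float gelu_quick_ref(float x) {
        return x / (1.0f + expf(-1.702f * x));
    }

    int main(void) {
        for (float x = -2.0f; x <= 2.01f; x += 1.0f) {
            printf("gelu_quick(% .1f) = % .4f\n", x, gelu_quick_ref(x));
        }
        return 0;
    }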
@@ -14502,19 +15571,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14502
15571
  {
14503
15572
  ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
14504
15573
  } break;
14505
- case GGML_OP_CONV_1D_1S:
15574
+ case GGML_OP_CONV_1D_S1_PH:
15575
+ {
15576
+ ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
15577
+ } break;
15578
+ case GGML_OP_CONV_1D_S2_PH:
14506
15579
  {
14507
- ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
15580
+ ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
14508
15581
  } break;
14509
- case GGML_OP_CONV_1D_2S:
15582
+ case GGML_OP_CONV_2D_SK_P0:
14510
15583
  {
14511
- ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor);
15584
+ ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
14512
15585
  } break;
14513
15586
  case GGML_OP_FLASH_ATTN:
14514
15587
  {
14515
- int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
15588
+ const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
14516
15589
  GGML_ASSERT(t == 0 || t == 1);
14517
- bool masked = t != 0;
15590
+ const bool masked = t != 0;
14518
15591
  ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
14519
15592
  } break;
14520
15593
  case GGML_OP_FLASH_FF:
@@ -14528,6 +15601,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14528
15601
  bool masked = t != 0;
14529
15602
  ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
14530
15603
  } break;
15604
+ case GGML_OP_WIN_PART:
15605
+ {
15606
+ ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
15607
+ } break;
15608
+ case GGML_OP_WIN_UNPART:
15609
+ {
15610
+ ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
15611
+ } break;
14531
15612
  case GGML_OP_MAP_UNARY:
14532
15613
  {
14533
15614
  const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
@@ -14540,6 +15621,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14540
15621
  ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
14541
15622
  }
14542
15623
  break;
15624
+ case GGML_OP_MAP_CUSTOM1:
15625
+ {
15626
+ const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
15627
+ ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
15628
+ }
15629
+ break;
15630
+ case GGML_OP_MAP_CUSTOM2:
15631
+ {
15632
+ const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
15633
+ ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
15634
+ }
15635
+ break;
15636
+ case GGML_OP_MAP_CUSTOM3:
15637
+ {
15638
+ const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
15639
+ ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
15640
+ }
15641
+ break;
14543
15642
  case GGML_OP_CROSS_ENTROPY_LOSS:
14544
15643
  {
14545
15644
  ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -14799,6 +15898,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14799
15898
  {
14800
15899
  GGML_ASSERT(false); // TODO: not implemented
14801
15900
  } break;
15901
+ case GGML_OP_GELU_QUICK:
15902
+ {
15903
+ GGML_ASSERT(false); // TODO: not implemented
15904
+ } break;
14802
15905
  case GGML_OP_ALIBI:
14803
15906
  {
14804
15907
  GGML_ASSERT(false); // TODO: not implemented
@@ -15144,28 +16247,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15144
16247
  {
15145
16248
  if (src0->grad) {
15146
16249
  assert(src1->type == GGML_TYPE_I32);
15147
- assert(ggml_nelements(src1) == 3);
16250
+ assert(ggml_nelements(src1) == 4);
15148
16251
  const int n_past = ((int32_t *) src1->data)[0];
15149
16252
  const int n_dims = ((int32_t *) src1->data)[1];
15150
16253
  const int mode = ((int32_t *) src1->data)[2];
16254
+ const int n_ctx = ((int32_t *) src1->data)[3];
15151
16255
  src0->grad = ggml_add_impl(ctx,
15152
16256
  src0->grad,
15153
16257
  ggml_rope(ctx,
15154
16258
  tensor->grad,
15155
16259
  n_past,
15156
16260
  n_dims,
15157
- mode),
16261
+ mode,
16262
+ n_ctx),
15158
16263
  inplace);
15159
16264
  }
15160
16265
  if (src1->grad) {
15161
16266
  // noop
15162
16267
  }
15163
16268
  } break;
15164
- case GGML_OP_CONV_1D_1S:
16269
+ case GGML_OP_CONV_1D_S1_PH:
15165
16270
  {
15166
16271
  GGML_ASSERT(false); // TODO: not implemented
15167
16272
  } break;
15168
- case GGML_OP_CONV_1D_2S:
16273
+ case GGML_OP_CONV_1D_S2_PH:
16274
+ {
16275
+ GGML_ASSERT(false); // TODO: not implemented
16276
+ } break;
16277
+ case GGML_OP_CONV_2D_SK_P0:
15169
16278
  {
15170
16279
  GGML_ASSERT(false); // TODO: not implemented
15171
16280
  } break;
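ggml_rope now takes a fourth parameter, n_ctx: the backward pass asserts four int32 values in src1 and forwards n_ctx when re-applying the rotation to the gradient. The parameter tensor the op consumes therefore looks like this (the concrete values are placeholders):

    // Sketch of the rope parameter tensor, now four int32 values instead of
    // three, matching the assert above.  The values themselves are placeholders.
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int32_t rope_params[4] = {
            /* n_past */ 32,
            /* n_dims */ 128,
            /* mode   */ 4,     // bit 2 set selects the GLM layout, which uses n_ctx
            /* n_ctx  */ 2048,  // the new fourth value
        };
        printf("ggml_nelements(src1) must now be %d\n",
               (int) (sizeof(rope_params) / sizeof(rope_params[0])));
        return 0;
    }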
@@ -15334,8 +16443,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15334
16443
  {
15335
16444
  GGML_ASSERT(false); // not supported
15336
16445
  } break;
16446
+ case GGML_OP_WIN_PART:
16447
+ case GGML_OP_WIN_UNPART:
15337
16448
  case GGML_OP_MAP_UNARY:
15338
16449
  case GGML_OP_MAP_BINARY:
16450
+ case GGML_OP_MAP_CUSTOM1:
16451
+ case GGML_OP_MAP_CUSTOM2:
16452
+ case GGML_OP_MAP_CUSTOM3:
15339
16453
  {
15340
16454
  GGML_ASSERT(false); // not supported
15341
16455
  } break;
@@ -15407,7 +16521,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15407
16521
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15408
16522
 
15409
16523
  if (strlen(node->name) == 0) {
15410
- snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
16524
+ ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
15411
16525
  }
15412
16526
 
15413
16527
  cgraph->leafs[cgraph->n_leafs] = node;
@@ -15416,7 +16530,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15416
16530
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15417
16531
 
15418
16532
  if (strlen(node->name) == 0) {
15419
- snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
16533
+ ggml_format_name(node, "node_%d", cgraph->n_nodes);
15420
16534
  }
15421
16535
 
15422
16536
  cgraph->nodes[cgraph->n_nodes] = node;
@@ -15570,68 +16684,173 @@ typedef pthread_t ggml_thread_t;
15570
16684
 
15571
16685
  #endif
15572
16686
 
16687
+ // Android's libc implementation "bionic" does not support setting affinity
16688
+ #if defined(__linux__) && !defined(__BIONIC__)
16689
+ void set_numa_thread_affinity(int thread_n, int n_threads) {
16690
+ if (!ggml_is_numa()) {
16691
+ return;
16692
+ }
16693
+
16694
+ // run this thread on NUMA node node_num = thread_n / (threads per node)
16695
+ const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
16696
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
16697
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16698
+
16699
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16700
+ CPU_ZERO_S(setsize, cpus);
16701
+ for (size_t i = 0; i < node->n_cpus; ++i) {
16702
+ CPU_SET_S(node->cpus[i], setsize, cpus);
16703
+ }
16704
+
16705
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16706
+ if (rv) {
16707
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16708
+ strerror(rv));
16709
+ }
16710
+
16711
+ CPU_FREE(cpus);
16712
+ }
16713
+
16714
+ void clear_numa_thread_affinity(void) {
16715
+ if (!ggml_is_numa()) {
16716
+ return;
16717
+ }
16718
+
16719
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16720
+
16721
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16722
+ CPU_ZERO_S(setsize, cpus);
16723
+ for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
16724
+ CPU_SET_S(i, setsize, cpus);
16725
+ }
16726
+
16727
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16728
+ if (rv) {
16729
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16730
+ strerror(rv));
16731
+ }
16732
+
16733
+ CPU_FREE(cpus);
16734
+ }
16735
+ #else
16736
+ // TODO: Windows etc.
16737
+ // (the linux implementation may also work on BSD, someone should test)
16738
+ void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16739
+ void clear_numa_thread_affinity(void) {}
16740
+ #endif
16741
+
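On Linux (excluding Android's bionic libc) the new set_numa_thread_affinity pins each worker thread to the CPUs of one NUMA node when NUMA is detected, and clear_numa_thread_affinity restores a full CPU mask afterwards; on other platforms both are no-op stubs. The node assignment is just the division below (thread and node counts are hypothetical):

    // Node-assignment sketch: node = thread_n / ceil(n_threads / n_nodes),
    // the same formula as in set_numa_thread_affinity above.
    #include <stdio.h>

    int main(void) {
        const int n_threads = 8, n_nodes = 2;              // hypothetical machine
        const int per_node  = (n_threads + n_nodes - 1) / n_nodes;
        for (int t = 0; t < n_threads; ++t) {
            printf("thread %d -> numa node %d\n", t, t / per_node);
        }
        return 0;
    }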
15573
16742
  struct ggml_compute_state_shared {
15574
- ggml_lock_t spin;
16743
+ struct ggml_cgraph * cgraph;
16744
+
16745
+ int64_t perf_node_start_cycles;
16746
+ int64_t perf_node_start_time_us;
15575
16747
 
15576
16748
  int n_threads;
15577
16749
 
15578
16750
  // synchronization primitives
15579
- atomic_int n_ready;
15580
- atomic_bool has_work;
15581
- atomic_bool stop; // stop all threads
16751
+ atomic_int n_active; // num active threads
16752
+ atomic_int node_n; // active graph node
15582
16753
  };
15583
16754
 
15584
16755
  struct ggml_compute_state {
15585
16756
  ggml_thread_t thrd;
15586
-
15587
- struct ggml_compute_params params;
15588
- struct ggml_tensor * node;
15589
-
16757
+ int ith;
15590
16758
  struct ggml_compute_state_shared * shared;
15591
16759
  };
15592
16760
 
16761
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16762
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16763
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
16764
+
16765
+ node->perf_runs++;
16766
+ node->perf_cycles += cycles_cur;
16767
+ node->perf_time_us += time_us_cur;
16768
+ }
16769
+
15593
16770
  static thread_ret_t ggml_graph_compute_thread(void * data) {
15594
16771
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
16772
+ struct ggml_cgraph * cgraph = state->shared->cgraph;
15595
16773
 
15596
16774
  const int n_threads = state->shared->n_threads;
16775
+ set_numa_thread_affinity(state->ith, n_threads);
16776
+
16777
+ int node_n = -1;
15597
16778
 
15598
16779
  while (true) {
15599
- if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
15600
- atomic_store(&state->shared->has_work, false);
15601
- } else {
15602
- while (atomic_load(&state->shared->has_work)) {
15603
- if (atomic_load(&state->shared->stop)) {
15604
- return 0;
15605
- }
15606
- ggml_lock_lock (&state->shared->spin);
15607
- ggml_lock_unlock(&state->shared->spin);
16780
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
16781
+ // all other threads are finished and spinning
16782
+ // do finalize and init here so we don't have synchronize again
16783
+ struct ggml_compute_params params = {
16784
+ /*.type =*/ GGML_TASK_FINALIZE,
16785
+ /*.ith =*/ 0,
16786
+ /*.nth =*/ 0,
16787
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16788
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16789
+ };
16790
+
16791
+ if (node_n != -1) {
16792
+ /* FINALIZE */
16793
+ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16794
+ params.nth = node->n_tasks;
16795
+ ggml_compute_forward(&params, node);
16796
+ ggml_graph_compute_perf_stats_node(node, state->shared);
15608
16797
  }
15609
- }
15610
16798
 
15611
- atomic_fetch_sub(&state->shared->n_ready, 1);
16799
+ // distribute new work or execute it direct if 1T
16800
+ while (++node_n < cgraph->n_nodes) {
16801
+ GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16802
+
16803
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16804
+
16805
+ state->shared->perf_node_start_cycles = ggml_perf_cycles();
16806
+ state->shared->perf_node_start_time_us = ggml_perf_time_us();
16807
+
16808
+ /* INIT */
16809
+ params.type = GGML_TASK_INIT;
16810
+ params.nth = node->n_tasks;
16811
+ ggml_compute_forward(&params, node);
16812
+
16813
+ if (node->n_tasks == 1) {
16814
+ // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16815
+ // they do something more efficient than spinning (?)
16816
+ params.type = GGML_TASK_COMPUTE;
16817
+ ggml_compute_forward(&params, node);
15612
16818
 
15613
- // wait for work
15614
- while (!atomic_load(&state->shared->has_work)) {
15615
- if (atomic_load(&state->shared->stop)) {
15616
- return 0;
16819
+ params.type = GGML_TASK_FINALIZE;
16820
+ ggml_compute_forward(&params, node);
16821
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16822
+ } else {
16823
+ break;
16824
+ }
15617
16825
  }
15618
- ggml_lock_lock (&state->shared->spin);
15619
- ggml_lock_unlock(&state->shared->spin);
16826
+
16827
+ atomic_store(&state->shared->n_active, n_threads);
16828
+ atomic_store(&state->shared->node_n, node_n);
16829
+ } else {
16830
+ // wait for other threads to finish
16831
+ const int last = node_n;
16832
+ do {
16833
+ sched_yield();
16834
+ node_n = atomic_load(&state->shared->node_n);
16835
+ } while (node_n == last);
15620
16836
  }
15621
16837
 
15622
16838
  // check if we should stop
15623
- if (atomic_load(&state->shared->stop)) {
15624
- break;
15625
- }
16839
+ if (node_n >= cgraph->n_nodes) break;
15626
16840
 
15627
- if (state->node) {
15628
- if (state->params.ith < state->params.nth) {
15629
- ggml_compute_forward(&state->params, state->node);
15630
- }
16841
+ /* COMPUTE */
16842
+ struct ggml_tensor * node = cgraph->nodes[node_n];
15631
16843
 
15632
- state->node = NULL;
15633
- } else {
15634
- break;
16844
+ struct ggml_compute_params params = {
16845
+ /*.type =*/ GGML_TASK_COMPUTE,
16846
+ /*.ith =*/ state->ith,
16847
+ /*.nth =*/ node->n_tasks,
16848
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16849
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16850
+ };
16851
+
16852
+ if (state->ith < node->n_tasks) {
16853
+ ggml_compute_forward(&params, node);
15635
16854
  }
15636
16855
  }
15637
16856
 
@@ -15642,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15642
16861
  const int n_threads = cgraph->n_threads;
15643
16862
 
15644
16863
  struct ggml_compute_state_shared state_shared = {
15645
- /*.spin =*/ GGML_LOCK_INITIALIZER,
15646
- /*.n_threads =*/ n_threads,
15647
- /*.n_ready =*/ 0,
15648
- /*.has_work =*/ false,
15649
- /*.stop =*/ false,
16864
+ /*.cgraph =*/ cgraph,
16865
+ /*.perf_node_start_cycles =*/ 0,
16866
+ /*.perf_node_start_time_us =*/ 0,
16867
+ /*.n_threads =*/ n_threads,
16868
+ /*.n_active =*/ n_threads,
16869
+ /*.node_n =*/ -1,
15650
16870
  };
15651
- struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
15652
-
15653
- // create thread pool
15654
- if (n_threads > 1) {
15655
- ggml_lock_init(&state_shared.spin);
15656
-
15657
- atomic_store(&state_shared.has_work, true);
15658
-
15659
- for (int j = 0; j < n_threads - 1; j++) {
15660
- workers[j] = (struct ggml_compute_state) {
15661
- .thrd = 0,
15662
- .params = {
15663
- .type = GGML_TASK_COMPUTE,
15664
- .ith = j + 1,
15665
- .nth = n_threads,
15666
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
15667
- .wdata = cgraph->work ? cgraph->work->data : NULL,
15668
- },
15669
- .node = NULL,
15670
- .shared = &state_shared,
15671
- };
15672
-
15673
- int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
15674
- GGML_ASSERT(rc == 0);
15675
- UNUSED(rc);
15676
- }
15677
- }
16871
+ struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
15678
16872
 
15679
16873
  // initialize tasks + work buffer
15680
16874
  {
@@ -15742,6 +16936,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15742
16936
  } break;
15743
16937
  case GGML_OP_MUL:
15744
16938
  case GGML_OP_GELU:
16939
+ case GGML_OP_GELU_QUICK:
15745
16940
  case GGML_OP_SILU:
15746
16941
  case GGML_OP_SILU_BACK:
15747
16942
  case GGML_OP_NORM:
@@ -15817,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15817
17012
  } break;
15818
17013
  case GGML_OP_SCALE:
15819
17014
  {
15820
- node->n_tasks = n_threads;
17015
+ node->n_tasks = 1;
15821
17016
  } break;
15822
17017
  case GGML_OP_SET:
15823
17018
  case GGML_OP_CONT:
@@ -15848,8 +17043,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15848
17043
  {
15849
17044
  node->n_tasks = 1; //TODO
15850
17045
  } break;
15851
- case GGML_OP_CONV_1D_1S:
15852
- case GGML_OP_CONV_1D_2S:
17046
+ case GGML_OP_CONV_1D_S1_PH:
17047
+ case GGML_OP_CONV_1D_S2_PH:
15853
17048
  {
15854
17049
  node->n_tasks = n_threads;
15855
17050
 
@@ -15876,6 +17071,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15876
17071
  GGML_ASSERT(false);
15877
17072
  }
15878
17073
 
17074
+ work_size = MAX(work_size, cur);
17075
+ } break;
17076
+ case GGML_OP_CONV_2D_SK_P0:
17077
+ {
17078
+ node->n_tasks = n_threads;
17079
+
17080
+ GGML_ASSERT(node->src1->ne[3] == 1);
17081
+
17082
+ const int64_t ne00 = node->src0->ne[0]; // W
17083
+ const int64_t ne01 = node->src0->ne[1]; // H
17084
+ const int64_t ne02 = node->src0->ne[2]; // C
17085
+ const int64_t ne03 = node->src0->ne[3]; // N
17086
+
17087
+ const int64_t ne10 = node->src1->ne[0]; // W
17088
+ const int64_t ne11 = node->src1->ne[1]; // H
17089
+ const int64_t ne12 = node->src1->ne[2]; // C
17090
+
17091
+ const int64_t nk = ne00*ne01;
17092
+
17093
+ UNUSED(ne02);
17094
+ UNUSED(ne03);
17095
+ UNUSED(nk);
17096
+
17097
+ size_t cur = 0;
17098
+
17099
+ if (node->src0->type == GGML_TYPE_F16 &&
17100
+ node->src1->type == GGML_TYPE_F32) {
17101
+ cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
17102
+ } else if (node->src0->type == GGML_TYPE_F32 &&
17103
+ node->src1->type == GGML_TYPE_F32) {
17104
+ cur = sizeof(float)* (ne10*ne11*ne12);
17105
+ } else {
17106
+ GGML_ASSERT(false);
17107
+ }
17108
+
15879
17109
  work_size = MAX(work_size, cur);
15880
17110
  } break;
15881
17111
  case GGML_OP_FLASH_ATTN:
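The planner reserves a work buffer for GGML_OP_CONV_2D_SK_P0 large enough to hold all of src1 re-encoded as fp16 (or f32 for an f32 kernel), i.e. sizeof(ggml_fp16_t) * ne10*ne11*ne12; this is where the INIT pass of the op writes the unrolled patch rows. For a hypothetical 1024 x 1024 x 3 input that comes to 6 MiB:

    // Work-buffer size sketch for conv_2d_sk_p0 with an F16 kernel, using the
    // same formula as above; the 1024 x 1024 x 3 input is just an example.
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne10 = 1024, ne11 = 1024, ne12 = 3;  // hypothetical src1 W, H, C
        const size_t  cur  = sizeof(uint16_t) * (size_t) (ne10 * ne11 * ne12); // fp16 = 2 bytes
        printf("conv_2d_sk_p0 work buffer: %zu bytes (%.1f MiB)\n",
               cur, cur / (1024.0 * 1024.0));
        return 0;
    }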
@@ -15937,8 +17167,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15937
17167
 
15938
17168
  work_size = MAX(work_size, cur);
15939
17169
  } break;
17170
+ case GGML_OP_WIN_PART:
17171
+ case GGML_OP_WIN_UNPART:
15940
17172
  case GGML_OP_MAP_UNARY:
15941
17173
  case GGML_OP_MAP_BINARY:
17174
+ case GGML_OP_MAP_CUSTOM1:
17175
+ case GGML_OP_MAP_CUSTOM2:
17176
+ case GGML_OP_MAP_CUSTOM3:
15942
17177
  {
15943
17178
  node->n_tasks = 1;
15944
17179
  } break;
@@ -15981,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15981
17216
  }
15982
17217
  }
15983
17218
 
15984
- const int64_t perf_start_cycles = ggml_perf_cycles();
15985
- const int64_t perf_start_time_us = ggml_perf_time_us();
15986
-
15987
- for (int i = 0; i < cgraph->n_nodes; i++) {
15988
- GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
15989
-
15990
- struct ggml_tensor * node = cgraph->nodes[i];
15991
-
15992
- // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
15993
- //if (node->grad == NULL && node->perf_runs > 0) {
15994
- // continue;
15995
- //}
15996
-
15997
- const int64_t perf_node_start_cycles = ggml_perf_cycles();
15998
- const int64_t perf_node_start_time_us = ggml_perf_time_us();
15999
-
16000
- // INIT
16001
- struct ggml_compute_params params = {
16002
- /*.type =*/ GGML_TASK_INIT,
16003
- /*.ith =*/ 0,
16004
- /*.nth =*/ node->n_tasks,
16005
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16006
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16007
- };
16008
-
16009
- ggml_compute_forward(&params, node);
16010
-
16011
- // COMPUTE
16012
- if (node->n_tasks > 1) {
16013
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16014
- atomic_store(&state_shared.has_work, false);
16015
- }
16016
-
16017
- while (atomic_load(&state_shared.has_work)) {
16018
- ggml_lock_lock (&state_shared.spin);
16019
- ggml_lock_unlock(&state_shared.spin);
16020
- }
16021
-
16022
- // launch thread pool
16023
- for (int j = 0; j < n_threads - 1; j++) {
16024
- workers[j].params = (struct ggml_compute_params) {
16025
- .type = GGML_TASK_COMPUTE,
16026
- .ith = j + 1,
16027
- .nth = node->n_tasks,
16028
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16029
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16030
- };
16031
- workers[j].node = node;
16032
- }
16033
-
16034
- atomic_fetch_sub(&state_shared.n_ready, 1);
16035
-
16036
- while (atomic_load(&state_shared.n_ready) > 0) {
16037
- ggml_lock_lock (&state_shared.spin);
16038
- ggml_lock_unlock(&state_shared.spin);
16039
- }
16040
-
16041
- atomic_store(&state_shared.has_work, true);
16042
- }
16043
-
16044
- params.type = GGML_TASK_COMPUTE;
16045
- ggml_compute_forward(&params, node);
16046
-
16047
- // wait for thread pool
16048
- if (node->n_tasks > 1) {
16049
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16050
- atomic_store(&state_shared.has_work, false);
16051
- }
16052
-
16053
- while (atomic_load(&state_shared.has_work)) {
16054
- ggml_lock_lock (&state_shared.spin);
16055
- ggml_lock_unlock(&state_shared.spin);
16056
- }
16057
-
16058
- atomic_fetch_sub(&state_shared.n_ready, 1);
16059
-
16060
- while (atomic_load(&state_shared.n_ready) != 0) {
16061
- ggml_lock_lock (&state_shared.spin);
16062
- ggml_lock_unlock(&state_shared.spin);
16063
- }
16064
- }
16065
-
16066
- // FINALIZE
16067
- if (node->n_tasks > 1) {
16068
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16069
- atomic_store(&state_shared.has_work, false);
16070
- }
16071
-
16072
- while (atomic_load(&state_shared.has_work)) {
16073
- ggml_lock_lock (&state_shared.spin);
16074
- ggml_lock_unlock(&state_shared.spin);
16075
- }
16076
-
16077
- // launch thread pool
16078
- for (int j = 0; j < n_threads - 1; j++) {
16079
- workers[j].params = (struct ggml_compute_params) {
16080
- .type = GGML_TASK_FINALIZE,
16081
- .ith = j + 1,
16082
- .nth = node->n_tasks,
16083
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16084
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16085
- };
16086
- workers[j].node = node;
16087
- }
16088
-
16089
- atomic_fetch_sub(&state_shared.n_ready, 1);
16090
-
16091
- while (atomic_load(&state_shared.n_ready) > 0) {
16092
- ggml_lock_lock (&state_shared.spin);
16093
- ggml_lock_unlock(&state_shared.spin);
16094
- }
17219
+ // create thread pool
17220
+ if (n_threads > 1) {
17221
+ for (int j = 1; j < n_threads; ++j) {
17222
+ workers[j] = (struct ggml_compute_state) {
17223
+ .thrd = 0,
17224
+ .ith = j,
17225
+ .shared = &state_shared,
17226
+ };
16095
17227
 
16096
- atomic_store(&state_shared.has_work, true);
17228
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
17229
+ GGML_ASSERT(rc == 0);
16097
17230
  }
17231
+ }
17232
+ workers[0].ith = 0;
17233
+ workers[0].shared = &state_shared;
16098
17234
 
16099
- params.type = GGML_TASK_FINALIZE;
16100
- ggml_compute_forward(&params, node);
16101
-
16102
- // wait for thread pool
16103
- if (node->n_tasks > 1) {
16104
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16105
- atomic_store(&state_shared.has_work, false);
16106
- }
16107
-
16108
- while (atomic_load(&state_shared.has_work)) {
16109
- ggml_lock_lock (&state_shared.spin);
16110
- ggml_lock_unlock(&state_shared.spin);
16111
- }
16112
-
16113
- atomic_fetch_sub(&state_shared.n_ready, 1);
17235
+ const int64_t perf_start_cycles = ggml_perf_cycles();
17236
+ const int64_t perf_start_time_us = ggml_perf_time_us();
16114
17237
 
16115
- while (atomic_load(&state_shared.n_ready) != 0) {
16116
- ggml_lock_lock (&state_shared.spin);
16117
- ggml_lock_unlock(&state_shared.spin);
16118
- }
16119
- }
17238
+ // this is a work thread too
17239
+ ggml_graph_compute_thread(&workers[0]);
16120
17240
 
16121
- // performance stats (node)
16122
- {
16123
- int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
16124
- int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
16125
-
16126
- node->perf_runs++;
16127
- node->perf_cycles += perf_cycles_cur;
16128
- node->perf_time_us += perf_time_us_cur;
16129
- }
16130
- }
17241
+ // don't leave affinity set on the main thread
17242
+ clear_numa_thread_affinity();
16131
17243
 
16132
17244
  // join thread pool
16133
17245
  if (n_threads > 1) {
16134
- atomic_store(&state_shared.stop, true);
16135
- atomic_store(&state_shared.has_work, true);
16136
-
16137
- for (int j = 0; j < n_threads - 1; j++) {
16138
- int rc = ggml_thread_join(workers[j].thrd, NULL);
17246
+ for (int j = 1; j < n_threads; j++) {
17247
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
16139
17248
  GGML_ASSERT(rc == 0);
16140
- UNUSED(rc);
16141
17249
  }
16142
-
16143
- ggml_lock_destroy(&state_shared.spin);
16144
17250
  }
16145
17251
 
16146
17252
  // performance stats (graph)
@@ -16469,16 +17575,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
16469
17575
 
16470
17576
  if (!*ctx_data) {
16471
17577
  fprintf(stderr, "%s: failed to create ggml context\n", __func__);
17578
+ fclose(fin);
16472
17579
  return result;
16473
17580
  }
16474
17581
  }
16475
17582
 
16476
17583
  data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
16477
17584
 
16478
- const size_t ret = fread(data->data, sizeof(char), fsize, fin);
16479
- if (ret != fsize) {
16480
- fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
16481
- return result;
17585
+ {
17586
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
17587
+ if (ret != fsize) {
17588
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
17589
+ fclose(fin);
17590
+ return result;
17591
+ }
16482
17592
  }
16483
17593
 
16484
17594
  fclose(fin);
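ggml_graph_import now closes the input file on every early-return path (failed context creation, short read) instead of leaking the FILE handle, and the fread check is scoped into its own block. The cleanup pattern in isolation; read_blob is a made-up stand-in, not part of ggml:

    // Minimal sketch of the early-return cleanup the hunk adds: every failure
    // path after fopen() must fclose() before returning.  read_blob() is hypothetical.
    #include <stdio.h>

    static int read_blob(const char * fname, char * buf, size_t n) {
        FILE * fin = fopen(fname, "rb");
        if (!fin) {
            return -1;
        }
        const size_t ret = fread(buf, 1, n, fin);
        if (ret != n) {
            fprintf(stderr, "failed to read %s\n", fname);
            fclose(fin);            // the fix: don't leak the handle on error
            return -1;
        }
        fclose(fin);
        return 0;
    }

    int main(void) {
        char buf[16];
        return read_blob("does-not-exist.bin", buf, sizeof(buf)) == 0 ? 0 : 1;
    }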
@@ -16758,6 +17868,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
16758
17868
  return NULL;
16759
17869
  }
16760
17870
 
17871
+ static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17872
+ struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
17873
+ struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
17874
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
17875
+ gparent0 ? (void *) gparent0 : (void *) parent,
17876
+ gparent0 ? "g" : "x",
17877
+ gparent ? (void *) gparent : (void *) node,
17878
+ gparent ? "g" : "x",
17879
+ gparent ? "empty" : "vee",
17880
+ gparent ? "dashed" : "solid",
17881
+ label);
17882
+ }
17883
+
17884
+ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17885
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
17886
+ (void *) parent, "x",
17887
+ (void *) node, "x",
17888
+ label);
17889
+ }
17890
+
16761
17891
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
16762
17892
  char color[16];
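The duplicated edge-printing fprintf blocks are factored into ggml_graph_dump_dot_node_edge and ggml_graph_dump_dot_leaf_edge, and edges are now also emitted for the opt[0..GGML_MAX_OPT) operands with "opt N" labels. One node edge renders to a DOT line of roughly the shape printed below; the pointer values are placeholders, but the format string matches the helper above.

    // Stand-alone sketch of the edge line the new helper emits; the addresses
    // are dummies, the format string mirrors ggml_graph_dump_dot_node_edge.
    #include <stdio.h>

    int main(void) {
        void * parent = (void *) 0x1000;   // placeholder addresses, not real tensors
        void * node   = (void *) 0x2000;
        printf("  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
               parent, "x", node, "x", "vee", "solid", "opt 0");
        return 0;
    }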
16763
17893
 
@@ -16793,7 +17923,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16793
17923
  (void *) node, color);
16794
17924
 
16795
17925
  if (strlen(node->name) > 0) {
16796
- fprintf(fp, "%s |", node->name);
17926
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17927
+ } else {
17928
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
16797
17929
  }
16798
17930
 
16799
17931
  if (node->n_dims == 2) {
@@ -16802,7 +17934,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16802
17934
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
16803
17935
  }
16804
17936
 
16805
-
16806
17937
  if (node->grad) {
16807
17938
  fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
16808
17939
  } else {
@@ -16821,18 +17952,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16821
17952
  (void *) node, color);
16822
17953
 
16823
17954
  if (strlen(node->name) > 0) {
16824
- fprintf(fp, "%s | ", node->name);
17955
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17956
+ } else {
17957
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
16825
17958
  }
16826
- if (ggml_nelements(node) == 1) {
16827
- if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
16828
- fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
16829
- }
16830
- else {
16831
- fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
17959
+
17960
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17961
+ if (ggml_nelements(node) < 5) {
17962
+ fprintf(fp, " | (");
17963
+ for (int j = 0; j < ggml_nelements(node); j++) {
17964
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
17965
+ fprintf(fp, "%d", ggml_get_i32_1d(node, j));
17966
+ }
17967
+ else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
17968
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
17969
+ }
17970
+ else {
17971
+ fprintf(fp, "#");
17972
+ }
17973
+ if (j < ggml_nelements(node) - 1) {
17974
+ fprintf(fp, ", ");
17975
+ }
16832
17976
  }
16833
- }
16834
- else {
16835
- fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17977
+ fprintf(fp, ")");
16836
17978
  }
16837
17979
  fprintf(fp, "\"; ]\n");
16838
17980
  }
@@ -16840,30 +17982,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16840
17982
  for (int i = 0; i < gb->n_nodes; i++) {
16841
17983
  struct ggml_tensor * node = gb->nodes[i];
16842
17984
 
16843
- struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
16844
-
16845
17985
  if (node->src0) {
16846
- struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
16847
-
16848
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
16849
- parent0 ? (void *) parent0 : (void *) node->src0,
16850
- parent0 ? "g" : "x",
16851
- parent ? (void *) parent : (void *) node,
16852
- parent ? "g" : "x",
16853
- parent ? "empty" : "vee",
16854
- parent ? "dashed" : "solid");
17986
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
16855
17987
  }
16856
17988
 
16857
17989
  if (node->src1) {
16858
- struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
16859
-
16860
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
16861
- parent1 ? (void *) parent1 : (void *) node->src1,
16862
- parent1 ? "g" : "x",
16863
- parent ? (void *) parent : (void *) node,
16864
- parent ? "g" : "x",
16865
- parent ? "empty" : "vee",
16866
- parent ? "dashed" : "solid");
17990
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
17991
+ }
17992
+
17993
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
17994
+ if (node->opt[j]) {
17995
+ char label[16];
17996
+ snprintf(label, sizeof(label), "opt %d", j);
17997
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
17998
+ }
16867
17999
  }
16868
18000
  }
16869
18001
 
@@ -16871,15 +18003,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16871
18003
  struct ggml_tensor * node = gb->leafs[i];
16872
18004
 
16873
18005
  if (node->src0) {
16874
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
16875
- (void *) node->src0, "x",
16876
- (void *) node, "x");
18006
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
16877
18007
  }
16878
18008
 
16879
18009
  if (node->src1) {
16880
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
16881
- (void *) node->src1, "x",
16882
- (void *) node, "x");
18010
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
18011
+ }
18012
+
18013
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
18014
+ if (node->opt[j]) {
18015
+ char label[16];
18016
+ snprintf(label, sizeof(label), "opt %d", j);
18017
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
18018
+ }
16883
18019
  }
16884
18020
  }
16885
18021
 
@@ -17598,7 +18734,6 @@ GGML_API void ggml_opt_init(
17598
18734
  ggml_set_zero(opt->lbfgs.g);
17599
18735
  ggml_set_zero(opt->lbfgs.gp);
17600
18736
  ggml_set_zero(opt->lbfgs.d);
17601
- ggml_set_zero(opt->lbfgs.pf);
17602
18737
  if (opt->lbfgs.pf) {
17603
18738
  ggml_set_zero(opt->lbfgs.pf);
17604
18739
  }
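The unconditional ggml_set_zero(opt->lbfgs.pf) is removed: pf can legitimately be NULL (the surviving if (opt->lbfgs.pf) guard exists precisely for that case), so zeroing it unguarded risked a NULL dereference. The shape of the fix, with stand-in names (zero_all is not a ggml function):

    // Sketch of the null-guard pattern the hunk keeps: only touch an optional
    // buffer when it was actually allocated.  zero_all() and buf are stand-ins.
    #include <stddef.h>
    #include <string.h>

    static void zero_all(float * buf, size_t n) {
        if (buf != NULL) {              // the buffer may legitimately be NULL
            memset(buf, 0, n * sizeof(float));
        }
    }

    int main(void) {
        zero_all(NULL, 16);             // safe no-op instead of a NULL dereference
        return 0;
    }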