llama_cpp 0.2.1 → 0.3.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
@@ -1,5 +1,5 @@
- // Defines CLOCK_MONOTONIC on Linux
- #define _GNU_SOURCE
+ #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
  #include "ggml.h"
 
@@ -24,6 +24,7 @@
  #include <stdio.h>
  #include <float.h>
  #include <limits.h>
+ #include <stdarg.h>
 
  #ifdef GGML_USE_METAL
  #include <unistd.h>
@@ -35,6 +36,12 @@
  #define static_assert(cond, msg) struct global_scope_noop_trick
  #endif
 
+ #if defined(_MSC_VER)
+ // disable "possible loss of data" to avoid hundreds of casts
+ // we should just be careful :)
+ #pragma warning(disable: 4244 4267)
+ #endif
+
  #if defined(_WIN32)
 
  #include <windows.h>
@@ -84,6 +91,11 @@ static int sched_yield (void) {
  #include <stdatomic.h>
 
  typedef void* thread_ret_t;
+
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <unistd.h>
+
  #endif
 
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -106,11 +118,36 @@ typedef void* thread_ret_t;
  /*#define GGML_PERF*/
  #define GGML_DEBUG 0
  #define GGML_GELU_FP16
+ #define GGML_GELU_QUICK_FP16
  #define GGML_SILU_FP16
 
  #define GGML_SOFT_MAX_UNROLL 4
  #define GGML_VEC_DOT_UNROLL 2
 
+ //
+ // logging
+ //
+
+ #if (GGML_DEBUG >= 1)
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+ #else
+ #define GGML_PRINT_DEBUG(...)
+ #endif
+
+ #if (GGML_DEBUG >= 5)
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+ #else
+ #define GGML_PRINT_DEBUG_5(...)
+ #endif
+
+ #if (GGML_DEBUG >= 10)
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+ #else
+ #define GGML_PRINT_DEBUG_10(...)
+ #endif
+
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
+
  #ifdef GGML_USE_ACCELERATE
  // uncomment to use vDSP for soft max computation
  // note: not sure if it is actually faster
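The logging macros added in this hunk are gated on GGML_DEBUG at compile time, so debug prints disappear entirely in normal builds. A standalone sketch of the same gating pattern (the MY_* names are placeholders, not ggml API):

    #include <stdio.h>

    #define MY_DEBUG 1  /* set to 0 to compile the debug prints away entirely */

    #if (MY_DEBUG >= 1)
    #define MY_PRINT_DEBUG(...) printf(__VA_ARGS__)
    #else
    #define MY_PRINT_DEBUG(...)
    #endif

    int main(void) {
        MY_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", 1u, 8u); /* expands to printf only when MY_DEBUG >= 1 */
        return 0;
    }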
@@ -136,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
  #endif
  if (result != 0) {
  // Handle allocation failure
+ const char *error_desc = "unknown allocation error";
+ switch (result) {
+ case EINVAL:
+ error_desc = "invalid alignment value";
+ break;
+ case ENOMEM:
+ error_desc = "insufficient memory";
+ break;
+ }
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
+ __func__, error_desc, size/(1024.0*1024.0));
  return NULL;
  }
  return aligned_memory;
@@ -334,6 +410,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
  // precomputed gelu table for f16 (128 KB)
  static ggml_fp16_t table_gelu_f16[1 << 16];
 
+ // precomputed quick gelu table for f16 (128 KB)
+ static ggml_fp16_t table_gelu_quick_f16[1 << 16];
+
  // precomputed silu table for f16 (128 KB)
  static ggml_fp16_t table_silu_f16[1 << 16];
 
@@ -409,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
  }
  }
 
-
  //
  // timing
  //
@@ -472,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
  #define ggml_perf_cycles_per_ms() 0
  #endif
 
+
  //
  // cache line
  //
@@ -1671,14 +1750,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
  #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
  #define GGML_F32x4_REDUCE(res, x) \
  { \
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
- x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
- x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
- x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
  } \
  res = GGML_F32x4_REDUCE_ONE(x[0]); \
  }
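Each GGML_*_REDUCE macro in this group of hunks is rewritten the same way: instead of summing strided pairs (x[2*i] += x[2*i+1], then x[4*i] += x[4*i+2], and so on), the array of partial sums is repeatedly folded in half until x[0] holds the total. A scalar sketch of that halving reduction, as a plain C stand-in for the NEON/AVX/WASM vector adds; it assumes the element count is a power of two, as GGML_F32_ARR/GGML_F16_ARR are:

    #include <stdio.h>

    /* Fold the upper half of the partial sums onto the lower half until one value remains. */
    static float reduce_halving(float *x, int n) {
        for (int offset = n >> 1; offset > 0; offset >>= 1) {
            for (int i = 0; i < offset; ++i) {
                x[i] += x[offset + i];
            }
        }
        return x[0];
    }

    int main(void) {
        float acc[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        printf("%g\n", reduce_halving(acc, 8)); /* prints 36 */
        return 0;
    }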
@@ -1709,14 +1791,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1709
1791
  #define GGML_F16x8_MUL vmulq_f16
1710
1792
  #define GGML_F16x8_REDUCE(res, x) \
1711
1793
  { \
1712
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
1713
- x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
1794
+ int offset = GGML_F16_ARR >> 1; \
1795
+ for (int i = 0; i < offset; ++i) { \
1796
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1714
1797
  } \
1715
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
1716
- x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
1798
+ offset >>= 1; \
1799
+ for (int i = 0; i < offset; ++i) { \
1800
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1717
1801
  } \
1718
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
1719
- x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
1802
+ offset >>= 1; \
1803
+ for (int i = 0; i < offset; ++i) { \
1804
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1720
1805
  } \
1721
1806
  const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
1722
1807
  const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1783,14 +1868,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1783
1868
  #define GGML_F32x8_MUL _mm256_mul_ps
1784
1869
  #define GGML_F32x8_REDUCE(res, x) \
1785
1870
  { \
1786
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1787
- x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
1871
+ int offset = GGML_F32_ARR >> 1; \
1872
+ for (int i = 0; i < offset; ++i) { \
1873
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1788
1874
  } \
1789
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1790
- x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
1875
+ offset >>= 1; \
1876
+ for (int i = 0; i < offset; ++i) { \
1877
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1791
1878
  } \
1792
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1793
- x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
1879
+ offset >>= 1; \
1880
+ for (int i = 0; i < offset; ++i) { \
1881
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1794
1882
  } \
1795
1883
  const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
1796
1884
  _mm256_extractf128_ps(x[0], 1)); \
@@ -1880,14 +1968,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1880
1968
  #define GGML_F32x4_MUL vec_mul
1881
1969
  #define GGML_F32x4_REDUCE(res, x) \
1882
1970
  { \
1883
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1884
- x[2*i] = vec_add(x[2*i], x[2*i+1]); \
1971
+ int offset = GGML_F32_ARR >> 1; \
1972
+ for (int i = 0; i < offset; ++i) { \
1973
+ x[i] = vec_add(x[i], x[offset+i]); \
1885
1974
  } \
1886
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1887
- x[4*i] = vec_add(x[4*i], x[4*i+2]); \
1975
+ offset >>= 1; \
1976
+ for (int i = 0; i < offset; ++i) { \
1977
+ x[i] = vec_add(x[i], x[offset+i]); \
1888
1978
  } \
1889
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1890
- x[8*i] = vec_add(x[8*i], x[8*i+4]); \
1979
+ offset >>= 1; \
1980
+ for (int i = 0; i < offset; ++i) { \
1981
+ x[i] = vec_add(x[i], x[offset+i]); \
1891
1982
  } \
1892
1983
  res = vec_extract(x[0], 0) + \
1893
1984
  vec_extract(x[0], 1) + \
@@ -1943,14 +2034,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1943
2034
  #define GGML_F32x4_MUL wasm_f32x4_mul
1944
2035
  #define GGML_F32x4_REDUCE(res, x) \
1945
2036
  { \
1946
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1947
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
2037
+ int offset = GGML_F32_ARR >> 1; \
2038
+ for (int i = 0; i < offset; ++i) { \
2039
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1948
2040
  } \
1949
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1950
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
2041
+ offset >>= 1; \
2042
+ for (int i = 0; i < offset; ++i) { \
2043
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1951
2044
  } \
1952
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1953
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
2045
+ offset >>= 1; \
2046
+ for (int i = 0; i < offset; ++i) { \
2047
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1954
2048
  } \
1955
2049
  res = wasm_f32x4_extract_lane(x[0], 0) + \
1956
2050
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2005,14 +2099,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
2005
2099
  #define GGML_F16x4_MUL wasm_f32x4_mul
2006
2100
  #define GGML_F16x4_REDUCE(res, x) \
2007
2101
  { \
2008
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
2009
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
2102
+ int offset = GGML_F16_ARR >> 1; \
2103
+ for (int i = 0; i < offset; ++i) { \
2104
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2010
2105
  } \
2011
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
2012
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
2106
+ offset >>= 1; \
2107
+ for (int i = 0; i < offset; ++i) { \
2108
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2013
2109
  } \
2014
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
2015
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
2110
+ offset >>= 1; \
2111
+ for (int i = 0; i < offset; ++i) { \
2112
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2016
2113
  } \
2017
2114
  res = wasm_f32x4_extract_lane(x[0], 0) + \
2018
2115
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2054,14 +2151,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
2054
2151
  #define GGML_F32x4_MUL _mm_mul_ps
2055
2152
  #define GGML_F32x4_REDUCE(res, x) \
2056
2153
  { \
2057
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
2058
- x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
2154
+ int offset = GGML_F32_ARR >> 1; \
2155
+ for (int i = 0; i < offset; ++i) { \
2156
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2059
2157
  } \
2060
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
2061
- x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
2158
+ offset >>= 1; \
2159
+ for (int i = 0; i < offset; ++i) { \
2160
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2062
2161
  } \
2063
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
2064
- x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
2162
+ offset >>= 1; \
2163
+ for (int i = 0; i < offset; ++i) { \
2164
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2065
2165
  } \
2066
2166
  const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
2067
2167
  res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
@@ -3350,6 +3450,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
3350
3450
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
3351
3451
 
3352
3452
  static const float GELU_COEF_A = 0.044715f;
3453
+ static const float GELU_QUICK_COEF = -1.702f;
3353
3454
  static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
3354
3455
 
3355
3456
  inline static float ggml_gelu_f32(float x) {
@@ -3380,6 +3481,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
3380
3481
  }
3381
3482
  #endif
3382
3483
 
3484
+ inline static float ggml_gelu_quick_f32(float x) {
3485
+ return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
3486
+ }
3487
+
3488
+ //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
3489
+ // const uint16_t * i16 = (const uint16_t *) x;
3490
+ // for (int i = 0; i < n; ++i) {
3491
+ // y[i] = table_gelu_quick_f16[i16[i]];
3492
+ // }
3493
+ //}
3494
+
3495
+ #ifdef GGML_GELU_QUICK_FP16
3496
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
3497
+ uint16_t t;
3498
+ for (int i = 0; i < n; ++i) {
3499
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
3500
+ memcpy(&t, &fp16, sizeof(uint16_t));
3501
+ y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
3502
+ }
3503
+ }
3504
+ #else
3505
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
3506
+ for (int i = 0; i < n; ++i) {
3507
+ y[i] = ggml_gelu_quick_f32(x[i]);
3508
+ }
3509
+ }
3510
+ #endif
3511
+
3383
3512
  // Sigmoid Linear Unit (SiLU) function
3384
3513
  inline static float ggml_silu_f32(float x) {
3385
3514
  return x/(1.0f + expf(-x));
@@ -3469,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
3469
3598
  *s = 1.f/(*s);
3470
3599
  }
3471
3600
 
3472
- //
3473
- // logging
3474
- //
3475
-
3476
- #if (GGML_DEBUG >= 1)
3477
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
3478
- #else
3479
- #define GGML_PRINT_DEBUG(...)
3480
- #endif
3481
-
3482
- #if (GGML_DEBUG >= 5)
3483
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
3484
- #else
3485
- #define GGML_PRINT_DEBUG_5(...)
3486
- #endif
3487
-
3488
- #if (GGML_DEBUG >= 10)
3489
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
3490
- #else
3491
- #define GGML_PRINT_DEBUG_10(...)
3492
- #endif
3493
-
3494
- #define GGML_PRINT(...) printf(__VA_ARGS__)
3495
-
3496
3601
  //
3497
3602
  // data types
3498
3603
  //
@@ -3610,6 +3715,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3610
3715
  "STEP",
3611
3716
  "RELU",
3612
3717
  "GELU",
3718
+ "GELU_QUICK",
3613
3719
  "SILU",
3614
3720
  "SILU_BACK",
3615
3721
  "NORM",
@@ -3638,21 +3744,28 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3638
3744
  "ROPE_BACK",
3639
3745
  "ALIBI",
3640
3746
  "CLAMP",
3641
- "CONV_1D_1S",
3642
- "CONV_1D_2S",
3747
+ "CONV_1D_S1_PH",
3748
+ "CONV_1D_S2_PH",
3749
+ "CONV_2D_SK_P0",
3643
3750
 
3644
3751
  "FLASH_ATTN",
3645
3752
  "FLASH_FF",
3646
3753
  "FLASH_ATTN_BACK",
3754
+ "WIN_PART",
3755
+ "WIN_UNPART",
3647
3756
 
3648
3757
  "MAP_UNARY",
3649
3758
  "MAP_BINARY",
3650
3759
 
3760
+ "MAP_CUSTOM1",
3761
+ "MAP_CUSTOM2",
3762
+ "MAP_CUSTOM3",
3763
+
3651
3764
  "CROSS_ENTROPY_LOSS",
3652
3765
  "CROSS_ENTROPY_LOSS_BACK",
3653
3766
  };
3654
3767
 
3655
- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
3768
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3656
3769
 
3657
3770
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3658
3771
  "none",
@@ -3678,6 +3791,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3678
3791
  "step(x)",
3679
3792
  "relu(x)",
3680
3793
  "gelu(x)",
3794
+ "gelu_quick(x)",
3681
3795
  "silu(x)",
3682
3796
  "silu_back(x)",
3683
3797
  "norm(x)",
@@ -3706,21 +3820,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3706
3820
  "rope_back(x)",
3707
3821
  "alibi(x)",
3708
3822
  "clamp(x)",
3709
- "conv_1d_1s(x)",
3710
- "conv_1d_2s(x)",
3823
+ "conv_1d_s1_ph(x)",
3824
+ "conv_1d_s2_ph(x)",
3825
+ "conv_2d_sk_p0(x)",
3711
3826
 
3712
3827
  "flash_attn(x)",
3713
3828
  "flash_ff(x)",
3714
3829
  "flash_attn_back(x)",
3830
+ "win_part(x)",
3831
+ "win_unpart(x)",
3715
3832
 
3716
3833
  "f(x)",
3717
3834
  "f(x,y)",
3718
3835
 
3836
+ "custom(x)",
3837
+ "custom(x,y)",
3838
+ "custom(x,y,z)",
3839
+
3719
3840
  "cross_entropy_loss(x,y)",
3720
3841
  "cross_entropy_loss_back(x,y)",
3721
3842
  };
3722
3843
 
3723
- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
3844
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3724
3845
 
3725
3846
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3726
3847
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3751,12 +3872,31 @@ struct ggml_context_container {
3751
3872
  struct ggml_context context;
3752
3873
  };
3753
3874
 
3875
+ //
3876
+ // NUMA support
3877
+ //
3878
+
3879
+ #define GGML_NUMA_MAX_NODES 8
3880
+ #define GGML_NUMA_MAX_CPUS 512
3881
+
3882
+ struct ggml_numa_node {
3883
+ uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
3884
+ uint32_t n_cpus;
3885
+ };
3886
+
3887
+ struct ggml_numa_nodes {
3888
+ struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
3889
+ uint32_t n_nodes;
3890
+ uint32_t total_cpus; // hardware threads on system
3891
+ };
3892
+
3754
3893
  //
3755
3894
  // ggml state
3756
3895
  //
3757
3896
 
3758
3897
  struct ggml_state {
3759
3898
  struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
3899
+ struct ggml_numa_nodes numa;
3760
3900
  };
3761
3901
 
3762
3902
  // global state
@@ -3781,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
3781
3921
  atomic_fetch_sub(&g_state_barrier, 1);
3782
3922
  }
3783
3923
 
3924
+ void ggml_numa_init(void) {
3925
+ if (g_state.numa.n_nodes > 0) {
3926
+ fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
3927
+
3928
+ return;
3929
+ }
3930
+
3931
+ #ifdef __linux__
3932
+ struct stat st;
3933
+ char path[256];
3934
+ int rv;
3935
+
3936
+ // enumerate nodes
3937
+ while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
3938
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
3939
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3940
+ if (stat(path, &st) != 0) { break; }
3941
+ ++g_state.numa.n_nodes;
3942
+ }
3943
+
3944
+ // enumerate CPUs
3945
+ while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
3946
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
3947
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3948
+ if (stat(path, &st) != 0) { break; }
3949
+ ++g_state.numa.total_cpus;
3950
+ }
3951
+
3952
+ GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
3953
+
3954
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
3955
+ g_state.numa.n_nodes = 0;
3956
+ return;
3957
+ }
3958
+
3959
+ for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
3960
+ struct ggml_numa_node * node = &g_state.numa.nodes[n];
3961
+ GGML_PRINT_DEBUG("CPUs on node %u:", n);
3962
+ node->n_cpus = 0;
3963
+ for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
3964
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
3965
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3966
+ if (stat(path, &st) == 0) {
3967
+ node->cpus[node->n_cpus++] = c;
3968
+ GGML_PRINT_DEBUG(" %u", c);
3969
+ }
3970
+ }
3971
+ GGML_PRINT_DEBUG("\n");
3972
+ }
3973
+
3974
+ if (ggml_is_numa()) {
3975
+ FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
3976
+ if (fptr != NULL) {
3977
+ char buf[42];
3978
+ if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
3979
+ GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
3980
+ }
3981
+ fclose(fptr);
3982
+ }
3983
+ }
3984
+ #else
3985
+ // TODO
3986
+ #endif
3987
+ }
3988
+
3989
+ bool ggml_is_numa(void) {
3990
+ return g_state.numa.n_nodes > 1;
3991
+ }
3992
+
3784
3993
  ////////////////////////////////////////////////////////////////////////////////
3785
3994
 
3786
3995
  void ggml_print_object(const struct ggml_object * obj) {
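A minimal sketch of how the new NUMA helpers might be driven from application code; only ggml_numa_init() and ggml_is_numa() are taken from the hunk above, everything else is illustrative:

    #include <stdio.h>
    #include "ggml.h"   /* assumed to declare ggml_numa_init() and ggml_is_numa() */

    int main(void) {
        /* On Linux this enumerates /sys/devices/system/node/node* and cpu*; elsewhere it is a no-op. */
        ggml_numa_init();

        if (ggml_is_numa()) {
            printf("more than one NUMA node detected\n");
        } else {
            printf("single node, or NUMA detection not available on this platform\n");
        }
        return 0;
    }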
@@ -4011,7 +4220,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4011
4220
  // initialize time system (required on Windows)
4012
4221
  ggml_time_init();
4013
4222
 
4014
- // initialize GELU, SILU and EXP F32 tables
4223
+ // initialize GELU, Quick GELU, SILU and EXP F32 tables
4015
4224
  {
4016
4225
  const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
4017
4226
 
@@ -4021,13 +4230,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4021
4230
  memcpy(&ii, &ui, sizeof(ii));
4022
4231
  const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
4023
4232
  table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
4233
+ table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
4024
4234
  table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
4025
4235
  table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
4026
4236
  }
4027
4237
 
4028
4238
  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
4029
4239
 
4030
- GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4240
+ GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4031
4241
  }
4032
4242
 
4033
4243
  // initialize g_state
@@ -4036,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4036
4246
 
4037
4247
  g_state = (struct ggml_state) {
4038
4248
  /*.contexts =*/ { { 0 } },
4249
+ /*.numa =*/ {
4250
+ .n_nodes = 0,
4251
+ .total_cpus = 0,
4252
+ },
4039
4253
  };
4040
4254
 
4041
4255
  for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -4148,14 +4362,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4148
4362
  ctx->no_alloc = no_alloc;
4149
4363
  }
4150
4364
 
4151
- void * ggml_get_mem_buffer(struct ggml_context * ctx) {
4365
+ void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
4152
4366
  return ctx->mem_buffer;
4153
4367
  }
4154
4368
 
4155
- size_t ggml_get_mem_size(struct ggml_context * ctx) {
4369
+ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
4156
4370
  return ctx->mem_size;
4157
4371
  }
4158
4372
 
4373
+ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4374
+ size_t max_size = 0;
4375
+
4376
+ struct ggml_object * obj = ctx->objects_begin;
4377
+
4378
+ while (obj != NULL) {
4379
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4380
+
4381
+ const size_t size = ggml_nbytes(tensor);
4382
+
4383
+ if (max_size < size) {
4384
+ max_size = size;
4385
+ }
4386
+
4387
+ obj = obj->next;
4388
+ }
4389
+
4390
+ return max_size;
4391
+ }
4392
+
4159
4393
  // IMPORTANT:
4160
4394
  // when creating "opt" tensors, always save and load the scratch buffer
4161
4395
  // this is an error prone process, but it is necessary to support inplace
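The new ggml_get_max_tensor_size() walks the context's object list and reports the byte size of the largest tensor, which is useful for sizing backend scratch buffers. A hedged usage sketch; the tensor shapes and the 16 MB pool are arbitrary example values:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);      /* ~4 KB of data */
        ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);  /* ~256 KB of data */

        /* Byte size of the largest single tensor allocated in this context. */
        printf("max tensor size: %zu bytes\n", ggml_get_max_tensor_size(ctx));

        ggml_free(ctx);
        return 0;
    }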
@@ -4639,15 +4873,25 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
4639
4873
  return tensor->name;
4640
4874
  }
4641
4875
 
4642
- void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4876
+ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4643
4877
  strncpy(tensor->name, name, sizeof(tensor->name));
4644
4878
  tensor->name[sizeof(tensor->name) - 1] = '\0';
4879
+ return tensor;
4880
+ }
4881
+
4882
+ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
4883
+ va_list args;
4884
+ va_start(args, fmt);
4885
+ vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
4886
+ va_end(args);
4887
+ return tensor;
4645
4888
  }
4646
4889
 
4647
4890
  struct ggml_tensor * ggml_view_tensor(
4648
4891
  struct ggml_context * ctx,
4649
4892
  const struct ggml_tensor * src) {
4650
4893
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
4894
+ ggml_format_name(result, "%s (view)", src->name);
4651
4895
 
4652
4896
  result->nb[0] = src->nb[0];
4653
4897
  result->nb[1] = src->nb[1];
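ggml_set_name() now returns the tensor, and the new ggml_format_name() accepts printf-style formatting; this is what the view/reshape/permute hunks below use to label derived tensors. A hedged caller-side sketch, where ctx is assumed to come from ggml_init() and the shapes are arbitrary:

    #include "ggml.h"

    /* Name tensors as they are created; the returned pointer allows chaining. */
    static void name_some_tensors(struct ggml_context * ctx) {
        struct ggml_tensor * embd = ggml_set_name(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64), "embd");
        (void) embd;

        for (int il = 0; il < 4; ++il) {
            struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
            ggml_format_name(w, "layer_%d.weight", il);  /* printf-style name */
        }
    }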
@@ -5420,6 +5664,40 @@ struct ggml_tensor * ggml_gelu_inplace(
5420
5664
  return ggml_gelu_impl(ctx, a, true);
5421
5665
  }
5422
5666
 
5667
+ // ggml_gelu_quick
5668
+
5669
+ struct ggml_tensor * ggml_gelu_quick_impl(
5670
+ struct ggml_context * ctx,
5671
+ struct ggml_tensor * a,
5672
+ bool inplace) {
5673
+ bool is_node = false;
5674
+
5675
+ if (!inplace && (a->grad)) {
5676
+ is_node = true;
5677
+ }
5678
+
5679
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5680
+
5681
+ result->op = GGML_OP_GELU_QUICK;
5682
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5683
+ result->src0 = a;
5684
+ result->src1 = NULL;
5685
+
5686
+ return result;
5687
+ }
5688
+
5689
+ struct ggml_tensor * ggml_gelu_quick(
5690
+ struct ggml_context * ctx,
5691
+ struct ggml_tensor * a) {
5692
+ return ggml_gelu_quick_impl(ctx, a, false);
5693
+ }
5694
+
5695
+ struct ggml_tensor * ggml_gelu_quick_inplace(
5696
+ struct ggml_context * ctx,
5697
+ struct ggml_tensor * a) {
5698
+ return ggml_gelu_quick_impl(ctx, a, true);
5699
+ }
5700
+
5423
5701
  // ggml_silu
5424
5702
 
5425
5703
  struct ggml_tensor * ggml_silu_impl(
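The new GGML_OP_GELU_QUICK op is the sigmoid approximation of GELU; GELU_QUICK_COEF = -1.702f above corresponds to x * sigmoid(1.702 * x). A reference scalar form matching ggml_vec_gelu_quick_f32 (the FP16 table path added earlier simply precomputes this for every half-precision input):

    #include <math.h>

    /* Reference form of the quick-GELU added in this release: x * sigmoid(1.702 * x). */
    static float gelu_quick_ref(float x) {
        return x * (1.0f / (1.0f + expf(-1.702f * x)));
    }

In graph code it is exposed as ggml_gelu_quick(ctx, a) and ggml_gelu_quick_inplace(ctx, a), mirroring the existing ggml_gelu pair.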
@@ -5775,6 +6053,11 @@ struct ggml_tensor * ggml_cpy_impl(
5775
6053
 
5776
6054
  // make a view of the destination
5777
6055
  struct ggml_tensor * result = ggml_view_tensor(ctx, b);
6056
+ if (strlen(b->name) > 0) {
6057
+ ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
6058
+ } else {
6059
+ ggml_format_name(result, "%s (copy)", a->name);
6060
+ }
5778
6061
 
5779
6062
  result->op = GGML_OP_CPY;
5780
6063
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5811,6 +6094,7 @@ struct ggml_tensor * ggml_cont_impl(
5811
6094
  }
5812
6095
 
5813
6096
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6097
+ ggml_format_name(result, "%s (cont)", a->name);
5814
6098
 
5815
6099
  result->op = GGML_OP_CONT;
5816
6100
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5854,6 +6138,7 @@ struct ggml_tensor * ggml_reshape(
5854
6138
  }
5855
6139
 
5856
6140
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6141
+ ggml_format_name(result, "%s (reshaped)", a->name);
5857
6142
 
5858
6143
  result->op = GGML_OP_RESHAPE;
5859
6144
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5878,6 +6163,7 @@ struct ggml_tensor * ggml_reshape_1d(
5878
6163
 
5879
6164
  const int64_t ne[1] = { ne0 };
5880
6165
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6166
+ ggml_format_name(result, "%s (reshaped)", a->name);
5881
6167
 
5882
6168
  result->op = GGML_OP_RESHAPE;
5883
6169
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5903,6 +6189,7 @@ struct ggml_tensor * ggml_reshape_2d(
5903
6189
 
5904
6190
  const int64_t ne[2] = { ne0, ne1 };
5905
6191
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6192
+ ggml_format_name(result, "%s (reshaped)", a->name);
5906
6193
 
5907
6194
  result->op = GGML_OP_RESHAPE;
5908
6195
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5929,6 +6216,7 @@ struct ggml_tensor * ggml_reshape_3d(
5929
6216
 
5930
6217
  const int64_t ne[3] = { ne0, ne1, ne2 };
5931
6218
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6219
+ ggml_format_name(result, "%s (reshaped)", a->name);
5932
6220
 
5933
6221
  result->op = GGML_OP_RESHAPE;
5934
6222
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5957,6 +6245,7 @@ struct ggml_tensor * ggml_reshape_4d(
5957
6245
 
5958
6246
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
5959
6247
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6248
+ ggml_format_name(result, "%s (reshaped)", a->name);
5960
6249
 
5961
6250
  result->op = GGML_OP_RESHAPE;
5962
6251
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5981,10 +6270,12 @@ struct ggml_tensor * ggml_view_1d(
5981
6270
  }
5982
6271
 
5983
6272
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6273
+ ggml_format_name(result, "%s (view)", a->name);
5984
6274
 
5985
6275
  ggml_scratch_save(ctx);
5986
6276
 
5987
6277
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6278
+ ggml_set_name(offs, "offset");
5988
6279
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
5989
6280
 
5990
6281
  ggml_scratch_load(ctx);
@@ -6017,10 +6308,12 @@ struct ggml_tensor * ggml_view_2d(
6017
6308
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6018
6309
 
6019
6310
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6311
+ ggml_format_name(result, "%s (view)", a->name);
6020
6312
 
6021
6313
  ggml_scratch_save(ctx);
6022
6314
 
6023
6315
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6316
+ ggml_set_name(offs, "offset");
6024
6317
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6025
6318
 
6026
6319
  ggml_scratch_load(ctx);
@@ -6059,10 +6352,12 @@ struct ggml_tensor * ggml_view_3d(
6059
6352
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6060
6353
 
6061
6354
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6355
+ ggml_format_name(result, "%s (view)", a->name);
6062
6356
 
6063
6357
  ggml_scratch_save(ctx);
6064
6358
 
6065
6359
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6360
+ ggml_set_name(offs, "offset");
6066
6361
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6067
6362
 
6068
6363
  ggml_scratch_load(ctx);
@@ -6103,10 +6398,12 @@ struct ggml_tensor * ggml_view_4d(
6103
6398
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6104
6399
 
6105
6400
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6401
+ ggml_format_name(result, "%s (view)", a->name);
6106
6402
 
6107
6403
  ggml_scratch_save(ctx);
6108
6404
 
6109
6405
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6406
+ ggml_set_name(offs, "offset");
6110
6407
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6111
6408
 
6112
6409
  ggml_scratch_load(ctx);
@@ -6152,6 +6449,7 @@ struct ggml_tensor * ggml_permute(
6152
6449
  }
6153
6450
 
6154
6451
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6452
+ ggml_format_name(result, "%s (permuted)", a->name);
6155
6453
 
6156
6454
  int ne[GGML_MAX_DIMS];
6157
6455
  int nb[GGML_MAX_DIMS];
@@ -6211,6 +6509,7 @@ struct ggml_tensor * ggml_transpose(
6211
6509
  }
6212
6510
 
6213
6511
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6512
+ ggml_format_name(result, "%s (transposed)", a->name);
6214
6513
 
6215
6514
  result->ne[0] = a->ne[1];
6216
6515
  result->ne[1] = a->ne[0];
@@ -6479,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
6479
6778
  int n_past,
6480
6779
  int n_dims,
6481
6780
  int mode,
6781
+ int n_ctx,
6482
6782
  bool inplace) {
6483
6783
  GGML_ASSERT(n_past >= 0);
6484
6784
  bool is_node = false;
@@ -6491,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
6491
6791
 
6492
6792
  ggml_scratch_save(ctx);
6493
6793
 
6494
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6794
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6495
6795
 
6496
6796
  ((int32_t *) b->data)[0] = n_past;
6497
6797
  ((int32_t *) b->data)[1] = n_dims;
6498
6798
  ((int32_t *) b->data)[2] = mode;
6799
+ ((int32_t *) b->data)[3] = n_ctx;
6499
6800
 
6500
6801
  ggml_scratch_load(ctx);
6501
6802
 
@@ -6512,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
6512
6813
  struct ggml_tensor * a,
6513
6814
  int n_past,
6514
6815
  int n_dims,
6515
- int mode) {
6516
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
6816
+ int mode,
6817
+ int n_ctx) {
6818
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
6517
6819
  }
6518
6820
 
6519
6821
  struct ggml_tensor * ggml_rope_inplace(
@@ -6521,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
6521
6823
  struct ggml_tensor * a,
6522
6824
  int n_past,
6523
6825
  int n_dims,
6524
- int mode) {
6525
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
6826
+ int mode,
6827
+ int n_ctx) {
6828
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
6526
6829
  }
6527
6830
 
6528
6831
  // ggml_rope_back
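ggml_rope() and ggml_rope_inplace() gain an n_ctx parameter, stored as a fourth int32 alongside n_past/n_dims/mode and consumed by the new GLM rope path (mode & 4) in the compute hunks further down. A hedged sketch of an updated call site, where cur, n_past, n_rot and n_ctx are placeholder names for whatever the caller tracks:

    #include "ggml.h"

    static struct ggml_tensor * apply_rope(struct ggml_context * ctx, struct ggml_tensor * cur,
                                           int n_past, int n_rot, int n_ctx) {
        /* 0.2.x:  ggml_rope_inplace(ctx, cur, n_past, n_rot, 0); */
        return ggml_rope_inplace(ctx, cur, n_past, n_rot, /*mode =*/ 0, /*n_ctx =*/ n_ctx);
    }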
@@ -6619,7 +6922,7 @@ struct ggml_tensor * ggml_clamp(
6619
6922
 
6620
6923
  ggml_scratch_save(ctx);
6621
6924
 
6622
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6925
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
6623
6926
 
6624
6927
  ((float *) b->data)[0] = min;
6625
6928
  ((float *) b->data)[1] = max;
@@ -6634,9 +6937,9 @@ struct ggml_tensor * ggml_clamp(
6634
6937
  return result;
6635
6938
  }
6636
6939
 
6637
- // ggml_conv_1d_1s
6940
+ // ggml_conv_1d_s1_ph
6638
6941
 
6639
- struct ggml_tensor * ggml_conv_1d_1s(
6942
+ struct ggml_tensor * ggml_conv_1d_s1_ph(
6640
6943
  struct ggml_context * ctx,
6641
6944
  struct ggml_tensor * a,
6642
6945
  struct ggml_tensor * b) {
@@ -6653,7 +6956,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
6653
6956
  const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
6654
6957
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6655
6958
 
6656
- result->op = GGML_OP_CONV_1D_1S;
6959
+ result->op = GGML_OP_CONV_1D_S1_PH;
6657
6960
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6658
6961
  result->src0 = a;
6659
6962
  result->src1 = b;
@@ -6661,9 +6964,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
6661
6964
  return result;
6662
6965
  }
6663
6966
 
6664
- // ggml_conv_1d_2s
6967
+ // ggml_conv_1d_s2_ph
6665
6968
 
6666
- struct ggml_tensor * ggml_conv_1d_2s(
6969
+ struct ggml_tensor * ggml_conv_1d_s2_ph(
6667
6970
  struct ggml_context * ctx,
6668
6971
  struct ggml_tensor * a,
6669
6972
  struct ggml_tensor * b) {
@@ -6680,7 +6983,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
6680
6983
  const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
6681
6984
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6682
6985
 
6683
- result->op = GGML_OP_CONV_1D_2S;
6986
+ result->op = GGML_OP_CONV_1D_S2_PH;
6987
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6988
+ result->src0 = a;
6989
+ result->src1 = b;
6990
+
6991
+ return result;
6992
+ }
6993
+
6994
+ // ggml_conv_2d_sk_p0
6995
+
6996
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
6997
+ struct ggml_context * ctx,
6998
+ struct ggml_tensor * a,
6999
+ struct ggml_tensor * b) {
7000
+ GGML_ASSERT(b->ne[3] == 1);
7001
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
7002
+ GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
7003
+ GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
7004
+ bool is_node = false;
7005
+
7006
+ if (a->grad || b->grad) {
7007
+ GGML_ASSERT(false); // TODO: implement backward
7008
+ is_node = true;
7009
+ }
7010
+
7011
+ const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
7012
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7013
+
7014
+ result->op = GGML_OP_CONV_2D_SK_P0;
6684
7015
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6685
7016
  result->src0 = a;
6686
7017
  result->src1 = b;
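In ggml_conv_2d_sk_p0() the kernel a carries { KW, KH, C_in, C_out } and the input b carries { W, H, C_in, 1 }; with the stride equal to the kernel size and no padding, the output is { W/KW, H/KH, C_out, 1 }, which is what the ne[4] initializer above encodes. A small worked example with illustrative sizes (the concrete numbers are not from the diff):

    #include <stdio.h>

    int main(void) {
        const long long kernel[4] = { 16, 16, 3, 768 };   /* a: KW, KH, C_in, C_out */
        const long long input [4] = { 1024, 1024, 3, 1 }; /* b: W, H, C_in, batch */

        /* stride = kernel size, no padding */
        printf("output: %lld x %lld x %lld\n",
               input[0] / kernel[0],   /* 64 */
               input[1] / kernel[1],   /* 64 */
               kernel[3]);             /* 768 */
        return 0;
    }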
@@ -6814,45 +7145,133 @@ struct ggml_tensor * ggml_flash_attn_back(
6814
7145
  return result;
6815
7146
  }
6816
7147
 
7148
+ // ggml_win_part
6817
7149
 
6818
- // ggml_map_unary
7150
+ struct ggml_tensor * ggml_win_part(
7151
+ struct ggml_context * ctx,
7152
+ struct ggml_tensor * a,
7153
+ int w) {
7154
+ GGML_ASSERT(a->ne[3] == 1);
7155
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
6819
7156
 
6820
- struct ggml_tensor * ggml_map_unary_impl_f32(
6821
- struct ggml_context * ctx,
6822
- struct ggml_tensor * a,
6823
- const ggml_unary_op_f32_t fun,
6824
- bool inplace) {
6825
7157
  bool is_node = false;
6826
7158
 
6827
- if (!inplace && a->grad) {
7159
+ if (a->grad) {
7160
+ GGML_ASSERT(false); // TODO: implement backward
6828
7161
  is_node = true;
6829
7162
  }
6830
7163
 
6831
- struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
6832
- *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
6833
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7164
+ // padding
7165
+ const int px = (w - a->ne[1]%w)%w;
7166
+ const int py = (w - a->ne[2]%w)%w;
6834
7167
 
6835
- result->op = GGML_OP_MAP_UNARY;
7168
+ const int npx = (px + a->ne[1])/w;
7169
+ const int npy = (py + a->ne[2])/w;
7170
+ const int np = npx*npy;
7171
+
7172
+ const int64_t ne[4] = { a->ne[0], w, w, np, };
7173
+
7174
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7175
+
7176
+ ggml_scratch_save(ctx);
7177
+
7178
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7179
+
7180
+ ((int32_t *) b->data)[0] = npx;
7181
+ ((int32_t *) b->data)[1] = npy;
7182
+ ((int32_t *) b->data)[2] = w;
7183
+
7184
+ ggml_scratch_load(ctx);
7185
+
7186
+ result->op = GGML_OP_WIN_PART;
6836
7187
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6837
7188
  result->src0 = a;
6838
- result->opt[0] = addr_tensor;
7189
+ result->src1 = NULL;
7190
+ result->opt[0] = b;
6839
7191
 
6840
7192
  return result;
6841
7193
  }
6842
7194
 
6843
- struct ggml_tensor * ggml_map_unary_f32(
6844
- struct ggml_context * ctx,
6845
- struct ggml_tensor * a,
6846
- const ggml_unary_op_f32_t fun) {
6847
- return ggml_map_unary_impl_f32(ctx, a, fun, false);
6848
- }
7195
+ // ggml_win_unpart
6849
7196
 
6850
- struct ggml_tensor * ggml_map_unary_inplace_f32(
6851
- struct ggml_context * ctx,
6852
- struct ggml_tensor * a,
6853
- const ggml_unary_op_f32_t fun) {
6854
- return ggml_map_unary_impl_f32(ctx, a, fun, true);
6855
- }
7197
+ struct ggml_tensor * ggml_win_unpart(
7198
+ struct ggml_context * ctx,
7199
+ struct ggml_tensor * a,
7200
+ int w0,
7201
+ int h0,
7202
+ int w) {
7203
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
7204
+
7205
+ bool is_node = false;
7206
+
7207
+ if (a->grad) {
7208
+ GGML_ASSERT(false); // TODO: implement backward
7209
+ is_node = true;
7210
+ }
7211
+
7212
+ const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7213
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7214
+
7215
+ ggml_scratch_save(ctx);
7216
+
7217
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7218
+
7219
+ ((int32_t *) b->data)[0] = w;
7220
+
7221
+ ggml_scratch_load(ctx);
7222
+
7223
+ result->op = GGML_OP_WIN_UNPART;
7224
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7225
+ result->src0 = a;
7226
+ result->src1 = NULL;
7227
+ result->opt[0] = b;
7228
+
7229
+ return result;
7230
+ }
7231
+
7232
+ // ggml_map_unary
7233
+
7234
+ struct ggml_tensor * ggml_map_unary_impl_f32(
7235
+ struct ggml_context * ctx,
7236
+ struct ggml_tensor * a,
7237
+ const ggml_unary_op_f32_t fun,
7238
+ bool inplace) {
7239
+ bool is_node = false;
7240
+
7241
+ if (!inplace && a->grad) {
7242
+ is_node = true;
7243
+ }
7244
+
7245
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7246
+
7247
+ ggml_scratch_save(ctx);
7248
+
7249
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7250
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7251
+
7252
+ ggml_scratch_load(ctx);
7253
+
7254
+ result->op = GGML_OP_MAP_UNARY;
7255
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7256
+ result->src0 = a;
7257
+ result->opt[0] = addr_tensor;
7258
+
7259
+ return result;
7260
+ }
7261
+
7262
+ struct ggml_tensor * ggml_map_unary_f32(
7263
+ struct ggml_context * ctx,
7264
+ struct ggml_tensor * a,
7265
+ const ggml_unary_op_f32_t fun) {
7266
+ return ggml_map_unary_impl_f32(ctx, a, fun, false);
7267
+ }
7268
+
7269
+ struct ggml_tensor * ggml_map_unary_inplace_f32(
7270
+ struct ggml_context * ctx,
7271
+ struct ggml_tensor * a,
7272
+ const ggml_unary_op_f32_t fun) {
7273
+ return ggml_map_unary_impl_f32(ctx, a, fun, true);
7274
+ }
6856
7275
 
6857
7276
  // ggml_map_binary
6858
7277
 
@@ -6870,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
6870
7289
  is_node = true;
6871
7290
  }
6872
7291
 
7292
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7293
+
7294
+ ggml_scratch_save(ctx);
7295
+
6873
7296
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
6874
7297
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
6875
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7298
+
7299
+ ggml_scratch_load(ctx);
6876
7300
 
6877
7301
  result->op = GGML_OP_MAP_BINARY;
6878
7302
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6899,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
6899
7323
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
6900
7324
  }
6901
7325
 
7326
+ // ggml_map_custom1
7327
+
7328
+ struct ggml_tensor * ggml_map_custom1_impl_f32(
7329
+ struct ggml_context * ctx,
7330
+ struct ggml_tensor * a,
7331
+ const ggml_custom1_op_f32_t fun,
7332
+ bool inplace) {
7333
+ bool is_node = false;
7334
+
7335
+ if (!inplace && a->grad) {
7336
+ is_node = true;
7337
+ }
7338
+
7339
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7340
+
7341
+ ggml_scratch_save(ctx);
7342
+
7343
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7344
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7345
+
7346
+ ggml_scratch_load(ctx);
7347
+
7348
+ result->op = GGML_OP_MAP_CUSTOM1;
7349
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7350
+ result->src0 = a;
7351
+ result->opt[0] = addr_tensor;
7352
+
7353
+ return result;
7354
+ }
7355
+
7356
+ struct ggml_tensor * ggml_map_custom1_f32(
7357
+ struct ggml_context * ctx,
7358
+ struct ggml_tensor * a,
7359
+ const ggml_custom1_op_f32_t fun) {
7360
+ return ggml_map_custom1_impl_f32(ctx, a, fun, false);
7361
+ }
7362
+
7363
+ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7364
+ struct ggml_context * ctx,
7365
+ struct ggml_tensor * a,
7366
+ const ggml_custom1_op_f32_t fun) {
7367
+ return ggml_map_custom1_impl_f32(ctx, a, fun, true);
7368
+ }
7369
+
7370
+ // ggml_map_custom2
7371
+
7372
+ struct ggml_tensor * ggml_map_custom2_impl_f32(
7373
+ struct ggml_context * ctx,
7374
+ struct ggml_tensor * a,
7375
+ struct ggml_tensor * b,
7376
+ const ggml_custom2_op_f32_t fun,
7377
+ bool inplace) {
7378
+ bool is_node = false;
7379
+
7380
+ if (!inplace && (a->grad || b->grad)) {
7381
+ is_node = true;
7382
+ }
7383
+
7384
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7385
+
7386
+ ggml_scratch_save(ctx);
7387
+
7388
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7389
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7390
+
7391
+ ggml_scratch_load(ctx);
7392
+
7393
+ result->op = GGML_OP_MAP_CUSTOM2;
7394
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7395
+ result->src0 = a;
7396
+ result->src1 = b;
7397
+ result->opt[0] = addr_tensor;
7398
+
7399
+ return result;
7400
+ }
7401
+
7402
+ struct ggml_tensor * ggml_map_custom2_f32(
7403
+ struct ggml_context * ctx,
7404
+ struct ggml_tensor * a,
7405
+ struct ggml_tensor * b,
7406
+ const ggml_custom2_op_f32_t fun) {
7407
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
7408
+ }
7409
+
7410
+ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7411
+ struct ggml_context * ctx,
7412
+ struct ggml_tensor * a,
7413
+ struct ggml_tensor * b,
7414
+ const ggml_custom2_op_f32_t fun) {
7415
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
7416
+ }
7417
+
7418
+ // ggml_map_custom3
7419
+
7420
+ struct ggml_tensor * ggml_map_custom3_impl_f32(
7421
+ struct ggml_context * ctx,
7422
+ struct ggml_tensor * a,
7423
+ struct ggml_tensor * b,
7424
+ struct ggml_tensor * c,
7425
+ const ggml_custom3_op_f32_t fun,
7426
+ bool inplace) {
7427
+ bool is_node = false;
7428
+
7429
+ if (!inplace && (a->grad || b->grad || c->grad)) {
7430
+ is_node = true;
7431
+ }
7432
+
7433
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7434
+
7435
+ ggml_scratch_save(ctx);
7436
+
7437
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7438
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7439
+
7440
+ ggml_scratch_load(ctx);
7441
+
7442
+ result->op = GGML_OP_MAP_CUSTOM3;
7443
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7444
+ result->src0 = a;
7445
+ result->src1 = b;
7446
+ result->opt[0] = addr_tensor;
7447
+ result->opt[1] = c;
7448
+
7449
+ return result;
7450
+ }
7451
+
7452
+ struct ggml_tensor * ggml_map_custom3_f32(
7453
+ struct ggml_context * ctx,
7454
+ struct ggml_tensor * a,
7455
+ struct ggml_tensor * b,
7456
+ struct ggml_tensor * c,
7457
+ const ggml_custom3_op_f32_t fun) {
7458
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
7459
+ }
7460
+
7461
+ struct ggml_tensor * ggml_map_custom3_inplace_f32(
7462
+ struct ggml_context * ctx,
7463
+ struct ggml_tensor * a,
7464
+ struct ggml_tensor * b,
7465
+ struct ggml_tensor * c,
7466
+ const ggml_custom3_op_f32_t fun) {
7467
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
7468
+ }
7469
+
6902
7470
  // ggml_cross_entropy_loss
6903
7471
 
6904
7472
  struct ggml_tensor * ggml_cross_entropy_loss(
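The new GGML_OP_MAP_CUSTOM1/2/3 ops let callers splice arbitrary f32 kernels into a graph, with the function pointer stashed in an opt[] tensor just as ggml_map_unary/ggml_map_binary already do. A hedged usage sketch for the single-input variant; it assumes the ggml_custom1_op_f32_t callback takes (dst, src) as declared in ggml.h of this release, and the squaring kernel is only a placeholder:

    #include "ggml.h"

    /* Placeholder custom kernel: element-wise square of an F32 tensor. */
    static void my_sqr_op(struct ggml_tensor * dst, const struct ggml_tensor * src) {
        const float * x = (const float *) src->data;
        float       * y = (float *)       dst->data;
        const int64_t n = ggml_nelements(src);
        for (int64_t i = 0; i < n; ++i) {
            y[i] = x[i] * x[i];
        }
    }

    /* Wire the custom op into a graph; ctx and a come from the usual ggml_init()/ggml_new_tensor_*(). */
    static struct ggml_tensor * build_custom(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_map_custom1_f32(ctx, a, my_sqr_op);
    }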
@@ -7892,7 +8460,7 @@ static void ggml_compute_forward_add_q_f32(
7892
8460
 
7893
8461
  void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
7894
8462
  float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
7895
- void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
8463
+ void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
7896
8464
 
7897
8465
  assert(ne00 % 32 == 0);
7898
8466
 
@@ -9453,8 +10021,65 @@ static void ggml_compute_forward_gelu(
9453
10021
  GGML_ASSERT(false);
9454
10022
  } break;
9455
10023
  }
10024
+ }
10025
+
10026
+ // ggml_compute_forward_gelu_quick
10027
+
10028
+ static void ggml_compute_forward_gelu_quick_f32(
10029
+ const struct ggml_compute_params * params,
10030
+ const struct ggml_tensor * src0,
10031
+ struct ggml_tensor * dst) {
10032
+ GGML_ASSERT(ggml_is_contiguous(src0));
10033
+ GGML_ASSERT(ggml_is_contiguous(dst));
10034
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
10035
+
10036
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10037
+ return;
10038
+ }
10039
+
10040
+ const int ith = params->ith;
10041
+ const int nth = params->nth;
10042
+
10043
+ const int nc = src0->ne[0];
10044
+ const int nr = ggml_nrows(src0);
10045
+
10046
+ // rows per thread
10047
+ const int dr = (nr + nth - 1)/nth;
10048
+
10049
+ // row range for this thread
10050
+ const int ir0 = dr*ith;
10051
+ const int ir1 = MIN(ir0 + dr, nr);
10052
+
10053
+ for (int i1 = ir0; i1 < ir1; i1++) {
10054
+ ggml_vec_gelu_quick_f32(nc,
10055
+ (float *) ((char *) dst->data + i1*( dst->nb[1])),
10056
+ (float *) ((char *) src0->data + i1*(src0->nb[1])));
10057
+
10058
+ #ifndef NDEBUG
10059
+ for (int k = 0; k < nc; k++) {
10060
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
10061
+ UNUSED(x);
10062
+ assert(!isnan(x));
10063
+ assert(!isinf(x));
10064
+ }
10065
+ #endif
10066
+ }
10067
+ }
9456
10068
 
9457
- //printf("XXXXXXXX gelu\n");
10069
+ static void ggml_compute_forward_gelu_quick(
10070
+ const struct ggml_compute_params * params,
10071
+ const struct ggml_tensor * src0,
10072
+ struct ggml_tensor * dst) {
10073
+ switch (src0->type) {
10074
+ case GGML_TYPE_F32:
10075
+ {
10076
+ ggml_compute_forward_gelu_quick_f32(params, src0, dst);
10077
+ } break;
10078
+ default:
10079
+ {
10080
+ GGML_ASSERT(false);
10081
+ } break;
10082
+ }
9458
10083
  }
9459
10084
 
9460
10085
  // ggml_compute_forward_silu
@@ -10852,7 +11477,7 @@ static void ggml_compute_forward_set_f32(
10852
11477
  const int im2 = (ne12 == 0 ? 0 : ne12-1);
10853
11478
  const int im3 = (ne13 == 0 ? 0 : ne13-1);
10854
11479
 
10855
- GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst));
11480
+ GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
10856
11481
 
10857
11482
  GGML_ASSERT(nb10 == sizeof(float));
10858
11483
 
@@ -11573,8 +12198,9 @@ static void ggml_compute_forward_alibi_f32(
11573
12198
  const struct ggml_tensor * src1,
11574
12199
  struct ggml_tensor * dst) {
11575
12200
  assert(params->ith == 0);
11576
- assert(src1->type == GGML_TYPE_I32);
11577
- assert(ggml_nelements(src1) == 3);
12201
+
12202
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
12203
+ GGML_ASSERT(ggml_nelements(src1) == 3);
11578
12204
 
11579
12205
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11580
12206
  return;
@@ -11637,8 +12263,9 @@ static void ggml_compute_forward_alibi_f16(
11637
12263
  const struct ggml_tensor * src1,
11638
12264
  struct ggml_tensor * dst) {
11639
12265
  assert(params->ith == 0);
11640
- assert(src1->type == GGML_TYPE_I32);
11641
- assert(ggml_nelements(src1) == 3);
12266
+
12267
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
12268
+ GGML_ASSERT(ggml_nelements(src1) == 3);
11642
12269
 
11643
12270
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11644
12271
  return;
@@ -11740,15 +12367,16 @@ static void ggml_compute_forward_clamp_f32(
11740
12367
  const struct ggml_tensor * src1,
11741
12368
  struct ggml_tensor * dst) {
11742
12369
  assert(params->ith == 0);
11743
- assert(src1->type == GGML_TYPE_I32);
11744
- assert(ggml_nelements(src1) == 2);
12370
+
12371
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
12372
+ GGML_ASSERT(ggml_nelements(src1) == 2);
11745
12373
 
11746
12374
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11747
12375
  return;
11748
12376
  }
11749
12377
 
11750
- const int min = ((float *) src1->data)[0];
11751
- const int max = ((float *) src1->data)[1];
12378
+ const float min = ((float *) src1->data)[0];
12379
+ const float max = ((float *) src1->data)[1];
11752
12380
 
11753
12381
  const int ith = params->ith;
11754
12382
  const int nth = params->nth;
@@ -11816,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
11816
12444
  const struct ggml_tensor * src1,
11817
12445
  struct ggml_tensor * dst) {
11818
12446
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
11819
- GGML_ASSERT(ggml_nelements(src1) == 3);
12447
+ GGML_ASSERT(ggml_nelements(src1) == 4);
11820
12448
 
11821
12449
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11822
12450
  return;
@@ -11825,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
11825
12453
  const int n_past = ((int32_t *) src1->data)[0];
11826
12454
  const int n_dims = ((int32_t *) src1->data)[1];
11827
12455
  const int mode = ((int32_t *) src1->data)[2];
12456
+ const int n_ctx = ((int32_t *) src1->data)[3];
11828
12457
 
11829
12458
  assert(n_past >= 0);
11830
12459
 
@@ -11869,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
11869
12498
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
11870
12499
 
11871
12500
  const bool is_neox = mode & 2;
12501
+ const bool is_glm = mode & 4;
11872
12502
 
11873
12503
  for (int64_t i3 = 0; i3 < ne3; i3++) {
11874
12504
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -11879,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
11879
12509
 
11880
12510
  float theta = (float)p;
11881
12511
 
11882
- if (!is_neox) {
12512
+ if (is_glm) {
12513
+ theta = MIN(p, n_ctx - 2);
12514
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12515
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12516
+ const float cos_theta = cosf(theta);
12517
+ const float sin_theta = sinf(theta);
12518
+ const float cos_block_theta = cosf(block_theta);
12519
+ const float sin_block_theta = sinf(block_theta);
12520
+
12521
+ theta *= theta_scale;
12522
+ block_theta *= theta_scale;
12523
+
12524
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12525
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12526
+
12527
+ const float x0 = src[0];
12528
+ const float x1 = src[n_dims/2];
12529
+ const float x2 = src[n_dims];
12530
+ const float x3 = src[n_dims/2*3];
12531
+
12532
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
12533
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
12534
+ dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
12535
+ dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
12536
+ }
12537
+ } else if (!is_neox) {
11883
12538
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11884
12539
  const float cos_theta = cosf(theta);
11885
12540
  const float sin_theta = sinf(theta);
@@ -11929,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
11929
12584
  const struct ggml_tensor * src1,
11930
12585
  struct ggml_tensor * dst) {
11931
12586
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
11932
- GGML_ASSERT(ggml_nelements(src1) == 3);
12587
+ GGML_ASSERT(ggml_nelements(src1) == 4);
11933
12588
 
11934
12589
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11935
12590
  return;
@@ -11938,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
11938
12593
  const int n_past = ((int32_t *) src1->data)[0];
11939
12594
  const int n_dims = ((int32_t *) src1->data)[1];
11940
12595
  const int mode = ((int32_t *) src1->data)[2];
12596
+ const int n_ctx = ((int32_t *) src1->data)[3];
11941
12597
 
11942
12598
  assert(n_past >= 0);
11943
12599
 
@@ -11982,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
11982
12638
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
11983
12639
 
11984
12640
  const bool is_neox = mode & 2;
12641
+ const bool is_glm = mode & 4;
11985
12642
 
11986
12643
  for (int64_t i3 = 0; i3 < ne3; i3++) {
11987
12644
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -11992,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
11992
12649
 
11993
12650
  float theta = (float)p;
11994
12651
 
11995
- if (!is_neox) {
12652
+ if (is_glm) {
12653
+ theta = MIN(p, n_ctx - 2);
12654
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12655
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12656
+ const float cos_theta = cosf(theta);
12657
+ const float sin_theta = sinf(theta);
12658
+ const float cos_block_theta = cosf(block_theta);
12659
+ const float sin_block_theta = sinf(block_theta);
12660
+
12661
+ theta *= theta_scale;
12662
+ block_theta *= theta_scale;
12663
+
12664
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12665
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12666
+
12667
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
12668
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
12669
+ const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
12670
+ const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
12671
+
12672
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12673
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12674
+ dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
12675
+ dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
12676
+ }
12677
+ } if (!is_neox) {
11996
12678
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11997
12679
  const float cos_theta = cosf(theta);
11998
12680
  const float sin_theta = sinf(theta);
@@ -12306,9 +12988,9 @@ static void ggml_compute_forward_rope_back(
12306
12988
  }
12307
12989
  }
12308
12990
 
12309
- // ggml_compute_forward_conv_1d_1s
12991
+ // ggml_compute_forward_conv_1d_s1_ph
12310
12992
 
12311
- static void ggml_compute_forward_conv_1d_1s_f16_f32(
12993
+ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
12312
12994
  const struct ggml_compute_params * params,
12313
12995
  const struct ggml_tensor * src0,
12314
12996
  const struct ggml_tensor * src1,
@@ -12428,7 +13110,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
12428
13110
  }
12429
13111
  }
12430
13112
 
12431
- static void ggml_compute_forward_conv_1d_1s_f32(
13113
+ static void ggml_compute_forward_conv_1d_s1_ph_f32(
12432
13114
  const struct ggml_compute_params * params,
12433
13115
  const struct ggml_tensor * src0,
12434
13116
  const struct ggml_tensor * src1,
@@ -12548,7 +13230,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
12548
13230
  }
12549
13231
  }
12550
13232
 
12551
- static void ggml_compute_forward_conv_1d_1s(
13233
+ static void ggml_compute_forward_conv_1d_s1_ph(
12552
13234
  const struct ggml_compute_params * params,
12553
13235
  const struct ggml_tensor * src0,
12554
13236
  const struct ggml_tensor * src1,
@@ -12556,11 +13238,11 @@ static void ggml_compute_forward_conv_1d_1s(
12556
13238
  switch (src0->type) {
12557
13239
  case GGML_TYPE_F16:
12558
13240
  {
12559
- ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst);
13241
+ ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
12560
13242
  } break;
12561
13243
  case GGML_TYPE_F32:
12562
13244
  {
12563
- ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst);
13245
+ ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
12564
13246
  } break;
12565
13247
  default:
12566
13248
  {
@@ -12569,9 +13251,9 @@ static void ggml_compute_forward_conv_1d_1s(
12569
13251
  }
12570
13252
  }
12571
13253
 
12572
- // ggml_compute_forward_conv_1d_2s
13254
+ // ggml_compute_forward_conv_1d_s2_ph
12573
13255
 
12574
- static void ggml_compute_forward_conv_1d_2s_f16_f32(
13256
+ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
12575
13257
  const struct ggml_compute_params * params,
12576
13258
  const struct ggml_tensor * src0,
12577
13259
  const struct ggml_tensor * src1,
@@ -12691,7 +13373,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
12691
13373
  }
12692
13374
  }
12693
13375
 
12694
- static void ggml_compute_forward_conv_1d_2s_f32(
13376
+ static void ggml_compute_forward_conv_1d_s2_ph_f32(
12695
13377
  const struct ggml_compute_params * params,
12696
13378
  const struct ggml_tensor * src0,
12697
13379
  const struct ggml_tensor * src1,
@@ -12811,7 +13493,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
12811
13493
  }
12812
13494
  }
12813
13495
 
12814
- static void ggml_compute_forward_conv_1d_2s(
13496
+ static void ggml_compute_forward_conv_1d_s2_ph(
12815
13497
  const struct ggml_compute_params * params,
12816
13498
  const struct ggml_tensor * src0,
12817
13499
  const struct ggml_tensor * src1,
@@ -12819,11 +13501,11 @@ static void ggml_compute_forward_conv_1d_2s(
12819
13501
  switch (src0->type) {
12820
13502
  case GGML_TYPE_F16:
12821
13503
  {
12822
- ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst);
13504
+ ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
12823
13505
  } break;
12824
13506
  case GGML_TYPE_F32:
12825
13507
  {
12826
- ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst);
13508
+ ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
12827
13509
  } break;
12828
13510
  default:
12829
13511
  {
@@ -12832,18 +13514,154 @@ static void ggml_compute_forward_conv_1d_2s(
12832
13514
  }
12833
13515
  }
12834
13516
 
12835
- // ggml_compute_forward_flash_attn
13517
+ // ggml_compute_forward_conv_2d_sk_p0
12836
13518
 
12837
- static void ggml_compute_forward_flash_attn_f32(
13519
+ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
12838
13520
  const struct ggml_compute_params * params,
12839
- const struct ggml_tensor * q,
12840
- const struct ggml_tensor * k,
12841
- const struct ggml_tensor * v,
12842
- const bool masked,
12843
- struct ggml_tensor * dst) {
12844
- int64_t t0 = ggml_perf_time_us();
12845
- UNUSED(t0);
12846
-
13521
+ const struct ggml_tensor * src0,
13522
+ const struct ggml_tensor * src1,
13523
+ struct ggml_tensor * dst) {
13524
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
13525
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
13526
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
13527
+
13528
+ int64_t t0 = ggml_perf_time_us();
13529
+ UNUSED(t0);
13530
+
13531
+ const int ne00 = src0->ne[0];
13532
+ const int ne01 = src0->ne[1];
13533
+ const int ne02 = src0->ne[2];
13534
+ //const int ne03 = src0->ne[3];
13535
+
13536
+ const int ne10 = src1->ne[0];
13537
+ //const int ne11 = src1->ne[1];
13538
+ const int ne12 = src1->ne[2];
13539
+ //const int ne13 = src1->ne[3];
13540
+
13541
+ const int ne0 = dst->ne[0];
13542
+ const int ne1 = dst->ne[1];
13543
+ const int ne2 = dst->ne[2];
13544
+ //const int ne3 = dst->ne[3];
13545
+ //const int ne = ne0*ne1*ne2*ne3;
13546
+
13547
+ const int nb00 = src0->nb[0];
13548
+ //const int nb01 = src0->nb[1];
13549
+ //const int nb02 = src0->nb[2];
13550
+ const int nb03 = src0->nb[3];
13551
+
13552
+ const int nb10 = src1->nb[0];
13553
+ //const int nb11 = src1->nb[1];
13554
+ const int nb12 = src1->nb[2];
13555
+ //const int nb13 = src1->nb[3];
13556
+
13557
+ //const int nb0 = dst->nb[0];
13558
+ //const int nb1 = dst->nb[1];
13559
+ const int nb2 = dst->nb[2];
13560
+ //const int nb3 = dst->nb[3];
13561
+
13562
+ const int ith = params->ith;
13563
+ const int nth = params->nth;
13564
+
13565
+ const int nk0 = ne00;
13566
+ const int nk1 = ne01;
13567
+
13568
+ // size of the convolution row - the kernel size unrolled across all channels
13569
+ const int ew0 = nk0*nk1*ne02;
13570
+
13571
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13572
+ GGML_ASSERT(nb10 == sizeof(float));
13573
+
13574
+ if (params->type == GGML_TASK_INIT) {
13575
+ // TODO: fix this memset (wsize is overestimated)
13576
+ memset(params->wdata, 0, params->wsize);
13577
+
13578
+ // prepare source data (src1)
13579
+ {
13580
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13581
+
13582
+ for (int i12 = 0; i12 < ne12; i12++) {
13583
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
13584
+ ggml_fp16_t * dst_data = wdata;
13585
+
13586
+ for (int i1 = 0; i1 < ne1; i1++) {
13587
+ for (int i0 = 0; i0 < ne0; i0++) {
13588
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
13589
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
13590
+ dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
13591
+ GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
13592
+ }
13593
+ }
13594
+ }
13595
+ }
13596
+ }
13597
+ }
13598
+
13599
+ return;
13600
+ }
13601
+
13602
+ if (params->type == GGML_TASK_FINALIZE) {
13603
+ return;
13604
+ }
13605
+
13606
+ // total patches in dst
13607
+ const int np = ne2;
13608
+
13609
+ // patches per thread
13610
+ const int dp = (np + nth - 1)/nth;
13611
+
13612
+ // patch range for this thread
13613
+ const int ip0 = dp*ith;
13614
+ const int ip1 = MIN(ip0 + dp, np);
13615
+
13616
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13617
+
13618
+ for (int i2 = ip0; i2 < ip1; i2++) {
13619
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
13620
+
13621
+ for (int i1 = 0; i1 < ne1; ++i1) {
13622
+ for (int i0 = 0; i0 < ne0; ++i0) {
13623
+ ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
13624
+ (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
13625
+ (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
13626
+ }
13627
+ }
13628
+ }
13629
+ }
13630
+
13631
+ static void ggml_compute_forward_conv_2d_sk_p0(
13632
+ const struct ggml_compute_params * params,
13633
+ const struct ggml_tensor * src0,
13634
+ const struct ggml_tensor * src1,
13635
+ struct ggml_tensor * dst) {
13636
+ switch (src0->type) {
13637
+ case GGML_TYPE_F16:
13638
+ {
13639
+ ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
13640
+ } break;
13641
+ case GGML_TYPE_F32:
13642
+ {
13643
+ //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
13644
+ GGML_ASSERT(false);
13645
+ } break;
13646
+ default:
13647
+ {
13648
+ GGML_ASSERT(false);
13649
+ } break;
13650
+ }
13651
+ }
13652
+
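ggml_compute_forward_conv_2d_sk_p0 above is a 2D convolution with the stride equal to the kernel size and no padding (hence "sk", "p0"), i.e. a non-overlapping patch projection. During GGML_TASK_INIT it builds an im2row buffer in wdata where every output pixel owns one row of ew0 = nk0*nk1*ne02 fp16 values, so the compute phase reduces to one ggml_vec_dot_f16 per output element per output channel. A sketch of the resulting geometry, assuming src0 holds the kernels as [KW, KH, C_in, C_out] and src1 the image as [W, H, C_in]; the dimension names are inferred from the indexing above, not spelled out in the code:

    // Output geometry of a stride == kernel-size, zero-padding convolution (sketch).
    // With KW == KH == P this is exactly a ViT-style P x P patch embedding.
    static void conv_2d_sk_p0_shape(int W, int H, int C_out, int KW, int KH,
                                    int * OW, int * OH, int * OC) {
        *OW = W / KW;  // each step consumes a full kernel width
        *OH = H / KH;  // and a full kernel height
        *OC = C_out;   // one output channel per kernel
    }

Each of the OW*OH output pixels is then a dot product of length KW*KH*C_in against the corresponding im2row line prepared in the init phase.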
13653
+ // ggml_compute_forward_flash_attn
13654
+
13655
+ static void ggml_compute_forward_flash_attn_f32(
13656
+ const struct ggml_compute_params * params,
13657
+ const struct ggml_tensor * q,
13658
+ const struct ggml_tensor * k,
13659
+ const struct ggml_tensor * v,
13660
+ const bool masked,
13661
+ struct ggml_tensor * dst) {
13662
+ int64_t t0 = ggml_perf_time_us();
13663
+ UNUSED(t0);
13664
+
12847
13665
  const int64_t neq0 = q->ne[0];
12848
13666
  const int64_t neq1 = q->ne[1];
12849
13667
  const int64_t neq2 = q->ne[2];
@@ -13926,6 +14744,145 @@ static void ggml_compute_forward_flash_attn_back(
13926
14744
  }
13927
14745
  }
13928
14746
 
14747
+ // ggml_compute_forward_win_part
14748
+
14749
+ static void ggml_compute_forward_win_part_f32(
14750
+ const struct ggml_compute_params * params,
14751
+ const struct ggml_tensor * src0,
14752
+ const struct ggml_tensor * opt0,
14753
+ struct ggml_tensor * dst) {
14754
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14755
+ return;
14756
+ }
14757
+
14758
+ const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
14759
+ const int64_t ne01 = src0->ne[1];
14760
+ const int64_t ne02 = src0->ne[2];
14761
+ const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
14762
+
14763
+ const int64_t ne0 = dst->ne[0];
14764
+ const int64_t ne1 = dst->ne[1];
14765
+ const int64_t ne2 = dst->ne[2];
14766
+ const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
14767
+
14768
+ const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14769
+ const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14770
+ const int32_t w = ((const int32_t *)(opt0->data))[2];
14771
+
14772
+ assert(ne00 == ne0);
14773
+ assert(ne3 == nep0*nep1);
14774
+
14775
+ // TODO: optimize / multi-thread
14776
+ for (int py = 0; py < nep1; ++py) {
14777
+ for (int px = 0; px < nep0; ++px) {
14778
+ const int64_t i3 = py*nep0 + px;
14779
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14780
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14781
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14782
+ const int64_t i02 = py*w + i2;
14783
+ const int64_t i01 = px*w + i1;
14784
+ const int64_t i00 = i0;
14785
+
14786
+ const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
14787
+ const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
14788
+
14789
+ if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
14790
+ ((float *) dst->data)[i] = 0.0f;
14791
+ } else {
14792
+ ((float *) dst->data)[i] = ((float *) src0->data)[j];
14793
+ }
14794
+ }
14795
+ }
14796
+ }
14797
+ }
14798
+ }
14799
+ }
14800
+
14801
+ static void ggml_compute_forward_win_part(
14802
+ const struct ggml_compute_params * params,
14803
+ const struct ggml_tensor * src0,
14804
+ const struct ggml_tensor * opt0,
14805
+ struct ggml_tensor * dst) {
14806
+ switch (src0->type) {
14807
+ case GGML_TYPE_F32:
14808
+ {
14809
+ ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
14810
+ } break;
14811
+ default:
14812
+ {
14813
+ GGML_ASSERT(false);
14814
+ } break;
14815
+ }
14816
+ }
14817
+
14818
+ // ggml_compute_forward_win_unpart
14819
+
14820
+ static void ggml_compute_forward_win_unpart_f32(
14821
+ const struct ggml_compute_params * params,
14822
+ const struct ggml_tensor * src0,
14823
+ const struct ggml_tensor * opt0,
14824
+ struct ggml_tensor * dst) {
14825
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14826
+ return;
14827
+ }
14828
+
14829
+ const int64_t ne00 = src0->ne[0];
14830
+ const int64_t ne01 = src0->ne[1];
14831
+ const int64_t ne02 = src0->ne[2];
14832
+ //const int64_t ne03 = src0->ne[3];
14833
+
14834
+ const int64_t ne0 = dst->ne[0];
14835
+ const int64_t ne1 = dst->ne[1];
14836
+ const int64_t ne2 = dst->ne[2];
14837
+
14838
+ const int32_t w = ((const int32_t *)(opt0->data))[0];
14839
+
14840
+ // padding
14841
+ const int px = (w - ne1%w)%w;
14842
+ //const int py = (w - ne2%w)%w;
14843
+
14844
+ const int npx = (px + ne1)/w;
14845
+ //const int npy = (py + ne2)/w;
14846
+
14847
+ assert(ne0 == ne00);
14848
+
14849
+ // TODO: optimize / multi-thread
14850
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14851
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14852
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14853
+ const int ip2 = i2/w;
14854
+ const int ip1 = i1/w;
14855
+
14856
+ const int64_t i02 = i2%w;
14857
+ const int64_t i01 = i1%w;
14858
+ const int64_t i00 = i0;
14859
+
14860
+ const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
14861
+ const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
14862
+
14863
+ ((float *) dst->data)[j] = ((float *) src0->data)[i];
14864
+ }
14865
+ }
14866
+ }
14867
+ }
14868
+
14869
+ static void ggml_compute_forward_win_unpart(
14870
+ const struct ggml_compute_params * params,
14871
+ const struct ggml_tensor * src0,
14872
+ const struct ggml_tensor * opt0,
14873
+ struct ggml_tensor * dst) {
14874
+ switch (src0->type) {
14875
+ case GGML_TYPE_F32:
14876
+ {
14877
+ ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14878
+ } break;
14879
+ default:
14880
+ {
14881
+ GGML_ASSERT(false);
14882
+ } break;
14883
+ }
14884
+ }
14885
+
13929
14886
  // ggml_compute_forward_map_unary
13930
14887
 
13931
14888
  static void ggml_compute_forward_map_unary_f32(
@@ -14019,6 +14976,114 @@ static void ggml_compute_forward_map_binary(
14019
14976
  }
14020
14977
  }
14021
14978
 
14979
+ // ggml_compute_forward_map_custom1
14980
+
14981
+ static void ggml_compute_forward_map_custom1_f32(
14982
+ const struct ggml_compute_params * params,
14983
+ const struct ggml_tensor * a,
14984
+ struct ggml_tensor * dst,
14985
+ const ggml_custom1_op_f32_t fun) {
14986
+ assert(params->ith == 0);
14987
+
14988
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14989
+ return;
14990
+ }
14991
+
14992
+ fun(dst, a);
14993
+ }
14994
+
14995
+
14996
+ static void ggml_compute_forward_map_custom1(
14997
+ const struct ggml_compute_params * params,
14998
+ const struct ggml_tensor * a,
14999
+ struct ggml_tensor * dst,
15000
+ const ggml_custom1_op_f32_t fun) {
15001
+ switch (a->type) {
15002
+ case GGML_TYPE_F32:
15003
+ {
15004
+ ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
15005
+ } break;
15006
+ default:
15007
+ {
15008
+ GGML_ASSERT(false);
15009
+ } break;
15010
+ }
15011
+ }
15012
+
15013
+ // ggml_compute_forward_map_custom2
15014
+
15015
+ static void ggml_compute_forward_map_custom2_f32(
15016
+ const struct ggml_compute_params * params,
15017
+ const struct ggml_tensor * a,
15018
+ const struct ggml_tensor * b,
15019
+ struct ggml_tensor * dst,
15020
+ const ggml_custom2_op_f32_t fun) {
15021
+ assert(params->ith == 0);
15022
+
15023
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15024
+ return;
15025
+ }
15026
+
15027
+ fun(dst, a, b);
15028
+ }
15029
+
15030
+
15031
+ static void ggml_compute_forward_map_custom2(
15032
+ const struct ggml_compute_params * params,
15033
+ const struct ggml_tensor * a,
15034
+ const struct ggml_tensor * b,
15035
+ struct ggml_tensor * dst,
15036
+ const ggml_custom2_op_f32_t fun) {
15037
+ switch (a->type) {
15038
+ case GGML_TYPE_F32:
15039
+ {
15040
+ ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
15041
+ } break;
15042
+ default:
15043
+ {
15044
+ GGML_ASSERT(false);
15045
+ } break;
15046
+ }
15047
+ }
15048
+
15049
+ // ggml_compute_forward_map_custom3
15050
+
15051
+ static void ggml_compute_forward_map_custom3_f32(
15052
+ const struct ggml_compute_params * params,
15053
+ const struct ggml_tensor * a,
15054
+ const struct ggml_tensor * b,
15055
+ const struct ggml_tensor * c,
15056
+ struct ggml_tensor * dst,
15057
+ const ggml_custom3_op_f32_t fun) {
15058
+ assert(params->ith == 0);
15059
+
15060
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15061
+ return;
15062
+ }
15063
+
15064
+ fun(dst, a, b, c);
15065
+ }
15066
+
15067
+
15068
+ static void ggml_compute_forward_map_custom3(
15069
+ const struct ggml_compute_params * params,
15070
+ const struct ggml_tensor * a,
15071
+ const struct ggml_tensor * b,
15072
+ const struct ggml_tensor * c,
15073
+ struct ggml_tensor * dst,
15074
+ const ggml_custom3_op_f32_t fun) {
15075
+ switch (a->type) {
15076
+ case GGML_TYPE_F32:
15077
+ {
15078
+ ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
15079
+ } break;
15080
+ default:
15081
+ {
15082
+ GGML_ASSERT(false);
15083
+ } break;
15084
+ }
15085
+ }
15086
+
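The three MAP_CUSTOM kernels above simply forward to a user-supplied callback, so arbitrary f32 element-wise (or small custom) operations can be spliced into a graph without touching ggml itself. A sketch of such a callback, using the ggml_custom1_op_f32_t shape implied by the fun(dst, a) call above; the ggml_map_custom1() builder mentioned in the trailing comment is assumed to be the matching declaration in ggml.h:

    #include "ggml.h"

    // User callback for the single-input custom op: receives the destination
    // tensor and the input tensor, both already allocated by ggml.
    static void my_square_f32(struct ggml_tensor * dst, const struct ggml_tensor * a) {
        const int64_t n = ggml_nelements(a);
        const float * x = (const float *) a->data;
        float       * y = (float *)       dst->data;
        for (int64_t i = 0; i < n; ++i) {
            y[i] = x[i]*x[i];
        }
    }

    // Graph construction (assumed builder API, see lead-in):
    //   struct ggml_tensor * out = ggml_map_custom1(ctx, input, my_square_f32);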
14022
15087
  // ggml_compute_forward_cross_entropy_loss
14023
15088
 
14024
15089
  static void ggml_compute_forward_cross_entropy_loss_f32(
@@ -14309,7 +15374,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14309
15374
  if (skip_cpu) {
14310
15375
  return;
14311
15376
  }
14312
- GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
15377
+ GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
14313
15378
  GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
14314
15379
  #endif // GGML_USE_CUBLAS
14315
15380
 
@@ -14398,6 +15463,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14398
15463
  {
14399
15464
  ggml_compute_forward_gelu(params, tensor->src0, tensor);
14400
15465
  } break;
15466
+ case GGML_OP_GELU_QUICK:
15467
+ {
15468
+ ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
15469
+ } break;
14401
15470
  case GGML_OP_SILU:
14402
15471
  {
14403
15472
  ggml_compute_forward_silu(params, tensor->src0, tensor);
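GGML_OP_GELU_QUICK, dispatched above, is the sigmoid approximation of GELU commonly written as gelu_quick(x) = x * sigmoid(1.702 * x); the kernel itself is outside this hunk, so the scalar form below is a reference sketch of the standard formula rather than a copy of the ggml implementation:

    #include <math.h>

    // Reference scalar form of the "quick" GELU approximation (sketch).
    static inline float gelu_quick_ref(float x) {
        return x / (1.0f + expf(-1.702f*x));  // x * sigmoid(1.702 * x)
    }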
@@ -14502,19 +15571,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14502
15571
  {
14503
15572
  ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
14504
15573
  } break;
14505
- case GGML_OP_CONV_1D_1S:
15574
+ case GGML_OP_CONV_1D_S1_PH:
15575
+ {
15576
+ ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
15577
+ } break;
15578
+ case GGML_OP_CONV_1D_S2_PH:
14506
15579
  {
14507
- ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
15580
+ ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
14508
15581
  } break;
14509
- case GGML_OP_CONV_1D_2S:
15582
+ case GGML_OP_CONV_2D_SK_P0:
14510
15583
  {
14511
- ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor);
15584
+ ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
14512
15585
  } break;
14513
15586
  case GGML_OP_FLASH_ATTN:
14514
15587
  {
14515
- int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
15588
+ const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
14516
15589
  GGML_ASSERT(t == 0 || t == 1);
14517
- bool masked = t != 0;
15590
+ const bool masked = t != 0;
14518
15591
  ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
14519
15592
  } break;
14520
15593
  case GGML_OP_FLASH_FF:
@@ -14528,6 +15601,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14528
15601
  bool masked = t != 0;
14529
15602
  ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
14530
15603
  } break;
15604
+ case GGML_OP_WIN_PART:
15605
+ {
15606
+ ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
15607
+ } break;
15608
+ case GGML_OP_WIN_UNPART:
15609
+ {
15610
+ ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
15611
+ } break;
14531
15612
  case GGML_OP_MAP_UNARY:
14532
15613
  {
14533
15614
  const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
@@ -14540,6 +15621,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14540
15621
  ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
14541
15622
  }
14542
15623
  break;
15624
+ case GGML_OP_MAP_CUSTOM1:
15625
+ {
15626
+ const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
15627
+ ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
15628
+ }
15629
+ break;
15630
+ case GGML_OP_MAP_CUSTOM2:
15631
+ {
15632
+ const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
15633
+ ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
15634
+ }
15635
+ break;
15636
+ case GGML_OP_MAP_CUSTOM3:
15637
+ {
15638
+ const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
15639
+ ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
15640
+ }
15641
+ break;
14543
15642
  case GGML_OP_CROSS_ENTROPY_LOSS:
14544
15643
  {
14545
15644
  ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -14799,6 +15898,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14799
15898
  {
14800
15899
  GGML_ASSERT(false); // TODO: not implemented
14801
15900
  } break;
15901
+ case GGML_OP_GELU_QUICK:
15902
+ {
15903
+ GGML_ASSERT(false); // TODO: not implemented
15904
+ } break;
14802
15905
  case GGML_OP_ALIBI:
14803
15906
  {
14804
15907
  GGML_ASSERT(false); // TODO: not implemented
@@ -15144,28 +16247,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15144
16247
  {
15145
16248
  if (src0->grad) {
15146
16249
  assert(src1->type == GGML_TYPE_I32);
15147
- assert(ggml_nelements(src1) == 3);
16250
+ assert(ggml_nelements(src1) == 4);
15148
16251
  const int n_past = ((int32_t *) src1->data)[0];
15149
16252
  const int n_dims = ((int32_t *) src1->data)[1];
15150
16253
  const int mode = ((int32_t *) src1->data)[2];
16254
+ const int n_ctx = ((int32_t *) src1->data)[3];
15151
16255
  src0->grad = ggml_add_impl(ctx,
15152
16256
  src0->grad,
15153
16257
  ggml_rope(ctx,
15154
16258
  tensor->grad,
15155
16259
  n_past,
15156
16260
  n_dims,
15157
- mode),
16261
+ mode,
16262
+ n_ctx),
15158
16263
  inplace);
15159
16264
  }
15160
16265
  if (src1->grad) {
15161
16266
  // noop
15162
16267
  }
15163
16268
  } break;
15164
- case GGML_OP_CONV_1D_1S:
16269
+ case GGML_OP_CONV_1D_S1_PH:
15165
16270
  {
15166
16271
  GGML_ASSERT(false); // TODO: not implemented
15167
16272
  } break;
15168
- case GGML_OP_CONV_1D_2S:
16273
+ case GGML_OP_CONV_1D_S2_PH:
16274
+ {
16275
+ GGML_ASSERT(false); // TODO: not implemented
16276
+ } break;
16277
+ case GGML_OP_CONV_2D_SK_P0:
15169
16278
  {
15170
16279
  GGML_ASSERT(false); // TODO: not implemented
15171
16280
  } break;
@@ -15334,8 +16443,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15334
16443
  {
15335
16444
  GGML_ASSERT(false); // not supported
15336
16445
  } break;
16446
+ case GGML_OP_WIN_PART:
16447
+ case GGML_OP_WIN_UNPART:
15337
16448
  case GGML_OP_MAP_UNARY:
15338
16449
  case GGML_OP_MAP_BINARY:
16450
+ case GGML_OP_MAP_CUSTOM1:
16451
+ case GGML_OP_MAP_CUSTOM2:
16452
+ case GGML_OP_MAP_CUSTOM3:
15339
16453
  {
15340
16454
  GGML_ASSERT(false); // not supported
15341
16455
  } break;
@@ -15407,7 +16521,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15407
16521
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15408
16522
 
15409
16523
  if (strlen(node->name) == 0) {
15410
- snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
16524
+ ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
15411
16525
  }
15412
16526
 
15413
16527
  cgraph->leafs[cgraph->n_leafs] = node;
@@ -15416,7 +16530,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15416
16530
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15417
16531
 
15418
16532
  if (strlen(node->name) == 0) {
15419
- snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
16533
+ ggml_format_name(node, "node_%d", cgraph->n_nodes);
15420
16534
  }
15421
16535
 
15422
16536
  cgraph->nodes[cgraph->n_nodes] = node;
@@ -15570,68 +16684,173 @@ typedef pthread_t ggml_thread_t;
15570
16684
 
15571
16685
  #endif
15572
16686
 
16687
+ // Android's libc implementation "bionic" does not support setting affinity
16688
+ #if defined(__linux__) && !defined(__BIONIC__)
16689
+ void set_numa_thread_affinity(int thread_n, int n_threads) {
16690
+ if (!ggml_is_numa()) {
16691
+ return;
16692
+ }
16693
+
16694
+ // run thread on node_num thread_n / (threads per node)
16695
+ const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
16696
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
16697
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16698
+
16699
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16700
+ CPU_ZERO_S(setsize, cpus);
16701
+ for (size_t i = 0; i < node->n_cpus; ++i) {
16702
+ CPU_SET_S(node->cpus[i], setsize, cpus);
16703
+ }
16704
+
16705
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16706
+ if (rv) {
16707
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16708
+ strerror(rv));
16709
+ }
16710
+
16711
+ CPU_FREE(cpus);
16712
+ }
16713
+
16714
+ void clear_numa_thread_affinity(void) {
16715
+ if (!ggml_is_numa()) {
16716
+ return;
16717
+ }
16718
+
16719
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16720
+
16721
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16722
+ CPU_ZERO_S(setsize, cpus);
16723
+ for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
16724
+ CPU_SET_S(i, setsize, cpus);
16725
+ }
16726
+
16727
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16728
+ if (rv) {
16729
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16730
+ strerror(rv));
16731
+ }
16732
+
16733
+ CPU_FREE(cpus);
16734
+ }
16735
+ #else
16736
+ // TODO: Windows etc.
16737
+ // (the linux implementation may also work on BSD, someone should test)
16738
+ void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16739
+ void clear_numa_thread_affinity(void) {}
16740
+ #endif
16741
+
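set_numa_thread_affinity above pins each worker thread to one NUMA node, splitting the threads into contiguous blocks (one block per node) so a thread's working set stays on local memory; clear_numa_thread_affinity restores a full CPU mask on the main thread afterwards. The node assignment is just the integer arithmetic below (standalone sketch):

    // Thread -> NUMA node mapping used above: ceil(n_threads / n_nodes) threads per node.
    static int numa_node_for_thread(int thread_n, int n_threads, int n_nodes) {
        const int per_node = (n_threads + n_nodes - 1) / n_nodes;
        return thread_n / per_node;
    }
    // e.g. 8 threads over 2 nodes: threads 0..3 -> node 0, threads 4..7 -> node 1.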
15573
16742
  struct ggml_compute_state_shared {
15574
- ggml_lock_t spin;
16743
+ struct ggml_cgraph * cgraph;
16744
+
16745
+ int64_t perf_node_start_cycles;
16746
+ int64_t perf_node_start_time_us;
15575
16747
 
15576
16748
  int n_threads;
15577
16749
 
15578
16750
  // synchronization primitives
15579
- atomic_int n_ready;
15580
- atomic_bool has_work;
15581
- atomic_bool stop; // stop all threads
16751
+ atomic_int n_active; // num active threads
16752
+ atomic_int node_n; // active graph node
15582
16753
  };
15583
16754
 
15584
16755
  struct ggml_compute_state {
15585
16756
  ggml_thread_t thrd;
15586
-
15587
- struct ggml_compute_params params;
15588
- struct ggml_tensor * node;
15589
-
16757
+ int ith;
15590
16758
  struct ggml_compute_state_shared * shared;
15591
16759
  };
15592
16760
 
16761
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16762
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16763
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
16764
+
16765
+ node->perf_runs++;
16766
+ node->perf_cycles += cycles_cur;
16767
+ node->perf_time_us += time_us_cur;
16768
+ }
16769
+
15593
16770
  static thread_ret_t ggml_graph_compute_thread(void * data) {
15594
16771
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
16772
+ struct ggml_cgraph * cgraph = state->shared->cgraph;
15595
16773
 
15596
16774
  const int n_threads = state->shared->n_threads;
16775
+ set_numa_thread_affinity(state->ith, n_threads);
16776
+
16777
+ int node_n = -1;
15597
16778
 
15598
16779
  while (true) {
15599
- if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
15600
- atomic_store(&state->shared->has_work, false);
15601
- } else {
15602
- while (atomic_load(&state->shared->has_work)) {
15603
- if (atomic_load(&state->shared->stop)) {
15604
- return 0;
15605
- }
15606
- ggml_lock_lock (&state->shared->spin);
15607
- ggml_lock_unlock(&state->shared->spin);
16780
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
16781
+ // all other threads are finished and spinning
16782
+ // do finalize and init here so we don't have to synchronize again
16783
+ struct ggml_compute_params params = {
16784
+ /*.type =*/ GGML_TASK_FINALIZE,
16785
+ /*.ith =*/ 0,
16786
+ /*.nth =*/ 0,
16787
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16788
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16789
+ };
16790
+
16791
+ if (node_n != -1) {
16792
+ /* FINALIZE */
16793
+ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16794
+ params.nth = node->n_tasks;
16795
+ ggml_compute_forward(&params, node);
16796
+ ggml_graph_compute_perf_stats_node(node, state->shared);
15608
16797
  }
15609
- }
15610
16798
 
15611
- atomic_fetch_sub(&state->shared->n_ready, 1);
16799
+ // distribute new work or execute it direct if 1T
16800
+ while (++node_n < cgraph->n_nodes) {
16801
+ GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16802
+
16803
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16804
+
16805
+ state->shared->perf_node_start_cycles = ggml_perf_cycles();
16806
+ state->shared->perf_node_start_time_us = ggml_perf_time_us();
16807
+
16808
+ /* INIT */
16809
+ params.type = GGML_TASK_INIT;
16810
+ params.nth = node->n_tasks;
16811
+ ggml_compute_forward(&params, node);
16812
+
16813
+ if (node->n_tasks == 1) {
16814
+ // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16815
+ // they do something more efficient than spinning (?)
16816
+ params.type = GGML_TASK_COMPUTE;
16817
+ ggml_compute_forward(&params, node);
15612
16818
 
15613
- // wait for work
15614
- while (!atomic_load(&state->shared->has_work)) {
15615
- if (atomic_load(&state->shared->stop)) {
15616
- return 0;
16819
+ params.type = GGML_TASK_FINALIZE;
16820
+ ggml_compute_forward(&params, node);
16821
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16822
+ } else {
16823
+ break;
16824
+ }
15617
16825
  }
15618
- ggml_lock_lock (&state->shared->spin);
15619
- ggml_lock_unlock(&state->shared->spin);
16826
+
16827
+ atomic_store(&state->shared->n_active, n_threads);
16828
+ atomic_store(&state->shared->node_n, node_n);
16829
+ } else {
16830
+ // wait for other threads to finish
16831
+ const int last = node_n;
16832
+ do {
16833
+ sched_yield();
16834
+ node_n = atomic_load(&state->shared->node_n);
16835
+ } while (node_n == last);
15620
16836
  }
15621
16837
 
15622
16838
  // check if we should stop
15623
- if (atomic_load(&state->shared->stop)) {
15624
- break;
15625
- }
16839
+ if (node_n >= cgraph->n_nodes) break;
15626
16840
 
15627
- if (state->node) {
15628
- if (state->params.ith < state->params.nth) {
15629
- ggml_compute_forward(&state->params, state->node);
15630
- }
16841
+ /* COMPUTE */
16842
+ struct ggml_tensor * node = cgraph->nodes[node_n];
15631
16843
 
15632
- state->node = NULL;
15633
- } else {
15634
- break;
16844
+ struct ggml_compute_params params = {
16845
+ /*.type =*/ GGML_TASK_COMPUTE,
16846
+ /*.ith =*/ state->ith,
16847
+ /*.nth =*/ node->n_tasks,
16848
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16849
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16850
+ };
16851
+
16852
+ if (state->ith < node->n_tasks) {
16853
+ ggml_compute_forward(&params, node);
15635
16854
  }
15636
16855
  }
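The rewritten ggml_graph_compute_thread above replaces the old has_work/n_ready spinlock handshake with two shared atomics: node_n, the graph node currently being computed, and n_active, the number of threads still working on it. The last thread to decrement n_active finalizes the node, runs INIT for the following nodes (computing single-task nodes inline), then publishes the new node_n; the others yield until node_n changes and then compute their slice. A stripped-down, self-contained toy of that handoff (not the ggml code itself, with the per-node work elided):

    #include <stdatomic.h>
    #include <sched.h>

    #define N_NODES 8

    static atomic_int g_node_n   = -1;
    static atomic_int g_n_active = 0;   // set to n_threads before the workers start

    static void toy_worker(int n_threads) {
        int node_n = -1;
        while (1) {
            if (atomic_fetch_sub(&g_n_active, 1) == 1) {
                // last thread out: FINALIZE node_n, INIT node_n + 1, then publish
                node_n++;                                 // (actual work elided)
                atomic_store(&g_n_active, n_threads);
                atomic_store(&g_node_n, node_n);
            } else {
                const int last = node_n;
                do {
                    sched_yield();                        // wait for the publisher
                    node_n = atomic_load(&g_node_n);
                } while (node_n == last);
            }
            if (node_n >= N_NODES) {
                break;                                    // past the last graph node
            }
            // this thread's COMPUTE slice of node_n would run here
        }
    }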
15637
16856
 
@@ -15642,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15642
16861
  const int n_threads = cgraph->n_threads;
15643
16862
 
15644
16863
  struct ggml_compute_state_shared state_shared = {
15645
- /*.spin =*/ GGML_LOCK_INITIALIZER,
15646
- /*.n_threads =*/ n_threads,
15647
- /*.n_ready =*/ 0,
15648
- /*.has_work =*/ false,
15649
- /*.stop =*/ false,
16864
+ /*.cgraph =*/ cgraph,
16865
+ /*.perf_node_start_cycles =*/ 0,
16866
+ /*.perf_node_start_time_us =*/ 0,
16867
+ /*.n_threads =*/ n_threads,
16868
+ /*.n_active =*/ n_threads,
16869
+ /*.node_n =*/ -1,
15650
16870
  };
15651
- struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
15652
-
15653
- // create thread pool
15654
- if (n_threads > 1) {
15655
- ggml_lock_init(&state_shared.spin);
15656
-
15657
- atomic_store(&state_shared.has_work, true);
15658
-
15659
- for (int j = 0; j < n_threads - 1; j++) {
15660
- workers[j] = (struct ggml_compute_state) {
15661
- .thrd = 0,
15662
- .params = {
15663
- .type = GGML_TASK_COMPUTE,
15664
- .ith = j + 1,
15665
- .nth = n_threads,
15666
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
15667
- .wdata = cgraph->work ? cgraph->work->data : NULL,
15668
- },
15669
- .node = NULL,
15670
- .shared = &state_shared,
15671
- };
15672
-
15673
- int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
15674
- GGML_ASSERT(rc == 0);
15675
- UNUSED(rc);
15676
- }
15677
- }
16871
+ struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
15678
16872
 
15679
16873
  // initialize tasks + work buffer
15680
16874
  {
@@ -15742,6 +16936,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15742
16936
  } break;
15743
16937
  case GGML_OP_MUL:
15744
16938
  case GGML_OP_GELU:
16939
+ case GGML_OP_GELU_QUICK:
15745
16940
  case GGML_OP_SILU:
15746
16941
  case GGML_OP_SILU_BACK:
15747
16942
  case GGML_OP_NORM:
@@ -15817,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15817
17012
  } break;
15818
17013
  case GGML_OP_SCALE:
15819
17014
  {
15820
- node->n_tasks = n_threads;
17015
+ node->n_tasks = 1;
15821
17016
  } break;
15822
17017
  case GGML_OP_SET:
15823
17018
  case GGML_OP_CONT:
@@ -15848,8 +17043,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15848
17043
  {
15849
17044
  node->n_tasks = 1; //TODO
15850
17045
  } break;
15851
- case GGML_OP_CONV_1D_1S:
15852
- case GGML_OP_CONV_1D_2S:
17046
+ case GGML_OP_CONV_1D_S1_PH:
17047
+ case GGML_OP_CONV_1D_S2_PH:
15853
17048
  {
15854
17049
  node->n_tasks = n_threads;
15855
17050
 
@@ -15876,6 +17071,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15876
17071
  GGML_ASSERT(false);
15877
17072
  }
15878
17073
 
17074
+ work_size = MAX(work_size, cur);
17075
+ } break;
17076
+ case GGML_OP_CONV_2D_SK_P0:
17077
+ {
17078
+ node->n_tasks = n_threads;
17079
+
17080
+ GGML_ASSERT(node->src1->ne[3] == 1);
17081
+
17082
+ const int64_t ne00 = node->src0->ne[0]; // W
17083
+ const int64_t ne01 = node->src0->ne[1]; // H
17084
+ const int64_t ne02 = node->src0->ne[2]; // C
17085
+ const int64_t ne03 = node->src0->ne[3]; // N
17086
+
17087
+ const int64_t ne10 = node->src1->ne[0]; // W
17088
+ const int64_t ne11 = node->src1->ne[1]; // H
17089
+ const int64_t ne12 = node->src1->ne[2]; // C
17090
+
17091
+ const int64_t nk = ne00*ne01;
17092
+
17093
+ UNUSED(ne02);
17094
+ UNUSED(ne03);
17095
+ UNUSED(nk);
17096
+
17097
+ size_t cur = 0;
17098
+
17099
+ if (node->src0->type == GGML_TYPE_F16 &&
17100
+ node->src1->type == GGML_TYPE_F32) {
17101
+ cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
17102
+ } else if (node->src0->type == GGML_TYPE_F32 &&
17103
+ node->src1->type == GGML_TYPE_F32) {
17104
+ cur = sizeof(float)* (ne10*ne11*ne12);
17105
+ } else {
17106
+ GGML_ASSERT(false);
17107
+ }
17108
+
15879
17109
  work_size = MAX(work_size, cur);
15880
17110
  } break;
15881
17111
  case GGML_OP_FLASH_ATTN:
@@ -15937,8 +17167,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15937
17167
 
15938
17168
  work_size = MAX(work_size, cur);
15939
17169
  } break;
17170
+ case GGML_OP_WIN_PART:
17171
+ case GGML_OP_WIN_UNPART:
15940
17172
  case GGML_OP_MAP_UNARY:
15941
17173
  case GGML_OP_MAP_BINARY:
17174
+ case GGML_OP_MAP_CUSTOM1:
17175
+ case GGML_OP_MAP_CUSTOM2:
17176
+ case GGML_OP_MAP_CUSTOM3:
15942
17177
  {
15943
17178
  node->n_tasks = 1;
15944
17179
  } break;
@@ -15981,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15981
17216
  }
15982
17217
  }
15983
17218
 
15984
- const int64_t perf_start_cycles = ggml_perf_cycles();
15985
- const int64_t perf_start_time_us = ggml_perf_time_us();
15986
-
15987
- for (int i = 0; i < cgraph->n_nodes; i++) {
15988
- GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
15989
-
15990
- struct ggml_tensor * node = cgraph->nodes[i];
15991
-
15992
- // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
15993
- //if (node->grad == NULL && node->perf_runs > 0) {
15994
- // continue;
15995
- //}
15996
-
15997
- const int64_t perf_node_start_cycles = ggml_perf_cycles();
15998
- const int64_t perf_node_start_time_us = ggml_perf_time_us();
15999
-
16000
- // INIT
16001
- struct ggml_compute_params params = {
16002
- /*.type =*/ GGML_TASK_INIT,
16003
- /*.ith =*/ 0,
16004
- /*.nth =*/ node->n_tasks,
16005
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16006
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16007
- };
16008
-
16009
- ggml_compute_forward(&params, node);
16010
-
16011
- // COMPUTE
16012
- if (node->n_tasks > 1) {
16013
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16014
- atomic_store(&state_shared.has_work, false);
16015
- }
16016
-
16017
- while (atomic_load(&state_shared.has_work)) {
16018
- ggml_lock_lock (&state_shared.spin);
16019
- ggml_lock_unlock(&state_shared.spin);
16020
- }
16021
-
16022
- // launch thread pool
16023
- for (int j = 0; j < n_threads - 1; j++) {
16024
- workers[j].params = (struct ggml_compute_params) {
16025
- .type = GGML_TASK_COMPUTE,
16026
- .ith = j + 1,
16027
- .nth = node->n_tasks,
16028
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16029
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16030
- };
16031
- workers[j].node = node;
16032
- }
16033
-
16034
- atomic_fetch_sub(&state_shared.n_ready, 1);
16035
-
16036
- while (atomic_load(&state_shared.n_ready) > 0) {
16037
- ggml_lock_lock (&state_shared.spin);
16038
- ggml_lock_unlock(&state_shared.spin);
16039
- }
16040
-
16041
- atomic_store(&state_shared.has_work, true);
16042
- }
16043
-
16044
- params.type = GGML_TASK_COMPUTE;
16045
- ggml_compute_forward(&params, node);
16046
-
16047
- // wait for thread pool
16048
- if (node->n_tasks > 1) {
16049
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16050
- atomic_store(&state_shared.has_work, false);
16051
- }
16052
-
16053
- while (atomic_load(&state_shared.has_work)) {
16054
- ggml_lock_lock (&state_shared.spin);
16055
- ggml_lock_unlock(&state_shared.spin);
16056
- }
16057
-
16058
- atomic_fetch_sub(&state_shared.n_ready, 1);
16059
-
16060
- while (atomic_load(&state_shared.n_ready) != 0) {
16061
- ggml_lock_lock (&state_shared.spin);
16062
- ggml_lock_unlock(&state_shared.spin);
16063
- }
16064
- }
16065
-
16066
- // FINALIZE
16067
- if (node->n_tasks > 1) {
16068
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16069
- atomic_store(&state_shared.has_work, false);
16070
- }
16071
-
16072
- while (atomic_load(&state_shared.has_work)) {
16073
- ggml_lock_lock (&state_shared.spin);
16074
- ggml_lock_unlock(&state_shared.spin);
16075
- }
16076
-
16077
- // launch thread pool
16078
- for (int j = 0; j < n_threads - 1; j++) {
16079
- workers[j].params = (struct ggml_compute_params) {
16080
- .type = GGML_TASK_FINALIZE,
16081
- .ith = j + 1,
16082
- .nth = node->n_tasks,
16083
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16084
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16085
- };
16086
- workers[j].node = node;
16087
- }
16088
-
16089
- atomic_fetch_sub(&state_shared.n_ready, 1);
16090
-
16091
- while (atomic_load(&state_shared.n_ready) > 0) {
16092
- ggml_lock_lock (&state_shared.spin);
16093
- ggml_lock_unlock(&state_shared.spin);
16094
- }
17219
+ // create thread pool
17220
+ if (n_threads > 1) {
17221
+ for (int j = 1; j < n_threads; ++j) {
17222
+ workers[j] = (struct ggml_compute_state) {
17223
+ .thrd = 0,
17224
+ .ith = j,
17225
+ .shared = &state_shared,
17226
+ };
16095
17227
 
16096
- atomic_store(&state_shared.has_work, true);
17228
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
17229
+ GGML_ASSERT(rc == 0);
16097
17230
  }
17231
+ }
17232
+ workers[0].ith = 0;
17233
+ workers[0].shared = &state_shared;
16098
17234
 
16099
- params.type = GGML_TASK_FINALIZE;
16100
- ggml_compute_forward(&params, node);
16101
-
16102
- // wait for thread pool
16103
- if (node->n_tasks > 1) {
16104
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16105
- atomic_store(&state_shared.has_work, false);
16106
- }
16107
-
16108
- while (atomic_load(&state_shared.has_work)) {
16109
- ggml_lock_lock (&state_shared.spin);
16110
- ggml_lock_unlock(&state_shared.spin);
16111
- }
16112
-
16113
- atomic_fetch_sub(&state_shared.n_ready, 1);
17235
+ const int64_t perf_start_cycles = ggml_perf_cycles();
17236
+ const int64_t perf_start_time_us = ggml_perf_time_us();
16114
17237
 
16115
- while (atomic_load(&state_shared.n_ready) != 0) {
16116
- ggml_lock_lock (&state_shared.spin);
16117
- ggml_lock_unlock(&state_shared.spin);
16118
- }
16119
- }
17238
+ // this is a work thread too
17239
+ ggml_graph_compute_thread(&workers[0]);
16120
17240
 
16121
- // performance stats (node)
16122
- {
16123
- int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
16124
- int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
16125
-
16126
- node->perf_runs++;
16127
- node->perf_cycles += perf_cycles_cur;
16128
- node->perf_time_us += perf_time_us_cur;
16129
- }
16130
- }
17241
+ // don't leave affinity set on the main thread
17242
+ clear_numa_thread_affinity();
16131
17243
 
16132
17244
  // join thread pool
16133
17245
  if (n_threads > 1) {
16134
- atomic_store(&state_shared.stop, true);
16135
- atomic_store(&state_shared.has_work, true);
16136
-
16137
- for (int j = 0; j < n_threads - 1; j++) {
16138
- int rc = ggml_thread_join(workers[j].thrd, NULL);
17246
+ for (int j = 1; j < n_threads; j++) {
17247
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
16139
17248
  GGML_ASSERT(rc == 0);
16140
- UNUSED(rc);
16141
17249
  }
16142
-
16143
- ggml_lock_destroy(&state_shared.spin);
16144
17250
  }
16145
17251
 
16146
17252
  // performance stats (graph)
@@ -16469,16 +17575,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
16469
17575
 
16470
17576
  if (!*ctx_data) {
16471
17577
  fprintf(stderr, "%s: failed to create ggml context\n", __func__);
17578
+ fclose(fin);
16472
17579
  return result;
16473
17580
  }
16474
17581
  }
16475
17582
 
16476
17583
  data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
16477
17584
 
16478
- const size_t ret = fread(data->data, sizeof(char), fsize, fin);
16479
- if (ret != fsize) {
16480
- fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
16481
- return result;
17585
+ {
17586
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
17587
+ if (ret != fsize) {
17588
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
17589
+ fclose(fin);
17590
+ return result;
17591
+ }
16482
17592
  }
16483
17593
 
16484
17594
  fclose(fin);
@@ -16758,6 +17868,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
16758
17868
  return NULL;
16759
17869
  }
16760
17870
 
17871
+ static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17872
+ struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
17873
+ struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
17874
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
17875
+ gparent0 ? (void *) gparent0 : (void *) parent,
17876
+ gparent0 ? "g" : "x",
17877
+ gparent ? (void *) gparent : (void *) node,
17878
+ gparent ? "g" : "x",
17879
+ gparent ? "empty" : "vee",
17880
+ gparent ? "dashed" : "solid",
17881
+ label);
17882
+ }
17883
+
17884
+ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17885
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
17886
+ (void *) parent, "x",
17887
+ (void *) node, "x",
17888
+ label);
17889
+ }
17890
+
16761
17891
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
16762
17892
  char color[16];
16763
17893
 
@@ -16793,7 +17923,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16793
17923
  (void *) node, color);
16794
17924
 
16795
17925
  if (strlen(node->name) > 0) {
16796
- fprintf(fp, "%s |", node->name);
17926
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17927
+ } else {
17928
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
16797
17929
  }
16798
17930
 
16799
17931
  if (node->n_dims == 2) {
@@ -16802,7 +17934,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16802
17934
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
16803
17935
  }
16804
17936
 
16805
-
16806
17937
  if (node->grad) {
16807
17938
  fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
16808
17939
  } else {
@@ -16821,18 +17952,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16821
17952
  (void *) node, color);
16822
17953
 
16823
17954
  if (strlen(node->name) > 0) {
16824
- fprintf(fp, "%s | ", node->name);
17955
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17956
+ } else {
17957
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
16825
17958
  }
16826
- if (ggml_nelements(node) == 1) {
16827
- if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
16828
- fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
16829
- }
16830
- else {
16831
- fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
17959
+
17960
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17961
+ if (ggml_nelements(node) < 5) {
17962
+ fprintf(fp, " | (");
17963
+ for (int j = 0; j < ggml_nelements(node); j++) {
17964
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
17965
+ fprintf(fp, "%d", ggml_get_i32_1d(node, j));
17966
+ }
17967
+ else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
17968
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
17969
+ }
17970
+ else {
17971
+ fprintf(fp, "#");
17972
+ }
17973
+ if (j < ggml_nelements(node) - 1) {
17974
+ fprintf(fp, ", ");
17975
+ }
16832
17976
  }
16833
- }
16834
- else {
16835
- fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17977
+ fprintf(fp, ")");
16836
17978
  }
16837
17979
  fprintf(fp, "\"; ]\n");
16838
17980
  }
@@ -16840,30 +17982,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16840
17982
  for (int i = 0; i < gb->n_nodes; i++) {
16841
17983
  struct ggml_tensor * node = gb->nodes[i];
16842
17984
 
16843
- struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
16844
-
16845
17985
  if (node->src0) {
16846
- struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
16847
-
16848
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
16849
- parent0 ? (void *) parent0 : (void *) node->src0,
16850
- parent0 ? "g" : "x",
16851
- parent ? (void *) parent : (void *) node,
16852
- parent ? "g" : "x",
16853
- parent ? "empty" : "vee",
16854
- parent ? "dashed" : "solid");
17986
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
16855
17987
  }
16856
17988
 
16857
17989
  if (node->src1) {
16858
- struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
16859
-
16860
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
16861
- parent1 ? (void *) parent1 : (void *) node->src1,
16862
- parent1 ? "g" : "x",
16863
- parent ? (void *) parent : (void *) node,
16864
- parent ? "g" : "x",
16865
- parent ? "empty" : "vee",
16866
- parent ? "dashed" : "solid");
17990
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
17991
+ }
17992
+
17993
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
17994
+ if (node->opt[j]) {
17995
+ char label[16];
17996
+ snprintf(label, sizeof(label), "opt %d", j);
17997
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
17998
+ }
16867
17999
  }
16868
18000
  }
16869
18001
 
@@ -16871,15 +18003,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16871
18003
  struct ggml_tensor * node = gb->leafs[i];
16872
18004
 
16873
18005
  if (node->src0) {
16874
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
16875
- (void *) node->src0, "x",
16876
- (void *) node, "x");
18006
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
16877
18007
  }
16878
18008
 
16879
18009
  if (node->src1) {
16880
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
16881
- (void *) node->src1, "x",
16882
- (void *) node, "x");
18010
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
18011
+ }
18012
+
18013
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
18014
+ if (node->opt[j]) {
18015
+ char label[16];
18016
+ snprintf(label, sizeof(label), "opt %d", j);
18017
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
18018
+ }
16883
18019
  }
16884
18020
  }
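The graph-dump changes above factor the edge printing into ggml_graph_dump_dot_node_edge / ggml_graph_dump_dot_leaf_edge, label every tensor with its type, print small constants in full, and also draw the opt[] edges. Inspecting a graph is unchanged from the caller's side; a minimal sketch, where the ggml_build_forward call and the NULL gradient-graph argument are the conventional forward-only usage stated here as an assumption rather than taken from this diff:

    #include "ggml.h"

    // Sketch: dump the forward graph of a tensor t and render it with graphviz.
    static void dump_graph(struct ggml_tensor * t) {
        struct ggml_cgraph gf = ggml_build_forward(t);
        ggml_graph_dump_dot(&gf, NULL, "graph.dot");
        // render:  dot -Tpng graph.dot -o graph.png
    }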
16885
18021
 
@@ -17598,7 +18734,6 @@ GGML_API void ggml_opt_init(
17598
18734
  ggml_set_zero(opt->lbfgs.g);
17599
18735
  ggml_set_zero(opt->lbfgs.gp);
17600
18736
  ggml_set_zero(opt->lbfgs.d);
17601
- ggml_set_zero(opt->lbfgs.pf);
17602
18737
  if (opt->lbfgs.pf) {
17603
18738
  ggml_set_zero(opt->lbfgs.pf);
17604
18739
  }