llama_cpp 0.2.1 → 0.2.2

@@ -35,6 +35,12 @@
  #define static_assert(cond, msg) struct global_scope_noop_trick
  #endif

+ #if defined(_MSC_VER)
+ // disable "possible loss of data" to avoid hundreds of casts
+ // we should just be careful :)
+ #pragma warning(disable: 4244 4267)
+ #endif
+
  #if defined(_WIN32)

  #include <windows.h>
@@ -106,6 +112,7 @@ typedef void* thread_ret_t;
  /*#define GGML_PERF*/
  #define GGML_DEBUG 0
  #define GGML_GELU_FP16
+ #define GGML_GELU_QUICK_FP16
  #define GGML_SILU_FP16

  #define GGML_SOFT_MAX_UNROLL 4
@@ -334,6 +341,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
  // precomputed gelu table for f16 (128 KB)
  static ggml_fp16_t table_gelu_f16[1 << 16];

+ // precomputed quick gelu table for f16 (128 KB)
+ static ggml_fp16_t table_gelu_quick_f16[1 << 16];
+
  // precomputed silu table for f16 (128 KB)
  static ggml_fp16_t table_silu_f16[1 << 16];

@@ -1671,14 +1681,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
  #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
  #define GGML_F32x4_REDUCE(res, x) \
  { \
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
- x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
- x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
- x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
  } \
  res = GGML_F32x4_REDUCE_ONE(x[0]); \
  }
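Note: the rewritten GGML_F32x4_REDUCE above (and the matching F16/AVX/POWER9/WASM/SSE variants in the hunks below) replaces the strided pairwise adds with an offset-halving reduction that folds the upper half of the accumulator array into the lower half at each step. A minimal scalar sketch of the same pattern, with hypothetical names, not taken from the diff:

    /* offset-halving reduction over a power-of-two count of accumulators */
    static float reduce_accumulators(float x[], int n) {
        for (int offset = n >> 1; offset > 0; offset >>= 1) {
            for (int i = 0; i < offset; ++i) {
                x[i] += x[offset + i];  /* fold the upper half into the lower half */
            }
        }
        return x[0];
    }

With x = {1, 2, 3, 4} this folds to {4, 6}, then {10}, which matches the plain sum.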
@@ -1709,14 +1722,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
  #define GGML_F16x8_MUL vmulq_f16
  #define GGML_F16x8_REDUCE(res, x) \
  { \
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
- x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
+ int offset = GGML_F16_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
- x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
- x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
  } \
  const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
  const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1783,14 +1799,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
  #define GGML_F32x8_MUL _mm256_mul_ps
  #define GGML_F32x8_REDUCE(res, x) \
  { \
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
- x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
- x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
- x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
  } \
  const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
  _mm256_extractf128_ps(x[0], 1)); \
@@ -1880,14 +1899,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
  #define GGML_F32x4_MUL vec_mul
  #define GGML_F32x4_REDUCE(res, x) \
  { \
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
- x[2*i] = vec_add(x[2*i], x[2*i+1]); \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
- x[4*i] = vec_add(x[4*i], x[4*i+2]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
- x[8*i] = vec_add(x[8*i], x[8*i+4]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset+i]); \
  } \
  res = vec_extract(x[0], 0) + \
  vec_extract(x[0], 1) + \
@@ -1943,14 +1965,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
  #define GGML_F32x4_MUL wasm_f32x4_mul
  #define GGML_F32x4_REDUCE(res, x) \
  { \
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  } \
  res = wasm_f32x4_extract_lane(x[0], 0) + \
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2005,14 +2030,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
  #define GGML_F16x4_MUL wasm_f32x4_mul
  #define GGML_F16x4_REDUCE(res, x) \
  { \
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
+ int offset = GGML_F16_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  } \
  res = wasm_f32x4_extract_lane(x[0], 0) + \
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2054,14 +2082,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
  #define GGML_F32x4_MUL _mm_mul_ps
  #define GGML_F32x4_REDUCE(res, x) \
  { \
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
- x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
- x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
  } \
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
- x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
  } \
  const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
  res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
@@ -3350,6 +3381,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

  static const float GELU_COEF_A = 0.044715f;
+ static const float GELU_QUICK_COEF = -1.702f;
  static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

  inline static float ggml_gelu_f32(float x) {
@@ -3380,6 +3412,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
  }
  #endif

+ inline static float ggml_gelu_quick_f32(float x) {
+ return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
+ }
+
+ //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ // const uint16_t * i16 = (const uint16_t *) x;
+ // for (int i = 0; i < n; ++i) {
+ // y[i] = table_gelu_quick_f16[i16[i]];
+ // }
+ //}
+
+ #ifdef GGML_GELU_QUICK_FP16
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+ uint16_t t;
+ for (int i = 0; i < n; ++i) {
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+ memcpy(&t, &fp16, sizeof(uint16_t));
+ y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
+ }
+ }
+ #else
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = ggml_gelu_quick_f32(x[i]);
+ }
+ }
+ #endif
+
  // Sigmoid Linear Unit (SiLU) function
  inline static float ggml_silu_f32(float x) {
  return x/(1.0f + expf(-x));
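For reference, the new ggml_gelu_quick_f32 above is the sigmoid approximation of GELU, x * sigmoid(1.702 * x) (GELU_QUICK_COEF is -1.702, so the expf argument is already negated), and with GGML_GELU_QUICK_FP16 defined the vector version goes through the 65536-entry table_gelu_quick_f16 indexed by the fp16 bit pattern of the input. A standalone sketch comparing the approximation against the erf-based definition (illustrative only, not part of the diff):

    #include <math.h>
    #include <stdio.h>

    static float gelu_quick(float x) {
        /* x * sigmoid(1.702 * x) -- the approximation behind GGML_OP_GELU_QUICK */
        return x / (1.0f + expf(-1.702f * x));
    }

    static float gelu_exact(float x) {
        /* reference definition: 0.5 * x * (1 + erf(x / sqrt(2))) */
        return 0.5f * x * (1.0f + erff(x / sqrtf(2.0f)));
    }

    int main(void) {
        for (int i = -3; i <= 3; ++i) {
            const float x = (float) i;
            printf("x = %2d  quick = %8.5f  exact = %8.5f\n", i, gelu_quick(x), gelu_exact(x));
        }
        return 0;
    }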
@@ -3610,6 +3670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "STEP",
  "RELU",
  "GELU",
+ "GELU_QUICK",
  "SILU",
  "SILU_BACK",
  "NORM",
@@ -3638,12 +3699,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "ROPE_BACK",
  "ALIBI",
  "CLAMP",
- "CONV_1D_1S",
- "CONV_1D_2S",
+ "CONV_1D_S1_PH",
+ "CONV_1D_S2_PH",
+ "CONV_2D_SK_P0",

  "FLASH_ATTN",
  "FLASH_FF",
  "FLASH_ATTN_BACK",
+ "WIN_PART",
+ "WIN_UNPART",

  "MAP_UNARY",
  "MAP_BINARY",
@@ -3652,7 +3716,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "CROSS_ENTROPY_LOSS_BACK",
  };

- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
+ static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -3678,6 +3742,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "step(x)",
  "relu(x)",
  "gelu(x)",
+ "gelu_quick(x)",
  "silu(x)",
  "silu_back(x)",
  "norm(x)",
@@ -3706,12 +3771,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "rope_back(x)",
  "alibi(x)",
  "clamp(x)",
- "conv_1d_1s(x)",
- "conv_1d_2s(x)",
+ "conv_1d_s1_ph(x)",
+ "conv_1d_s2_ph(x)",
+ "conv_2d_sk_p0(x)",

  "flash_attn(x)",
  "flash_ff(x)",
  "flash_attn_back(x)",
+ "win_part(x)",
+ "win_unpart(x)",

  "f(x)",
  "f(x,y)",
@@ -3720,7 +3788,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "cross_entropy_loss_back(x,y)",
  };

- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
+ static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");

  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4011,7 +4079,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  // initialize time system (required on Windows)
  ggml_time_init();

- // initialize GELU, SILU and EXP F32 tables
+ // initialize GELU, Quick GELU, SILU and EXP F32 tables
  {
  const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

@@ -4021,13 +4089,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  memcpy(&ii, &ui, sizeof(ii));
  const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
  table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+ table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
  table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
  table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
  }

  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

- GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+ GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
  }

  // initialize g_state
@@ -4148,14 +4217,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
  ctx->no_alloc = no_alloc;
  }

- void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+ void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
  return ctx->mem_buffer;
  }

- size_t ggml_get_mem_size(struct ggml_context * ctx) {
+ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
  return ctx->mem_size;
  }

+ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
+ size_t max_size = 0;
+
+ struct ggml_object * obj = ctx->objects_begin;
+
+ while (obj != NULL) {
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+
+ const size_t size = ggml_nbytes(tensor);
+
+ if (max_size < size) {
+ max_size = size;
+ }
+
+ obj = obj->next;
+ }
+
+ return max_size;
+ }
+
  // IMPORTANT:
  // when creating "opt" tensors, always save and load the scratch buffer
  // this is an error prone process, but it is necessary to support inplace
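The new ggml_get_max_tensor_size walks the context's object list and returns the byte size of the largest tensor allocated in it, which is handy when sizing scratch or backend buffers. A hedged usage sketch (the tensor shapes are arbitrary examples):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);      /* 4 KB   */
        ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);  /* 256 KB */

        printf("context size  : %zu bytes\n", ggml_get_mem_size(ctx));
        printf("largest tensor: %zu bytes\n", ggml_get_max_tensor_size(ctx));

        ggml_free(ctx);
        return 0;
    }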
@@ -4639,9 +4728,10 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
  return tensor->name;
  }

- void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
+ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
  strncpy(tensor->name, name, sizeof(tensor->name));
  tensor->name[sizeof(tensor->name) - 1] = '\0';
+ return tensor;
  }

  struct ggml_tensor * ggml_view_tensor(
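Because ggml_set_name now returns the tensor it was given, graph-building code can name an intermediate result in the same expression that creates it. A small sketch (ctx, w and x are assumed to come from the caller):

    #include "ggml.h"

    static struct ggml_tensor * build_logits(struct ggml_context * ctx,
                                             struct ggml_tensor * w,
                                             struct ggml_tensor * x) {
        /* ggml_set_name returns its argument, so the call can be chained */
        return ggml_set_name(ggml_mul_mat(ctx, w, x), "logits");
    }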
@@ -5420,6 +5510,40 @@ struct ggml_tensor * ggml_gelu_inplace(
  return ggml_gelu_impl(ctx, a, true);
  }

+ // ggml_gelu_quick
+
+ struct ggml_tensor * ggml_gelu_quick_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ bool inplace) {
+ bool is_node = false;
+
+ if (!inplace && (a->grad)) {
+ is_node = true;
+ }
+
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ result->op = GGML_OP_GELU_QUICK;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_gelu_quick(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_gelu_quick_impl(ctx, a, false);
+ }
+
+ struct ggml_tensor * ggml_gelu_quick_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a) {
+ return ggml_gelu_quick_impl(ctx, a, true);
+ }
+
  // ggml_silu

  struct ggml_tensor * ggml_silu_impl(
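The new ggml_gelu_quick / ggml_gelu_quick_inplace entry points mirror ggml_gelu: they record GGML_OP_GELU_QUICK on a duplicate (or view) of the input, and the backward pass is still a TODO further down in this diff. A hedged sketch of using it inside a feed-forward block (context and weight tensors assumed to be set up by the caller):

    #include "ggml.h"

    static struct ggml_tensor * ffn_gelu_quick(struct ggml_context * ctx,
                                               struct ggml_tensor * w_in,
                                               struct ggml_tensor * w_out,
                                               struct ggml_tensor * x) {
        struct ggml_tensor * cur = ggml_mul_mat(ctx, w_in, x);
        cur = ggml_gelu_quick(ctx, cur);   /* records GGML_OP_GELU_QUICK */
        return ggml_mul_mat(ctx, w_out, cur);
    }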
@@ -6619,7 +6743,7 @@ struct ggml_tensor * ggml_clamp(

  ggml_scratch_save(ctx);

- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);

  ((float *) b->data)[0] = min;
  ((float *) b->data)[1] = max;
@@ -6634,9 +6758,9 @@ struct ggml_tensor * ggml_clamp(
  return result;
  }

- // ggml_conv_1d_1s
+ // ggml_conv_1d_s1_ph

- struct ggml_tensor * ggml_conv_1d_1s(
+ struct ggml_tensor * ggml_conv_1d_s1_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b) {
@@ -6653,7 +6777,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
  const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

- result->op = GGML_OP_CONV_1D_1S;
+ result->op = GGML_OP_CONV_1D_S1_PH;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = b;
@@ -6661,9 +6785,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
  return result;
  }

- // ggml_conv_1d_2s
+ // ggml_conv_1d_s2_ph

- struct ggml_tensor * ggml_conv_1d_2s(
+ struct ggml_tensor * ggml_conv_1d_s2_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b) {
@@ -6680,7 +6804,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
  const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

- result->op = GGML_OP_CONV_1D_2S;
+ result->op = GGML_OP_CONV_1D_S2_PH;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = b;
+
+ return result;
+ }
+
+ // ggml_conv_2d_sk_p0
+
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b) {
+ GGML_ASSERT(b->ne[3] == 1);
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
+ GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
+ GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
+ bool is_node = false;
+
+ if (a->grad || b->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ result->op = GGML_OP_CONV_2D_SK_P0;
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
  result->src0 = a;
  result->src1 = b;
@@ -6814,6 +6966,89 @@ struct ggml_tensor * ggml_flash_attn_back(
  return result;
  }

+ // ggml_win_part
+
+ struct ggml_tensor * ggml_win_part(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int w) {
+ GGML_ASSERT(a->ne[3] == 1);
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ // padding
+ const int px = (w - a->ne[1]%w)%w;
+ const int py = (w - a->ne[2]%w)%w;
+
+ const int npx = (px + a->ne[1])/w;
+ const int npy = (py + a->ne[2])/w;
+ const int np = npx*npy;
+
+ const int64_t ne[4] = { a->ne[0], w, w, np, };
+
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+
+ ((int32_t *) b->data)[0] = npx;
+ ((int32_t *) b->data)[1] = npy;
+ ((int32_t *) b->data)[2] = w;
+
+ ggml_scratch_load(ctx);
+
+ result->op = GGML_OP_WIN_PART;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+ result->opt[0] = b;
+
+ return result;
+ }
+
+ // ggml_win_unpart
+
+ struct ggml_tensor * ggml_win_unpart(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int w0,
+ int h0,
+ int w) {
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
+
+ ((int32_t *) b->data)[0] = w;
+
+ ggml_scratch_load(ctx);
+
+ result->op = GGML_OP_WIN_UNPART;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src0 = a;
+ result->src1 = NULL;
+ result->opt[0] = b;
+
+ return result;
+ }

  // ggml_map_unary

@@ -7892,7 +8127,7 @@ static void ggml_compute_forward_add_q_f32(

  void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
  float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
- void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
+ void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));

  assert(ne00 % 32 == 0);

@@ -9453,8 +9688,65 @@ static void ggml_compute_forward_gelu(
  GGML_ASSERT(false);
  } break;
  }
+ }
+
+ // ggml_compute_forward_gelu_quick
+
+ static void ggml_compute_forward_gelu_quick_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(dst));
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nc = src0->ne[0];
+ const int nr = ggml_nrows(src0);

- //printf("XXXXXXXX gelu\n");
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int i1 = ir0; i1 < ir1; i1++) {
+ ggml_vec_gelu_quick_f32(nc,
+ (float *) ((char *) dst->data + i1*( dst->nb[1])),
+ (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+ #ifndef NDEBUG
+ for (int k = 0; k < nc; k++) {
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+ UNUSED(x);
+ assert(!isnan(x));
+ assert(!isinf(x));
+ }
+ #endif
+ }
+ }
+
+ static void ggml_compute_forward_gelu_quick(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_gelu_quick_f32(params, src0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
  }

  // ggml_compute_forward_silu
@@ -10852,7 +11144,7 @@ static void ggml_compute_forward_set_f32(
  const int im2 = (ne12 == 0 ? 0 : ne12-1);
  const int im3 = (ne13 == 0 ? 0 : ne13-1);

- GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst));
+ GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));

  GGML_ASSERT(nb10 == sizeof(float));

@@ -11573,8 +11865,9 @@ static void ggml_compute_forward_alibi_f32(
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  assert(params->ith == 0);
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_nelements(src1) == 3);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
@@ -11637,8 +11930,9 @@ static void ggml_compute_forward_alibi_f16(
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  assert(params->ith == 0);
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_nelements(src1) == 3);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
@@ -11740,15 +12034,16 @@ static void ggml_compute_forward_clamp_f32(
  const struct ggml_tensor * src1,
  struct ggml_tensor * dst) {
  assert(params->ith == 0);
- assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 2);
+
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_nelements(src1) == 2);

  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }

- const int min = ((float *) src1->data)[0];
- const int max = ((float *) src1->data)[1];
+ const float min = ((float *) src1->data)[0];
+ const float max = ((float *) src1->data)[1];

  const int ith = params->ith;
  const int nth = params->nth;
@@ -12306,9 +12601,9 @@ static void ggml_compute_forward_rope_back(
  }
  }

- // ggml_compute_forward_conv_1d_1s
+ // ggml_compute_forward_conv_1d_s1_ph

- static void ggml_compute_forward_conv_1d_1s_f16_f32(
+ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
@@ -12428,7 +12723,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
  }
  }

- static void ggml_compute_forward_conv_1d_1s_f32(
+ static void ggml_compute_forward_conv_1d_s1_ph_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
@@ -12548,7 +12843,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
  }
  }

- static void ggml_compute_forward_conv_1d_1s(
+ static void ggml_compute_forward_conv_1d_s1_ph(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
@@ -12556,11 +12851,11 @@ static void ggml_compute_forward_conv_1d_1s(
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
  } break;
  default:
  {
@@ -12569,9 +12864,9 @@ static void ggml_compute_forward_conv_1d_1s(
  }
  }

- // ggml_compute_forward_conv_1d_2s
+ // ggml_compute_forward_conv_1d_s2_ph

- static void ggml_compute_forward_conv_1d_2s_f16_f32(
+ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
@@ -12691,7 +12986,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
  }
  }

- static void ggml_compute_forward_conv_1d_2s_f32(
+ static void ggml_compute_forward_conv_1d_s2_ph_f32(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
@@ -12811,7 +13106,143 @@ static void ggml_compute_forward_conv_1d_2s_f32(
  }
  }

- static void ggml_compute_forward_conv_1d_2s(
+ static void ggml_compute_forward_conv_1d_s2_ph(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
+ } break;
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
+ // ggml_compute_forward_conv_2d_sk_p0
+
+ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * src1,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ int64_t t0 = ggml_perf_time_us();
+ UNUSED(t0);
+
+ const int ne00 = src0->ne[0];
+ const int ne01 = src0->ne[1];
+ const int ne02 = src0->ne[2];
+ //const int ne03 = src0->ne[3];
+
+ const int ne10 = src1->ne[0];
+ //const int ne11 = src1->ne[1];
+ const int ne12 = src1->ne[2];
+ //const int ne13 = src1->ne[3];
+
+ const int ne0 = dst->ne[0];
+ const int ne1 = dst->ne[1];
+ const int ne2 = dst->ne[2];
+ //const int ne3 = dst->ne[3];
+ //const int ne = ne0*ne1*ne2*ne3;
+
+ const int nb00 = src0->nb[0];
+ //const int nb01 = src0->nb[1];
+ //const int nb02 = src0->nb[2];
+ const int nb03 = src0->nb[3];
+
+ const int nb10 = src1->nb[0];
+ //const int nb11 = src1->nb[1];
+ const int nb12 = src1->nb[2];
+ //const int nb13 = src1->nb[3];
+
+ //const int nb0 = dst->nb[0];
+ //const int nb1 = dst->nb[1];
+ const int nb2 = dst->nb[2];
+ //const int nb3 = dst->nb[3];
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nk0 = ne00;
+ const int nk1 = ne01;
+
+ // size of the convolution row - the kernel size unrolled across all channels
+ // round-up so it is more suitable for SIMD
+ const int ew0 = ggml_up32(nk0*nk1*ne02);
+
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb10 == sizeof(float));
+
+ if (params->type == GGML_TASK_INIT) {
+ // TODO: fix this memset (wsize is overestimated)
+ memset(params->wdata, 0, params->wsize);
+
+ // prepare source data (src1)
+ {
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+ for (int i12 = 0; i12 < ne12; i12++) {
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
+ ggml_fp16_t * dst_data = wdata;
+
+ for (int i1 = 0; i1 < ne1; i1++) {
+ for (int i0 = 0; i0 < ne0; i0++) {
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
+ dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
+ GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return;
+ }
+
+ if (params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ // total patches in dst
+ const int np = ne2;
+
+ // patches per thread
+ const int dp = (np + nth - 1)/nth;
+
+ // patch range for this thread
+ const int ip0 = dp*ith;
+ const int ip1 = MIN(ip0 + dp, np);
+
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+
+ for (int i2 = ip0; i2 < ip1; i2++) {
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
+
+ for (int i1 = 0; i1 < ne1; ++i1) {
+ for (int i0 = 0; i0 < ne0; ++i0) {
+ ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
+ (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
+ (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
+ }
+ }
+ }
+ }
+
+ static void ggml_compute_forward_conv_2d_sk_p0(
  const struct ggml_compute_params * params,
  const struct ggml_tensor * src0,
  const struct ggml_tensor * src1,
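As the name suggests, ggml_compute_forward_conv_2d_sk_p0_f16_f32 above handles the stride == kernel size, padding 0 case: the INIT pass packs each output patch's receptive field (kernel width x height x input channels, rounded up to a multiple of 32 for SIMD) into a contiguous fp16 row of wdata, and the compute pass then produces each output element with a single ggml_vec_dot_f16 against the corresponding filter. A trivial sketch of the resulting output geometry (the 16x16 kernel over a 1024x1024 input is only an example):

    #include <stdio.h>

    int main(void) {
        const int KW = 16, KH = 16;       /* kernel extent (a->ne[0], a->ne[1]) */
        const int IW = 1024, IH = 1024;   /* input  extent (b->ne[0], b->ne[1]) */

        /* stride equals the kernel size and there is no padding,
           so the input extent must divide evenly by the kernel extent */
        printf("output patches: %d x %d\n", IW / KW, IH / KH);
        return 0;
    }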
@@ -12819,11 +13250,12 @@ static void ggml_compute_forward_conv_1d_2s(
  switch (src0->type) {
  case GGML_TYPE_F16:
  {
- ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst);
+ ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
  } break;
  case GGML_TYPE_F32:
  {
- ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst);
+ //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
+ GGML_ASSERT(false);
  } break;
  default:
  {
@@ -13926,6 +14358,145 @@ static void ggml_compute_forward_flash_attn_back(
  }
  }

+ // ggml_compute_forward_win_part
+
+ static void ggml_compute_forward_win_part_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
+
+ const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
+ const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
+ const int32_t w = ((const int32_t *)(opt0->data))[2];
+
+ assert(ne00 == ne0);
+ assert(ne3 == nep0*nep1);
+
+ // TODO: optimize / multi-thread
+ for (int py = 0; py < nep1; ++py) {
+ for (int px = 0; px < nep0; ++px) {
+ const int64_t i3 = py*nep0 + px;
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ const int64_t i02 = py*w + i2;
+ const int64_t i01 = px*w + i1;
+ const int64_t i00 = i0;
+
+ const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
+ const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
+
+ if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
+ ((float *) dst->data)[i] = 0.0f;
+ } else {
+ ((float *) dst->data)[i] = ((float *) src0->data)[j];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ static void ggml_compute_forward_win_part(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
+ // ggml_compute_forward_win_unpart
+
+ static void ggml_compute_forward_win_unpart_f32(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ //const int64_t ne03 = src0->ne[3];
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+
+ const int32_t w = ((const int32_t *)(opt0->data))[0];
+
+ // padding
+ const int px = (w - ne1%w)%w;
+ //const int py = (w - ne2%w)%w;
+
+ const int npx = (px + ne1)/w;
+ //const int npy = (py + ne2)/w;
+
+ assert(ne0 == ne00);
+
+ // TODO: optimize / multi-thread
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
+ const int ip2 = i2/w;
+ const int ip1 = i1/w;
+
+ const int64_t i02 = i2%w;
+ const int64_t i01 = i1%w;
+ const int64_t i00 = i0;
+
+ const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
+ const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
+
+ ((float *) dst->data)[j] = ((float *) src0->data)[i];
+ }
+ }
+ }
+ }
+
+ static void ggml_compute_forward_win_unpart(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
+ } break;
+ default:
+ {
+ GGML_ASSERT(false);
+ } break;
+ }
+ }
+
  // ggml_compute_forward_map_unary

  static void ggml_compute_forward_map_unary_f32(
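The win_part/win_unpart pair above implements window partitioning for windowed attention: ggml_win_part copies each w x w window of the ne[1] x ne[2] spatial extent into its own slice of a [ne0, w, w, npx*npy] tensor, zero-filling the right/bottom padding, and ggml_win_unpart scatters the windows back into a w0 x h0 extent. A small standalone check of the padding and window-count arithmetic used by both (the 64x64 extent with w = 14 is just an example):

    #include <stdio.h>

    int main(void) {
        const int W = 64, H = 64, w = 14;   /* example spatial extent and window size */

        const int px  = (w - W % w) % w;    /* right padding   */
        const int py  = (w - H % w) % w;    /* bottom padding  */
        const int npx = (px + W) / w;       /* windows along x */
        const int npy = (py + H) / w;       /* windows along y */

        /* ggml_win_part would return [ne0, w, w, npx*npy]; padded cells are zeroed */
        printf("padding %dx%d -> %d x %d = %d windows\n", px, py, npx, npy, npx*npy);
        return 0;
    }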
@@ -14398,6 +14969,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  {
  ggml_compute_forward_gelu(params, tensor->src0, tensor);
  } break;
+ case GGML_OP_GELU_QUICK:
+ {
+ ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
+ } break;
  case GGML_OP_SILU:
  {
  ggml_compute_forward_silu(params, tensor->src0, tensor);
@@ -14502,19 +15077,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  {
  ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
  } break;
- case GGML_OP_CONV_1D_1S:
+ case GGML_OP_CONV_1D_S1_PH:
+ {
+ ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
+ } break;
+ case GGML_OP_CONV_1D_S2_PH:
  {
- ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
+ ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
  } break;
- case GGML_OP_CONV_1D_2S:
+ case GGML_OP_CONV_2D_SK_P0:
  {
- ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor);
+ ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
  } break;
  case GGML_OP_FLASH_ATTN:
  {
- int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
+ const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
  GGML_ASSERT(t == 0 || t == 1);
- bool masked = t != 0;
+ const bool masked = t != 0;
  ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
  } break;
  case GGML_OP_FLASH_FF:
@@ -14528,6 +15107,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  bool masked = t != 0;
  ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
  } break;
+ case GGML_OP_WIN_PART:
+ {
+ ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
+ } break;
+ case GGML_OP_WIN_UNPART:
+ {
+ ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
+ } break;
  case GGML_OP_MAP_UNARY:
  {
  const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
@@ -14799,6 +15386,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  GGML_ASSERT(false); // TODO: not implemented
  } break;
+ case GGML_OP_GELU_QUICK:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
  case GGML_OP_ALIBI:
  {
  GGML_ASSERT(false); // TODO: not implemented
@@ -15161,11 +15752,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  // noop
  }
  } break;
- case GGML_OP_CONV_1D_1S:
+ case GGML_OP_CONV_1D_S1_PH:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
+ case GGML_OP_CONV_1D_S2_PH:
  {
  GGML_ASSERT(false); // TODO: not implemented
  } break;
- case GGML_OP_CONV_1D_2S:
+ case GGML_OP_CONV_2D_SK_P0:
  {
  GGML_ASSERT(false); // TODO: not implemented
  } break;
@@ -15334,6 +15929,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
  {
  GGML_ASSERT(false); // not supported
  } break;
+ case GGML_OP_WIN_PART:
+ case GGML_OP_WIN_UNPART:
  case GGML_OP_MAP_UNARY:
  case GGML_OP_MAP_BINARY:
  {
@@ -15742,6 +16339,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  } break;
  case GGML_OP_MUL:
  case GGML_OP_GELU:
+ case GGML_OP_GELU_QUICK:
  case GGML_OP_SILU:
  case GGML_OP_SILU_BACK:
  case GGML_OP_NORM:
@@ -15848,8 +16446,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  {
  node->n_tasks = 1; //TODO
  } break;
- case GGML_OP_CONV_1D_1S:
- case GGML_OP_CONV_1D_2S:
+ case GGML_OP_CONV_1D_S1_PH:
+ case GGML_OP_CONV_1D_S2_PH:
  {
  node->n_tasks = n_threads;

@@ -15876,6 +16474,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  GGML_ASSERT(false);
  }

+ work_size = MAX(work_size, cur);
+ } break;
+ case GGML_OP_CONV_2D_SK_P0:
+ {
+ node->n_tasks = n_threads;
+
+ GGML_ASSERT(node->src1->ne[3] == 1);
+
+ const int64_t ne00 = node->src0->ne[0]; // W
+ const int64_t ne01 = node->src0->ne[1]; // H
+ const int64_t ne02 = node->src0->ne[2]; // C
+ const int64_t ne03 = node->src0->ne[3]; // N
+
+ const int64_t ne10 = node->src1->ne[0]; // W
+ const int64_t ne11 = node->src1->ne[1]; // H
+ const int64_t ne12 = node->src1->ne[2]; // C
+
+ const int64_t nk = ne00*ne01;
+
+ UNUSED(ne02);
+ UNUSED(ne03);
+ UNUSED(nk);
+
+ size_t cur = 0;
+
+ if (node->src0->type == GGML_TYPE_F16 &&
+ node->src1->type == GGML_TYPE_F32) {
+ cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
+ } else if (node->src0->type == GGML_TYPE_F32 &&
+ node->src1->type == GGML_TYPE_F32) {
+ cur = sizeof(float)* (ne10*ne11*ne12);
+ } else {
+ GGML_ASSERT(false);
+ }
+
  work_size = MAX(work_size, cur);
  } break;
  case GGML_OP_FLASH_ATTN:
@@ -15937,6 +16570,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)

  work_size = MAX(work_size, cur);
  } break;
+ case GGML_OP_WIN_PART:
+ case GGML_OP_WIN_UNPART:
  case GGML_OP_MAP_UNARY:
  case GGML_OP_MAP_BINARY:
  {
@@ -16469,16 +17104,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **

  if (!*ctx_data) {
  fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+ fclose(fin);
  return result;
  }
  }

  data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);

- const size_t ret = fread(data->data, sizeof(char), fsize, fin);
- if (ret != fsize) {
- fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
- return result;
+ {
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
+ if (ret != fsize) {
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
+ fclose(fin);
+ return result;
+ }
  }

  fclose(fin);
@@ -17598,7 +18237,6 @@ GGML_API void ggml_opt_init(
  ggml_set_zero(opt->lbfgs.g);
  ggml_set_zero(opt->lbfgs.gp);
  ggml_set_zero(opt->lbfgs.d);
- ggml_set_zero(opt->lbfgs.pf);
  if (opt->lbfgs.pf) {
  ggml_set_zero(opt->lbfgs.pf);
  }