llama_cpp 0.2.1 → 0.2.2

@@ -35,6 +35,12 @@
35
35
  #define static_assert(cond, msg) struct global_scope_noop_trick
36
36
  #endif
37
37
 
38
+ #if defined(_MSC_VER)
39
+ // disable "possible loss of data" to avoid hundreds of casts
40
+ // we should just be careful :)
41
+ #pragma warning(disable: 4244 4267)
42
+ #endif
43
+
38
44
  #if defined(_WIN32)
39
45
 
40
46
  #include <windows.h>
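The hunk above silences MSVC warnings C4244 and C4267 ("conversion ..., possible loss of data") for the whole translation unit instead of adding explicit casts at hundreds of call sites. As a rough illustration (not part of the package), this is the kind of narrowing that triggers those warnings and that the pragma suppresses:

    /* Illustrative only: a size_t -> int narrowing like this raises C4267 on MSVC. */
    #include <stddef.h>

    #if defined(_MSC_VER)
    #pragma warning(disable: 4244 4267)   /* same switches the diff disables */
    #endif

    static int count_items(size_t n) {
        int count = n;   /* would warn without the pragma */
        return count;
    }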
@@ -106,6 +112,7 @@ typedef void* thread_ret_t;
106
112
  /*#define GGML_PERF*/
107
113
  #define GGML_DEBUG 0
108
114
  #define GGML_GELU_FP16
115
+ #define GGML_GELU_QUICK_FP16
109
116
  #define GGML_SILU_FP16
110
117
 
111
118
  #define GGML_SOFT_MAX_UNROLL 4
@@ -334,6 +341,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
334
341
  // precomputed gelu table for f16 (128 KB)
335
342
  static ggml_fp16_t table_gelu_f16[1 << 16];
336
343
 
344
+ // precomputed quick gelu table for f16 (128 KB)
345
+ static ggml_fp16_t table_gelu_quick_f16[1 << 16];
346
+
337
347
  // precomputed silu table for f16 (128 KB)
338
348
  static ggml_fp16_t table_silu_f16[1 << 16];
339
349
 
@@ -1671,14 +1681,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1671
1681
  #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
1672
1682
  #define GGML_F32x4_REDUCE(res, x) \
1673
1683
  { \
1674
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1675
- x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
1684
+ int offset = GGML_F32_ARR >> 1; \
1685
+ for (int i = 0; i < offset; ++i) { \
1686
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1676
1687
  } \
1677
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1678
- x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
1688
+ offset >>= 1; \
1689
+ for (int i = 0; i < offset; ++i) { \
1690
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1679
1691
  } \
1680
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1681
- x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
1692
+ offset >>= 1; \
1693
+ for (int i = 0; i < offset; ++i) { \
1694
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1682
1695
  } \
1683
1696
  res = GGML_F32x4_REDUCE_ONE(x[0]); \
1684
1697
  }
@@ -1709,14 +1722,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1709
1722
  #define GGML_F16x8_MUL vmulq_f16
1710
1723
  #define GGML_F16x8_REDUCE(res, x) \
1711
1724
  { \
1712
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
1713
- x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
1725
+ int offset = GGML_F16_ARR >> 1; \
1726
+ for (int i = 0; i < offset; ++i) { \
1727
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1714
1728
  } \
1715
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
1716
- x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
1729
+ offset >>= 1; \
1730
+ for (int i = 0; i < offset; ++i) { \
1731
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1717
1732
  } \
1718
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
1719
- x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
1733
+ offset >>= 1; \
1734
+ for (int i = 0; i < offset; ++i) { \
1735
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1720
1736
  } \
1721
1737
  const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
1722
1738
  const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1783,14 +1799,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1783
1799
  #define GGML_F32x8_MUL _mm256_mul_ps
1784
1800
  #define GGML_F32x8_REDUCE(res, x) \
1785
1801
  { \
1786
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1787
- x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
1802
+ int offset = GGML_F32_ARR >> 1; \
1803
+ for (int i = 0; i < offset; ++i) { \
1804
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1788
1805
  } \
1789
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1790
- x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
1806
+ offset >>= 1; \
1807
+ for (int i = 0; i < offset; ++i) { \
1808
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1791
1809
  } \
1792
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1793
- x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
1810
+ offset >>= 1; \
1811
+ for (int i = 0; i < offset; ++i) { \
1812
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1794
1813
  } \
1795
1814
  const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
1796
1815
  _mm256_extractf128_ps(x[0], 1)); \
@@ -1880,14 +1899,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1880
1899
  #define GGML_F32x4_MUL vec_mul
1881
1900
  #define GGML_F32x4_REDUCE(res, x) \
1882
1901
  { \
1883
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1884
- x[2*i] = vec_add(x[2*i], x[2*i+1]); \
1902
+ int offset = GGML_F32_ARR >> 1; \
1903
+ for (int i = 0; i < offset; ++i) { \
1904
+ x[i] = vec_add(x[i], x[offset+i]); \
1885
1905
  } \
1886
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1887
- x[4*i] = vec_add(x[4*i], x[4*i+2]); \
1906
+ offset >>= 1; \
1907
+ for (int i = 0; i < offset; ++i) { \
1908
+ x[i] = vec_add(x[i], x[offset+i]); \
1888
1909
  } \
1889
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1890
- x[8*i] = vec_add(x[8*i], x[8*i+4]); \
1910
+ offset >>= 1; \
1911
+ for (int i = 0; i < offset; ++i) { \
1912
+ x[i] = vec_add(x[i], x[offset+i]); \
1891
1913
  } \
1892
1914
  res = vec_extract(x[0], 0) + \
1893
1915
  vec_extract(x[0], 1) + \
@@ -1943,14 +1965,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1943
1965
  #define GGML_F32x4_MUL wasm_f32x4_mul
1944
1966
  #define GGML_F32x4_REDUCE(res, x) \
1945
1967
  { \
1946
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1947
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
1968
+ int offset = GGML_F32_ARR >> 1; \
1969
+ for (int i = 0; i < offset; ++i) { \
1970
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1948
1971
  } \
1949
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1950
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
1972
+ offset >>= 1; \
1973
+ for (int i = 0; i < offset; ++i) { \
1974
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1951
1975
  } \
1952
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1953
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
1976
+ offset >>= 1; \
1977
+ for (int i = 0; i < offset; ++i) { \
1978
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1954
1979
  } \
1955
1980
  res = wasm_f32x4_extract_lane(x[0], 0) + \
1956
1981
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2005,14 +2030,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
2005
2030
  #define GGML_F16x4_MUL wasm_f32x4_mul
2006
2031
  #define GGML_F16x4_REDUCE(res, x) \
2007
2032
  { \
2008
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
2009
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
2033
+ int offset = GGML_F16_ARR >> 1; \
2034
+ for (int i = 0; i < offset; ++i) { \
2035
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2010
2036
  } \
2011
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
2012
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
2037
+ offset >>= 1; \
2038
+ for (int i = 0; i < offset; ++i) { \
2039
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2013
2040
  } \
2014
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
2015
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
2041
+ offset >>= 1; \
2042
+ for (int i = 0; i < offset; ++i) { \
2043
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2016
2044
  } \
2017
2045
  res = wasm_f32x4_extract_lane(x[0], 0) + \
2018
2046
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2054,14 +2082,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
2054
2082
  #define GGML_F32x4_MUL _mm_mul_ps
2055
2083
  #define GGML_F32x4_REDUCE(res, x) \
2056
2084
  { \
2057
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
2058
- x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
2085
+ int offset = GGML_F32_ARR >> 1; \
2086
+ for (int i = 0; i < offset; ++i) { \
2087
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2059
2088
  } \
2060
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
2061
- x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
2089
+ offset >>= 1; \
2090
+ for (int i = 0; i < offset; ++i) { \
2091
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2062
2092
  } \
2063
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
2064
- x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
2093
+ offset >>= 1; \
2094
+ for (int i = 0; i < offset; ++i) { \
2095
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2065
2096
  } \
2066
2097
  const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
2067
2098
  res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
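All of the *_REDUCE hunks above (NEON F32/F16, AVX, POWER/VSX, WASM SIMD and SSE) apply the same rewrite: instead of summing strided pairs in place (x[2*i] += x[2*i+1], then x[4*i] += x[4*i+2], then x[8*i] += x[8*i+4]), each pass now folds the upper half of the accumulator array onto the lower half and halves the offset, so the total converges into x[0]. A scalar sketch of the new pattern (illustrative, not from the diff); the macros simply unroll three such passes:

    /* Scalar analogue of the rewritten GGML_F32x4_REDUCE loops.
       n is the number of accumulators (a power of two); the sum ends up in acc[0]. */
    static float tree_reduce(float * acc, int n) {
        for (int offset = n >> 1; offset > 0; offset >>= 1) {
            for (int i = 0; i < offset; ++i) {
                acc[i] += acc[offset + i];
            }
        }
        return acc[0];
    }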
@@ -3350,6 +3381,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
3350
3381
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
3351
3382
 
3352
3383
  static const float GELU_COEF_A = 0.044715f;
3384
+ static const float GELU_QUICK_COEF = -1.702f;
3353
3385
  static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
3354
3386
 
3355
3387
  inline static float ggml_gelu_f32(float x) {
@@ -3380,6 +3412,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
3380
3412
  }
3381
3413
  #endif
3382
3414
 
3415
+ inline static float ggml_gelu_quick_f32(float x) {
3416
+ return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
3417
+ }
3418
+
3419
+ //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
3420
+ // const uint16_t * i16 = (const uint16_t *) x;
3421
+ // for (int i = 0; i < n; ++i) {
3422
+ // y[i] = table_gelu_quick_f16[i16[i]];
3423
+ // }
3424
+ //}
3425
+
3426
+ #ifdef GGML_GELU_QUICK_FP16
3427
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
3428
+ uint16_t t;
3429
+ for (int i = 0; i < n; ++i) {
3430
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
3431
+ memcpy(&t, &fp16, sizeof(uint16_t));
3432
+ y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
3433
+ }
3434
+ }
3435
+ #else
3436
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
3437
+ for (int i = 0; i < n; ++i) {
3438
+ y[i] = ggml_gelu_quick_f32(x[i]);
3439
+ }
3440
+ }
3441
+ #endif
3442
+
3383
3443
  // Sigmoid Linear Unit (SiLU) function
3384
3444
  inline static float ggml_silu_f32(float x) {
3385
3445
  return x/(1.0f + expf(-x));
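The new op is "quick GELU", x * sigmoid(1.702 * x): GELU_QUICK_COEF is -1.702, so the expf argument works out to -1.702*x. When GGML_GELU_QUICK_FP16 is defined, ggml_vec_gelu_quick_f32 rounds each input through fp16 and looks the result up in table_gelu_quick_f16, the 65,536-entry (128 KB) table filled in ggml_init further down in this diff. A standalone sketch of the direct formula (illustrative only):

    #include <math.h>
    #include <stdio.h>

    /* Direct form of the new op: x * sigmoid(1.702 * x). */
    static float gelu_quick(float x) {
        return x / (1.0f + expf(-1.702f * x));
    }

    int main(void) {
        for (int i = -2; i <= 2; ++i) {
            printf("gelu_quick(%d) = %.6f\n", i, gelu_quick((float) i));
        }
        return 0;
    }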
@@ -3610,6 +3670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3610
3670
  "STEP",
3611
3671
  "RELU",
3612
3672
  "GELU",
3673
+ "GELU_QUICK",
3613
3674
  "SILU",
3614
3675
  "SILU_BACK",
3615
3676
  "NORM",
@@ -3638,12 +3699,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3638
3699
  "ROPE_BACK",
3639
3700
  "ALIBI",
3640
3701
  "CLAMP",
3641
- "CONV_1D_1S",
3642
- "CONV_1D_2S",
3702
+ "CONV_1D_S1_PH",
3703
+ "CONV_1D_S2_PH",
3704
+ "CONV_2D_SK_P0",
3643
3705
 
3644
3706
  "FLASH_ATTN",
3645
3707
  "FLASH_FF",
3646
3708
  "FLASH_ATTN_BACK",
3709
+ "WIN_PART",
3710
+ "WIN_UNPART",
3647
3711
 
3648
3712
  "MAP_UNARY",
3649
3713
  "MAP_BINARY",
@@ -3652,7 +3716,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3652
3716
  "CROSS_ENTROPY_LOSS_BACK",
3653
3717
  };
3654
3718
 
3655
- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
3719
+ static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3656
3720
 
3657
3721
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3658
3722
  "none",
@@ -3678,6 +3742,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3678
3742
  "step(x)",
3679
3743
  "relu(x)",
3680
3744
  "gelu(x)",
3745
+ "gelu_quick(x)",
3681
3746
  "silu(x)",
3682
3747
  "silu_back(x)",
3683
3748
  "norm(x)",
@@ -3706,12 +3771,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3706
3771
  "rope_back(x)",
3707
3772
  "alibi(x)",
3708
3773
  "clamp(x)",
3709
- "conv_1d_1s(x)",
3710
- "conv_1d_2s(x)",
3774
+ "conv_1d_s1_ph(x)",
3775
+ "conv_1d_s2_ph(x)",
3776
+ "conv_2d_sk_p0(x)",
3711
3777
 
3712
3778
  "flash_attn(x)",
3713
3779
  "flash_ff(x)",
3714
3780
  "flash_attn_back(x)",
3781
+ "win_part(x)",
3782
+ "win_unpart(x)",
3715
3783
 
3716
3784
  "f(x)",
3717
3785
  "f(x,y)",
@@ -3720,7 +3788,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3720
3788
  "cross_entropy_loss_back(x,y)",
3721
3789
  };
3722
3790
 
3723
- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
3791
+ static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3724
3792
 
3725
3793
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3726
3794
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4011,7 +4079,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4011
4079
  // initialize time system (required on Windows)
4012
4080
  ggml_time_init();
4013
4081
 
4014
- // initialize GELU, SILU and EXP F32 tables
4082
+ // initialize GELU, Quick GELU, SILU and EXP F32 tables
4015
4083
  {
4016
4084
  const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
4017
4085
 
@@ -4021,13 +4089,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4021
4089
  memcpy(&ii, &ui, sizeof(ii));
4022
4090
  const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
4023
4091
  table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
4092
+ table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
4024
4093
  table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
4025
4094
  table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
4026
4095
  }
4027
4096
 
4028
4097
  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
4029
4098
 
4030
- GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4099
+ GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4031
4100
  }
4032
4101
 
4033
4102
  // initialize g_state
@@ -4148,14 +4217,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4148
4217
  ctx->no_alloc = no_alloc;
4149
4218
  }
4150
4219
 
4151
- void * ggml_get_mem_buffer(struct ggml_context * ctx) {
4220
+ void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
4152
4221
  return ctx->mem_buffer;
4153
4222
  }
4154
4223
 
4155
- size_t ggml_get_mem_size(struct ggml_context * ctx) {
4224
+ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
4156
4225
  return ctx->mem_size;
4157
4226
  }
4158
4227
 
4228
+ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4229
+ size_t max_size = 0;
4230
+
4231
+ struct ggml_object * obj = ctx->objects_begin;
4232
+
4233
+ while (obj != NULL) {
4234
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4235
+
4236
+ const size_t size = ggml_nbytes(tensor);
4237
+
4238
+ if (max_size < size) {
4239
+ max_size = size;
4240
+ }
4241
+
4242
+ obj = obj->next;
4243
+ }
4244
+
4245
+ return max_size;
4246
+ }
4247
+
4159
4248
  // IMPORTANT:
4160
4249
  // when creating "opt" tensors, always save and load the scratch buffer
4161
4250
  // this is an error prone process, but it is necessary to support inplace
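ggml_get_mem_buffer and ggml_get_mem_size now take a const context, and the new ggml_get_max_tensor_size walks the context's object list and returns ggml_nbytes of the largest tensor it finds. A hedged usage sketch, assuming ctx is an already-initialized struct ggml_context * (the buffer and its purpose are illustrative, not from the diff):

    #include <stdlib.h>

    /* Size a temporary buffer so it can hold any single tensor from the context. */
    const size_t max_size = ggml_get_max_tensor_size(ctx);
    void * scratch = malloc(max_size);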
@@ -4639,9 +4728,10 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
4639
4728
  return tensor->name;
4640
4729
  }
4641
4730
 
4642
- void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4731
+ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4643
4732
  strncpy(tensor->name, name, sizeof(tensor->name));
4644
4733
  tensor->name[sizeof(tensor->name) - 1] = '\0';
4734
+ return tensor;
4645
4735
  }
4646
4736
 
4647
4737
  struct ggml_tensor * ggml_view_tensor(
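ggml_set_name now returns the tensor it was given, so naming can be chained with creation. A small usage sketch (the tensor shape and name are arbitrary):

    struct ggml_tensor * bias =
        ggml_set_name(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16), "bias");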
@@ -5420,6 +5510,40 @@ struct ggml_tensor * ggml_gelu_inplace(
5420
5510
  return ggml_gelu_impl(ctx, a, true);
5421
5511
  }
5422
5512
 
5513
+ // ggml_gelu_quick
5514
+
5515
+ struct ggml_tensor * ggml_gelu_quick_impl(
5516
+ struct ggml_context * ctx,
5517
+ struct ggml_tensor * a,
5518
+ bool inplace) {
5519
+ bool is_node = false;
5520
+
5521
+ if (!inplace && (a->grad)) {
5522
+ is_node = true;
5523
+ }
5524
+
5525
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5526
+
5527
+ result->op = GGML_OP_GELU_QUICK;
5528
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5529
+ result->src0 = a;
5530
+ result->src1 = NULL;
5531
+
5532
+ return result;
5533
+ }
5534
+
5535
+ struct ggml_tensor * ggml_gelu_quick(
5536
+ struct ggml_context * ctx,
5537
+ struct ggml_tensor * a) {
5538
+ return ggml_gelu_quick_impl(ctx, a, false);
5539
+ }
5540
+
5541
+ struct ggml_tensor * ggml_gelu_quick_inplace(
5542
+ struct ggml_context * ctx,
5543
+ struct ggml_tensor * a) {
5544
+ return ggml_gelu_quick_impl(ctx, a, true);
5545
+ }
5546
+
5423
5547
  // ggml_silu
5424
5548
 
5425
5549
  struct ggml_tensor * ggml_silu_impl(
@@ -6619,7 +6743,7 @@ struct ggml_tensor * ggml_clamp(
6619
6743
 
6620
6744
  ggml_scratch_save(ctx);
6621
6745
 
6622
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6746
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
6623
6747
 
6624
6748
  ((float *) b->data)[0] = min;
6625
6749
  ((float *) b->data)[1] = max;
@@ -6634,9 +6758,9 @@ struct ggml_tensor * ggml_clamp(
6634
6758
  return result;
6635
6759
  }
6636
6760
 
6637
- // ggml_conv_1d_1s
6761
+ // ggml_conv_1d_s1_ph
6638
6762
 
6639
- struct ggml_tensor * ggml_conv_1d_1s(
6763
+ struct ggml_tensor * ggml_conv_1d_s1_ph(
6640
6764
  struct ggml_context * ctx,
6641
6765
  struct ggml_tensor * a,
6642
6766
  struct ggml_tensor * b) {
@@ -6653,7 +6777,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
6653
6777
  const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
6654
6778
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6655
6779
 
6656
- result->op = GGML_OP_CONV_1D_1S;
6780
+ result->op = GGML_OP_CONV_1D_S1_PH;
6657
6781
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6658
6782
  result->src0 = a;
6659
6783
  result->src1 = b;
@@ -6661,9 +6785,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
6661
6785
  return result;
6662
6786
  }
6663
6787
 
6664
- // ggml_conv_1d_2s
6788
+ // ggml_conv_1d_s2_ph
6665
6789
 
6666
- struct ggml_tensor * ggml_conv_1d_2s(
6790
+ struct ggml_tensor * ggml_conv_1d_s2_ph(
6667
6791
  struct ggml_context * ctx,
6668
6792
  struct ggml_tensor * a,
6669
6793
  struct ggml_tensor * b) {
@@ -6680,7 +6804,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
6680
6804
  const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
6681
6805
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6682
6806
 
6683
- result->op = GGML_OP_CONV_1D_2S;
6807
+ result->op = GGML_OP_CONV_1D_S2_PH;
6808
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6809
+ result->src0 = a;
6810
+ result->src1 = b;
6811
+
6812
+ return result;
6813
+ }
6814
+
6815
+ // ggml_conv_2d_sk_p0
6816
+
6817
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
6818
+ struct ggml_context * ctx,
6819
+ struct ggml_tensor * a,
6820
+ struct ggml_tensor * b) {
6821
+ GGML_ASSERT(b->ne[3] == 1);
6822
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
6823
+ GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
6824
+ GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
6825
+ bool is_node = false;
6826
+
6827
+ if (a->grad || b->grad) {
6828
+ GGML_ASSERT(false); // TODO: implement backward
6829
+ is_node = true;
6830
+ }
6831
+
6832
+ const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
6833
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
6834
+
6835
+ result->op = GGML_OP_CONV_2D_SK_P0;
6684
6836
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6685
6837
  result->src0 = a;
6686
6838
  result->src1 = b;
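ggml_conv_2d_sk_p0 is a 2D convolution whose stride equals the kernel size with no padding (hence "sk_p0"): the asserts require the input's spatial dims to be exact multiples of the kernel's, and the result shape is [b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1]. A shape sketch with made-up ViT-style patch-embedding numbers:

    const int64_t kW = 16, kH = 16, C_out = 768;   /* kernel a: [16, 16, C_in, 768] */
    const int64_t W = 224, H = 224;                /* input  b: [224, 224, C_in, 1] */
    const int64_t out_w = W / kW;                  /* 14                            */
    const int64_t out_h = H / kH;                  /* 14                            */
    /* result: [out_w, out_h, C_out, 1] = [14, 14, 768, 1]
       i.e. one 768-dim vector per non-overlapping 16x16 patch. */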
@@ -6814,6 +6966,89 @@ struct ggml_tensor * ggml_flash_attn_back(
6814
6966
  return result;
6815
6967
  }
6816
6968
 
6969
+ // ggml_win_part
6970
+
6971
+ struct ggml_tensor * ggml_win_part(
6972
+ struct ggml_context * ctx,
6973
+ struct ggml_tensor * a,
6974
+ int w) {
6975
+ GGML_ASSERT(a->ne[3] == 1);
6976
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
6977
+
6978
+ bool is_node = false;
6979
+
6980
+ if (a->grad) {
6981
+ GGML_ASSERT(false); // TODO: implement backward
6982
+ is_node = true;
6983
+ }
6984
+
6985
+ // padding
6986
+ const int px = (w - a->ne[1]%w)%w;
6987
+ const int py = (w - a->ne[2]%w)%w;
6988
+
6989
+ const int npx = (px + a->ne[1])/w;
6990
+ const int npy = (py + a->ne[2])/w;
6991
+ const int np = npx*npy;
6992
+
6993
+ const int64_t ne[4] = { a->ne[0], w, w, np, };
6994
+
6995
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
6996
+
6997
+ ggml_scratch_save(ctx);
6998
+
6999
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7000
+
7001
+ ((int32_t *) b->data)[0] = npx;
7002
+ ((int32_t *) b->data)[1] = npy;
7003
+ ((int32_t *) b->data)[2] = w;
7004
+
7005
+ ggml_scratch_load(ctx);
7006
+
7007
+ result->op = GGML_OP_WIN_PART;
7008
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7009
+ result->src0 = a;
7010
+ result->src1 = NULL;
7011
+ result->opt[0] = b;
7012
+
7013
+ return result;
7014
+ }
7015
+
7016
+ // ggml_win_unpart
7017
+
7018
+ struct ggml_tensor * ggml_win_unpart(
7019
+ struct ggml_context * ctx,
7020
+ struct ggml_tensor * a,
7021
+ int w0,
7022
+ int h0,
7023
+ int w) {
7024
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
7025
+
7026
+ bool is_node = false;
7027
+
7028
+ if (a->grad) {
7029
+ GGML_ASSERT(false); // TODO: implement backward
7030
+ is_node = true;
7031
+ }
7032
+
7033
+ const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7034
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7035
+
7036
+ ggml_scratch_save(ctx);
7037
+
7038
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7039
+
7040
+ ((int32_t *) b->data)[0] = w;
7041
+
7042
+ ggml_scratch_load(ctx);
7043
+
7044
+ result->op = GGML_OP_WIN_UNPART;
7045
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7046
+ result->src0 = a;
7047
+ result->src1 = NULL;
7048
+ result->opt[0] = b;
7049
+
7050
+ return result;
7051
+ }
6817
7052
 
6818
7053
  // ggml_map_unary
6819
7054
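ggml_win_part splits a [C, W, H, 1] tensor into non-overlapping w x w windows, zero-padding the right and bottom edges so both spatial dims divide evenly, and ggml_win_unpart reverses the split given the original w0 x h0 size; the window parameters travel in the opt[0] tensor. The padding and window-count arithmetic, with example numbers:

    const int w = 14, W = 64, H = 64;    /* window size and spatial dims */
    const int px  = (w - W % w) % w;     /* 6 columns of zero padding    */
    const int py  = (w - H % w) % w;     /* 6 rows of zero padding       */
    const int npx = (W + px) / w;        /* 5 windows across             */
    const int npy = (H + py) / w;        /* 5 windows down               */
    const int np  = npx * npy;           /* 25 windows of 14 x 14        */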
 
@@ -7892,7 +8127,7 @@ static void ggml_compute_forward_add_q_f32(
7892
8127
 
7893
8128
  void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
7894
8129
  float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
7895
- void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
8130
+ void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
7896
8131
 
7897
8132
  assert(ne00 % 32 == 0);
7898
8133
 
@@ -9453,8 +9688,65 @@ static void ggml_compute_forward_gelu(
9453
9688
  GGML_ASSERT(false);
9454
9689
  } break;
9455
9690
  }
9691
+ }
9692
+
9693
+ // ggml_compute_forward_gelu_quick
9694
+
9695
+ static void ggml_compute_forward_gelu_quick_f32(
9696
+ const struct ggml_compute_params * params,
9697
+ const struct ggml_tensor * src0,
9698
+ struct ggml_tensor * dst) {
9699
+ GGML_ASSERT(ggml_is_contiguous(src0));
9700
+ GGML_ASSERT(ggml_is_contiguous(dst));
9701
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
9702
+
9703
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9704
+ return;
9705
+ }
9706
+
9707
+ const int ith = params->ith;
9708
+ const int nth = params->nth;
9709
+
9710
+ const int nc = src0->ne[0];
9711
+ const int nr = ggml_nrows(src0);
9456
9712
 
9457
- //printf("XXXXXXXX gelu\n");
9713
+ // rows per thread
9714
+ const int dr = (nr + nth - 1)/nth;
9715
+
9716
+ // row range for this thread
9717
+ const int ir0 = dr*ith;
9718
+ const int ir1 = MIN(ir0 + dr, nr);
9719
+
9720
+ for (int i1 = ir0; i1 < ir1; i1++) {
9721
+ ggml_vec_gelu_quick_f32(nc,
9722
+ (float *) ((char *) dst->data + i1*( dst->nb[1])),
9723
+ (float *) ((char *) src0->data + i1*(src0->nb[1])));
9724
+
9725
+ #ifndef NDEBUG
9726
+ for (int k = 0; k < nc; k++) {
9727
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
9728
+ UNUSED(x);
9729
+ assert(!isnan(x));
9730
+ assert(!isinf(x));
9731
+ }
9732
+ #endif
9733
+ }
9734
+ }
9735
+
9736
+ static void ggml_compute_forward_gelu_quick(
9737
+ const struct ggml_compute_params * params,
9738
+ const struct ggml_tensor * src0,
9739
+ struct ggml_tensor * dst) {
9740
+ switch (src0->type) {
9741
+ case GGML_TYPE_F32:
9742
+ {
9743
+ ggml_compute_forward_gelu_quick_f32(params, src0, dst);
9744
+ } break;
9745
+ default:
9746
+ {
9747
+ GGML_ASSERT(false);
9748
+ } break;
9749
+ }
9458
9750
  }
9459
9751
 
9460
9752
  // ggml_compute_forward_silu
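ggml_compute_forward_gelu_quick_f32 uses the same threading pattern as the other element-wise ops: rows are split into contiguous chunks of dr = (nr + nth - 1)/nth per thread, with the last chunk clamped to nr. Worked numbers for the partition (illustrative):

    const int nr = 10, nth = 4;               /* 10 rows over 4 threads        */
    const int dr = (nr + nth - 1)/nth;        /* 3 rows per thread, rounded up */
    for (int ith = 0; ith < nth; ++ith) {
        const int ir0 = dr*ith;                          /* 0, 3, 6, 9         */
        const int ir1 = ir0 + dr < nr ? ir0 + dr : nr;   /* 3, 6, 9, 10        */
        /* thread ith handles rows [ir0, ir1) */
    }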
@@ -10852,7 +11144,7 @@ static void ggml_compute_forward_set_f32(
10852
11144
  const int im2 = (ne12 == 0 ? 0 : ne12-1);
10853
11145
  const int im3 = (ne13 == 0 ? 0 : ne13-1);
10854
11146
 
10855
- GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst));
11147
+ GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
10856
11148
 
10857
11149
  GGML_ASSERT(nb10 == sizeof(float));
10858
11150
 
@@ -11573,8 +11865,9 @@ static void ggml_compute_forward_alibi_f32(
11573
11865
  const struct ggml_tensor * src1,
11574
11866
  struct ggml_tensor * dst) {
11575
11867
  assert(params->ith == 0);
11576
- assert(src1->type == GGML_TYPE_I32);
11577
- assert(ggml_nelements(src1) == 3);
11868
+
11869
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
11870
+ GGML_ASSERT(ggml_nelements(src1) == 3);
11578
11871
 
11579
11872
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11580
11873
  return;
@@ -11637,8 +11930,9 @@ static void ggml_compute_forward_alibi_f16(
11637
11930
  const struct ggml_tensor * src1,
11638
11931
  struct ggml_tensor * dst) {
11639
11932
  assert(params->ith == 0);
11640
- assert(src1->type == GGML_TYPE_I32);
11641
- assert(ggml_nelements(src1) == 3);
11933
+
11934
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
11935
+ GGML_ASSERT(ggml_nelements(src1) == 3);
11642
11936
 
11643
11937
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11644
11938
  return;
@@ -11740,15 +12034,16 @@ static void ggml_compute_forward_clamp_f32(
11740
12034
  const struct ggml_tensor * src1,
11741
12035
  struct ggml_tensor * dst) {
11742
12036
  assert(params->ith == 0);
11743
- assert(src1->type == GGML_TYPE_I32);
11744
- assert(ggml_nelements(src1) == 2);
12037
+
12038
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
12039
+ GGML_ASSERT(ggml_nelements(src1) == 2);
11745
12040
 
11746
12041
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11747
12042
  return;
11748
12043
  }
11749
12044
 
11750
- const int min = ((float *) src1->data)[0];
11751
- const int max = ((float *) src1->data)[1];
12045
+ const float min = ((float *) src1->data)[0];
12046
+ const float max = ((float *) src1->data)[1];
11752
12047
 
11753
12048
  const int ith = params->ith;
11754
12049
  const int nth = params->nth;
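This hunk fixes ggml_clamp: the bounds tensor is now created as GGML_TYPE_F32 with 2 elements (it was GGML_TYPE_I32 with 3), the matching checks become GGML_ASSERT so they also fire in release builds, and min/max are read back as float instead of int. Before the fix, fractional bounds were truncated on the compute side; a usage sketch (assuming the usual ggml_clamp(ctx, a, min, max) signature):

    /* With the fix the fractional bounds survive; previously they were read
       into ints, so -0.5f and 0.5f both collapsed to 0. */
    struct ggml_tensor * y = ggml_clamp(ctx, x, -0.5f, 0.5f);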
@@ -12306,9 +12601,9 @@ static void ggml_compute_forward_rope_back(
12306
12601
  }
12307
12602
  }
12308
12603
 
12309
- // ggml_compute_forward_conv_1d_1s
12604
+ // ggml_compute_forward_conv_1d_s1_ph
12310
12605
 
12311
- static void ggml_compute_forward_conv_1d_1s_f16_f32(
12606
+ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
12312
12607
  const struct ggml_compute_params * params,
12313
12608
  const struct ggml_tensor * src0,
12314
12609
  const struct ggml_tensor * src1,
@@ -12428,7 +12723,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
12428
12723
  }
12429
12724
  }
12430
12725
 
12431
- static void ggml_compute_forward_conv_1d_1s_f32(
12726
+ static void ggml_compute_forward_conv_1d_s1_ph_f32(
12432
12727
  const struct ggml_compute_params * params,
12433
12728
  const struct ggml_tensor * src0,
12434
12729
  const struct ggml_tensor * src1,
@@ -12548,7 +12843,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
12548
12843
  }
12549
12844
  }
12550
12845
 
12551
- static void ggml_compute_forward_conv_1d_1s(
12846
+ static void ggml_compute_forward_conv_1d_s1_ph(
12552
12847
  const struct ggml_compute_params * params,
12553
12848
  const struct ggml_tensor * src0,
12554
12849
  const struct ggml_tensor * src1,
@@ -12556,11 +12851,11 @@ static void ggml_compute_forward_conv_1d_1s(
12556
12851
  switch (src0->type) {
12557
12852
  case GGML_TYPE_F16:
12558
12853
  {
12559
- ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst);
12854
+ ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
12560
12855
  } break;
12561
12856
  case GGML_TYPE_F32:
12562
12857
  {
12563
- ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst);
12858
+ ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
12564
12859
  } break;
12565
12860
  default:
12566
12861
  {
@@ -12569,9 +12864,9 @@ static void ggml_compute_forward_conv_1d_1s(
12569
12864
  }
12570
12865
  }
12571
12866
 
12572
- // ggml_compute_forward_conv_1d_2s
12867
+ // ggml_compute_forward_conv_1d_s2_ph
12573
12868
 
12574
- static void ggml_compute_forward_conv_1d_2s_f16_f32(
12869
+ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
12575
12870
  const struct ggml_compute_params * params,
12576
12871
  const struct ggml_tensor * src0,
12577
12872
  const struct ggml_tensor * src1,
@@ -12691,7 +12986,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
12691
12986
  }
12692
12987
  }
12693
12988
 
12694
- static void ggml_compute_forward_conv_1d_2s_f32(
12989
+ static void ggml_compute_forward_conv_1d_s2_ph_f32(
12695
12990
  const struct ggml_compute_params * params,
12696
12991
  const struct ggml_tensor * src0,
12697
12992
  const struct ggml_tensor * src1,
@@ -12811,7 +13106,143 @@ static void ggml_compute_forward_conv_1d_2s_f32(
12811
13106
  }
12812
13107
  }
12813
13108
 
12814
- static void ggml_compute_forward_conv_1d_2s(
13109
+ static void ggml_compute_forward_conv_1d_s2_ph(
13110
+ const struct ggml_compute_params * params,
13111
+ const struct ggml_tensor * src0,
13112
+ const struct ggml_tensor * src1,
13113
+ struct ggml_tensor * dst) {
13114
+ switch (src0->type) {
13115
+ case GGML_TYPE_F16:
13116
+ {
13117
+ ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
13118
+ } break;
13119
+ case GGML_TYPE_F32:
13120
+ {
13121
+ ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
13122
+ } break;
13123
+ default:
13124
+ {
13125
+ GGML_ASSERT(false);
13126
+ } break;
13127
+ }
13128
+ }
13129
+
13130
+ // ggml_compute_forward_conv_2d_sk_p0
13131
+
13132
+ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13133
+ const struct ggml_compute_params * params,
13134
+ const struct ggml_tensor * src0,
13135
+ const struct ggml_tensor * src1,
13136
+ struct ggml_tensor * dst) {
13137
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
13138
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
13139
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
13140
+
13141
+ int64_t t0 = ggml_perf_time_us();
13142
+ UNUSED(t0);
13143
+
13144
+ const int ne00 = src0->ne[0];
13145
+ const int ne01 = src0->ne[1];
13146
+ const int ne02 = src0->ne[2];
13147
+ //const int ne03 = src0->ne[3];
13148
+
13149
+ const int ne10 = src1->ne[0];
13150
+ //const int ne11 = src1->ne[1];
13151
+ const int ne12 = src1->ne[2];
13152
+ //const int ne13 = src1->ne[3];
13153
+
13154
+ const int ne0 = dst->ne[0];
13155
+ const int ne1 = dst->ne[1];
13156
+ const int ne2 = dst->ne[2];
13157
+ //const int ne3 = dst->ne[3];
13158
+ //const int ne = ne0*ne1*ne2*ne3;
13159
+
13160
+ const int nb00 = src0->nb[0];
13161
+ //const int nb01 = src0->nb[1];
13162
+ //const int nb02 = src0->nb[2];
13163
+ const int nb03 = src0->nb[3];
13164
+
13165
+ const int nb10 = src1->nb[0];
13166
+ //const int nb11 = src1->nb[1];
13167
+ const int nb12 = src1->nb[2];
13168
+ //const int nb13 = src1->nb[3];
13169
+
13170
+ //const int nb0 = dst->nb[0];
13171
+ //const int nb1 = dst->nb[1];
13172
+ const int nb2 = dst->nb[2];
13173
+ //const int nb3 = dst->nb[3];
13174
+
13175
+ const int ith = params->ith;
13176
+ const int nth = params->nth;
13177
+
13178
+ const int nk0 = ne00;
13179
+ const int nk1 = ne01;
13180
+
13181
+ // size of the convolution row - the kernel size unrolled across all channels
13182
+ // round-up so it is more suitable for SIMD
13183
+ const int ew0 = ggml_up32(nk0*nk1*ne02);
13184
+
13185
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13186
+ GGML_ASSERT(nb10 == sizeof(float));
13187
+
13188
+ if (params->type == GGML_TASK_INIT) {
13189
+ // TODO: fix this memset (wsize is overestimated)
13190
+ memset(params->wdata, 0, params->wsize);
13191
+
13192
+ // prepare source data (src1)
13193
+ {
13194
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13195
+
13196
+ for (int i12 = 0; i12 < ne12; i12++) {
13197
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
13198
+ ggml_fp16_t * dst_data = wdata;
13199
+
13200
+ for (int i1 = 0; i1 < ne1; i1++) {
13201
+ for (int i0 = 0; i0 < ne0; i0++) {
13202
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
13203
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
13204
+ dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
13205
+ GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
13206
+ }
13207
+ }
13208
+ }
13209
+ }
13210
+ }
13211
+ }
13212
+
13213
+ return;
13214
+ }
13215
+
13216
+ if (params->type == GGML_TASK_FINALIZE) {
13217
+ return;
13218
+ }
13219
+
13220
+ // total patches in dst
13221
+ const int np = ne2;
13222
+
13223
+ // patches per thread
13224
+ const int dp = (np + nth - 1)/nth;
13225
+
13226
+ // patch range for this thread
13227
+ const int ip0 = dp*ith;
13228
+ const int ip1 = MIN(ip0 + dp, np);
13229
+
13230
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13231
+
13232
+ for (int i2 = ip0; i2 < ip1; i2++) {
13233
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
13234
+
13235
+ for (int i1 = 0; i1 < ne1; ++i1) {
13236
+ for (int i0 = 0; i0 < ne0; ++i0) {
13237
+ ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
13238
+ (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
13239
+ (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
13240
+ }
13241
+ }
13242
+ }
13243
+ }
13244
+
13245
+ static void ggml_compute_forward_conv_2d_sk_p0(
12815
13246
  const struct ggml_compute_params * params,
12816
13247
  const struct ggml_tensor * src0,
12817
13248
  const struct ggml_tensor * src1,
@@ -12819,11 +13250,12 @@ static void ggml_compute_forward_conv_1d_2s(
12819
13250
  switch (src0->type) {
12820
13251
  case GGML_TYPE_F16:
12821
13252
  {
12822
- ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst);
13253
+ ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
12823
13254
  } break;
12824
13255
  case GGML_TYPE_F32:
12825
13256
  {
12826
- ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst);
13257
+ //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
13258
+ GGML_ASSERT(false);
12827
13259
  } break;
12828
13260
  default:
12829
13261
  {
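ggml_compute_forward_conv_2d_sk_p0_f16_f32 runs in two phases: during GGML_TASK_INIT it packs, for every output position, the full kW x kH x C_in receptive field into one contiguous fp16 row of width ew0 = ggml_up32(nk0*nk1*ne02) inside wdata; in the main phase each thread takes a contiguous range of output channels and reduces every output element to a single ggml_vec_dot_f16 over that row. A scalar sketch of the packing step (names mirror the diff, but this is illustrative only; the real code converts to fp16):

    /* Pack non-overlapping kW x kH patches of a [W, H, C_in] image into rows of
       length ew0, one row per output position, so the convolution proper becomes
       one dot product per (output position, output channel). */
    static void pack_patches(const float * src, float * rows,
                             int W, int H, int C_in, int kW, int kH, int ew0) {
        const int outW = W/kW, outH = H/kH;
        for (int i1 = 0; i1 < outH; ++i1)
        for (int i0 = 0; i0 < outW; ++i0)
        for (int c  = 0; c  < C_in; ++c)
        for (int ky = 0; ky < kH;  ++ky)
        for (int kx = 0; kx < kW;  ++kx)
            rows[(i1*outW + i0)*ew0 + c*(kW*kH) + ky*kW + kx] =
                src[(c*H + i1*kH + ky)*W + (i0*kW + kx)];
    }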
@@ -13926,6 +14358,145 @@ static void ggml_compute_forward_flash_attn_back(
13926
14358
  }
13927
14359
  }
13928
14360
 
14361
+ // ggml_compute_forward_win_part
14362
+
14363
+ static void ggml_compute_forward_win_part_f32(
14364
+ const struct ggml_compute_params * params,
14365
+ const struct ggml_tensor * src0,
14366
+ const struct ggml_tensor * opt0,
14367
+ struct ggml_tensor * dst) {
14368
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14369
+ return;
14370
+ }
14371
+
14372
+ const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
14373
+ const int64_t ne01 = src0->ne[1];
14374
+ const int64_t ne02 = src0->ne[2];
14375
+ const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
14376
+
14377
+ const int64_t ne0 = dst->ne[0];
14378
+ const int64_t ne1 = dst->ne[1];
14379
+ const int64_t ne2 = dst->ne[2];
14380
+ const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
14381
+
14382
+ const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14383
+ const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14384
+ const int32_t w = ((const int32_t *)(opt0->data))[2];
14385
+
14386
+ assert(ne00 == ne0);
14387
+ assert(ne3 == nep0*nep1);
14388
+
14389
+ // TODO: optimize / multi-thread
14390
+ for (int py = 0; py < nep1; ++py) {
14391
+ for (int px = 0; px < nep0; ++px) {
14392
+ const int64_t i3 = py*nep0 + px;
14393
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14394
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14395
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14396
+ const int64_t i02 = py*w + i2;
14397
+ const int64_t i01 = px*w + i1;
14398
+ const int64_t i00 = i0;
14399
+
14400
+ const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
14401
+ const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
14402
+
14403
+ if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
14404
+ ((float *) dst->data)[i] = 0.0f;
14405
+ } else {
14406
+ ((float *) dst->data)[i] = ((float *) src0->data)[j];
14407
+ }
14408
+ }
14409
+ }
14410
+ }
14411
+ }
14412
+ }
14413
+ }
14414
+
14415
+ static void ggml_compute_forward_win_part(
14416
+ const struct ggml_compute_params * params,
14417
+ const struct ggml_tensor * src0,
14418
+ const struct ggml_tensor * opt0,
14419
+ struct ggml_tensor * dst) {
14420
+ switch (src0->type) {
14421
+ case GGML_TYPE_F32:
14422
+ {
14423
+ ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
14424
+ } break;
14425
+ default:
14426
+ {
14427
+ GGML_ASSERT(false);
14428
+ } break;
14429
+ }
14430
+ }
14431
+
14432
+ // ggml_compute_forward_win_unpart
14433
+
14434
+ static void ggml_compute_forward_win_unpart_f32(
14435
+ const struct ggml_compute_params * params,
14436
+ const struct ggml_tensor * src0,
14437
+ const struct ggml_tensor * opt0,
14438
+ struct ggml_tensor * dst) {
14439
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14440
+ return;
14441
+ }
14442
+
14443
+ const int64_t ne00 = src0->ne[0];
14444
+ const int64_t ne01 = src0->ne[1];
14445
+ const int64_t ne02 = src0->ne[2];
14446
+ //const int64_t ne03 = src0->ne[3];
14447
+
14448
+ const int64_t ne0 = dst->ne[0];
14449
+ const int64_t ne1 = dst->ne[1];
14450
+ const int64_t ne2 = dst->ne[2];
14451
+
14452
+ const int32_t w = ((const int32_t *)(opt0->data))[0];
14453
+
14454
+ // padding
14455
+ const int px = (w - ne1%w)%w;
14456
+ //const int py = (w - ne2%w)%w;
14457
+
14458
+ const int npx = (px + ne1)/w;
14459
+ //const int npy = (py + ne2)/w;
14460
+
14461
+ assert(ne0 == ne00);
14462
+
14463
+ // TODO: optimize / multi-thread
14464
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14465
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14466
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14467
+ const int ip2 = i2/w;
14468
+ const int ip1 = i1/w;
14469
+
14470
+ const int64_t i02 = i2%w;
14471
+ const int64_t i01 = i1%w;
14472
+ const int64_t i00 = i0;
14473
+
14474
+ const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
14475
+ const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
14476
+
14477
+ ((float *) dst->data)[j] = ((float *) src0->data)[i];
14478
+ }
14479
+ }
14480
+ }
14481
+ }
14482
+
14483
+ static void ggml_compute_forward_win_unpart(
14484
+ const struct ggml_compute_params * params,
14485
+ const struct ggml_tensor * src0,
14486
+ const struct ggml_tensor * opt0,
14487
+ struct ggml_tensor * dst) {
14488
+ switch (src0->type) {
14489
+ case GGML_TYPE_F32:
14490
+ {
14491
+ ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14492
+ } break;
14493
+ default:
14494
+ {
14495
+ GGML_ASSERT(false);
14496
+ } break;
14497
+ }
14498
+ }
14499
+
13929
14500
  // ggml_compute_forward_map_unary
13930
14501
 
13931
14502
  static void ggml_compute_forward_map_unary_f32(
@@ -14398,6 +14969,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14398
14969
  {
14399
14970
  ggml_compute_forward_gelu(params, tensor->src0, tensor);
14400
14971
  } break;
14972
+ case GGML_OP_GELU_QUICK:
14973
+ {
14974
+ ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
14975
+ } break;
14401
14976
  case GGML_OP_SILU:
14402
14977
  {
14403
14978
  ggml_compute_forward_silu(params, tensor->src0, tensor);
@@ -14502,19 +15077,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14502
15077
  {
14503
15078
  ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
14504
15079
  } break;
14505
- case GGML_OP_CONV_1D_1S:
15080
+ case GGML_OP_CONV_1D_S1_PH:
15081
+ {
15082
+ ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
15083
+ } break;
15084
+ case GGML_OP_CONV_1D_S2_PH:
14506
15085
  {
14507
- ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
15086
+ ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
14508
15087
  } break;
14509
- case GGML_OP_CONV_1D_2S:
15088
+ case GGML_OP_CONV_2D_SK_P0:
14510
15089
  {
14511
- ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor);
15090
+ ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
14512
15091
  } break;
14513
15092
  case GGML_OP_FLASH_ATTN:
14514
15093
  {
14515
- int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
15094
+ const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
14516
15095
  GGML_ASSERT(t == 0 || t == 1);
14517
- bool masked = t != 0;
15096
+ const bool masked = t != 0;
14518
15097
  ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
14519
15098
  } break;
14520
15099
  case GGML_OP_FLASH_FF:
@@ -14528,6 +15107,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14528
15107
  bool masked = t != 0;
14529
15108
  ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
14530
15109
  } break;
15110
+ case GGML_OP_WIN_PART:
15111
+ {
15112
+ ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
15113
+ } break;
15114
+ case GGML_OP_WIN_UNPART:
15115
+ {
15116
+ ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
15117
+ } break;
14531
15118
  case GGML_OP_MAP_UNARY:
14532
15119
  {
14533
15120
  const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
@@ -14799,6 +15386,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14799
15386
  {
14800
15387
  GGML_ASSERT(false); // TODO: not implemented
14801
15388
  } break;
15389
+ case GGML_OP_GELU_QUICK:
15390
+ {
15391
+ GGML_ASSERT(false); // TODO: not implemented
15392
+ } break;
14802
15393
  case GGML_OP_ALIBI:
14803
15394
  {
14804
15395
  GGML_ASSERT(false); // TODO: not implemented
@@ -15161,11 +15752,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15161
15752
  // noop
15162
15753
  }
15163
15754
  } break;
15164
- case GGML_OP_CONV_1D_1S:
15755
+ case GGML_OP_CONV_1D_S1_PH:
15756
+ {
15757
+ GGML_ASSERT(false); // TODO: not implemented
15758
+ } break;
15759
+ case GGML_OP_CONV_1D_S2_PH:
15165
15760
  {
15166
15761
  GGML_ASSERT(false); // TODO: not implemented
15167
15762
  } break;
15168
- case GGML_OP_CONV_1D_2S:
15763
+ case GGML_OP_CONV_2D_SK_P0:
15169
15764
  {
15170
15765
  GGML_ASSERT(false); // TODO: not implemented
15171
15766
  } break;
@@ -15334,6 +15929,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15334
15929
  {
15335
15930
  GGML_ASSERT(false); // not supported
15336
15931
  } break;
15932
+ case GGML_OP_WIN_PART:
15933
+ case GGML_OP_WIN_UNPART:
15337
15934
  case GGML_OP_MAP_UNARY:
15338
15935
  case GGML_OP_MAP_BINARY:
15339
15936
  {
@@ -15742,6 +16339,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15742
16339
  } break;
15743
16340
  case GGML_OP_MUL:
15744
16341
  case GGML_OP_GELU:
16342
+ case GGML_OP_GELU_QUICK:
15745
16343
  case GGML_OP_SILU:
15746
16344
  case GGML_OP_SILU_BACK:
15747
16345
  case GGML_OP_NORM:
@@ -15848,8 +16446,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15848
16446
  {
15849
16447
  node->n_tasks = 1; //TODO
15850
16448
  } break;
15851
- case GGML_OP_CONV_1D_1S:
15852
- case GGML_OP_CONV_1D_2S:
16449
+ case GGML_OP_CONV_1D_S1_PH:
16450
+ case GGML_OP_CONV_1D_S2_PH:
15853
16451
  {
15854
16452
  node->n_tasks = n_threads;
15855
16453
 
@@ -15876,6 +16474,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15876
16474
  GGML_ASSERT(false);
15877
16475
  }
15878
16476
 
16477
+ work_size = MAX(work_size, cur);
16478
+ } break;
16479
+ case GGML_OP_CONV_2D_SK_P0:
16480
+ {
16481
+ node->n_tasks = n_threads;
16482
+
16483
+ GGML_ASSERT(node->src1->ne[3] == 1);
16484
+
16485
+ const int64_t ne00 = node->src0->ne[0]; // W
16486
+ const int64_t ne01 = node->src0->ne[1]; // H
16487
+ const int64_t ne02 = node->src0->ne[2]; // C
16488
+ const int64_t ne03 = node->src0->ne[3]; // N
16489
+
16490
+ const int64_t ne10 = node->src1->ne[0]; // W
16491
+ const int64_t ne11 = node->src1->ne[1]; // H
16492
+ const int64_t ne12 = node->src1->ne[2]; // C
16493
+
16494
+ const int64_t nk = ne00*ne01;
16495
+
16496
+ UNUSED(ne02);
16497
+ UNUSED(ne03);
16498
+ UNUSED(nk);
16499
+
16500
+ size_t cur = 0;
16501
+
16502
+ if (node->src0->type == GGML_TYPE_F16 &&
16503
+ node->src1->type == GGML_TYPE_F32) {
16504
+ cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
16505
+ } else if (node->src0->type == GGML_TYPE_F32 &&
16506
+ node->src1->type == GGML_TYPE_F32) {
16507
+ cur = sizeof(float)* (ne10*ne11*ne12);
16508
+ } else {
16509
+ GGML_ASSERT(false);
16510
+ }
16511
+
15879
16512
  work_size = MAX(work_size, cur);
15880
16513
  } break;
15881
16514
  case GGML_OP_FLASH_ATTN:
@@ -15937,6 +16570,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15937
16570
 
15938
16571
  work_size = MAX(work_size, cur);
15939
16572
  } break;
16573
+ case GGML_OP_WIN_PART:
16574
+ case GGML_OP_WIN_UNPART:
15940
16575
  case GGML_OP_MAP_UNARY:
15941
16576
  case GGML_OP_MAP_BINARY:
15942
16577
  {
@@ -16469,16 +17104,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
16469
17104
 
16470
17105
  if (!*ctx_data) {
16471
17106
  fprintf(stderr, "%s: failed to create ggml context\n", __func__);
17107
+ fclose(fin);
16472
17108
  return result;
16473
17109
  }
16474
17110
  }
16475
17111
 
16476
17112
  data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
16477
17113
 
16478
- const size_t ret = fread(data->data, sizeof(char), fsize, fin);
16479
- if (ret != fsize) {
16480
- fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
16481
- return result;
17114
+ {
17115
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
17116
+ if (ret != fsize) {
17117
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
17118
+ fclose(fin);
17119
+ return result;
17120
+ }
16482
17121
  }
16483
17122
 
16484
17123
  fclose(fin);
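The ggml_graph_import changes close fin on the two early-return paths (context-creation failure and a short fread) instead of leaking the handle; only the successful path reached the final fclose before. The same resource-handling pattern in a self-contained sketch (illustrative, not from the package):

    #include <stdio.h>

    static long file_size(const char * fname) {
        FILE * fin = fopen(fname, "rb");
        if (!fin) {
            return -1;
        }
        if (fseek(fin, 0, SEEK_END) != 0) {
            fclose(fin);              /* early exit still releases the handle */
            return -1;
        }
        const long size = ftell(fin);
        fclose(fin);
        return size;
    }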
@@ -17598,7 +18237,6 @@ GGML_API void ggml_opt_init(
17598
18237
  ggml_set_zero(opt->lbfgs.g);
17599
18238
  ggml_set_zero(opt->lbfgs.gp);
17600
18239
  ggml_set_zero(opt->lbfgs.d);
17601
- ggml_set_zero(opt->lbfgs.pf);
17602
18240
  if (opt->lbfgs.pf) {
17603
18241
  ggml_set_zero(opt->lbfgs.pf);
17604
18242
  }
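The final hunk drops an unconditional ggml_set_zero(opt->lbfgs.pf) that ran right before the existing NULL check; opt->lbfgs.pf is only allocated when the L-BFGS past parameter is positive (an assumption based on how ggml allocates it), so the unconditional call could hit a NULL tensor. Only the guarded form, already present in the context lines above, remains:

    if (opt->lbfgs.pf) {
        ggml_set_zero(opt->lbfgs.pf);
    }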