llama_cpp 0.9.1 → 0.9.3

This diff shows the changes between publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
@@ -100,6 +100,49 @@ typedef void * thread_ret_t;
100
100
  #include <hbwmalloc.h>
101
101
  #endif
102
102
 
103
+ #if defined(__APPLE__)
104
+ #include <TargetConditionals.h>
105
+ #endif
106
+
107
+ #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
108
+ (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
109
+
110
+ #include <sys/wait.h>
111
+
112
+ void ggml_print_backtrace(void) {
113
+ /*
114
+ #include <execinfo.h>
115
+ #include <dlfcn.h>
116
+
117
+ void * trace[100];
118
+
119
+ int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
120
+
121
+ backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
122
+ */
123
+
124
+ // backtrace_symbols does not show line numbers, use gdb instead
125
+ char attach[32];
126
+ snprintf(attach, sizeof(attach), "attach %d", getpid());
127
+ int pid = fork();
128
+ if (pid == 0) {
129
+ execlp("gdb", "gdb", "--batch",
130
+ "-ex", "set style enabled on",
131
+ "-ex", attach,
132
+ "-ex", "bt -frame-info source-and-location",
133
+ "-ex", "detach",
134
+ "-ex", "quit",
135
+ NULL);
136
+ } else {
137
+ waitpid(pid, NULL, 0);
138
+ }
139
+ }
140
+ #else
141
+ void ggml_print_backtrace(void) {
142
+ // platform not supported
143
+ }
144
+ #endif
145
+
103
146
  /*#define GGML_PERF*/
104
147
  #define GGML_DEBUG 0
105
148
  #define GGML_GELU_FP16
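Note: the new ggml_print_backtrace deliberately avoids backtrace_symbols (which gives no line numbers) and instead forks a child that attaches gdb to the parent process and prints a source-annotated backtrace. A minimal standalone sketch of the same pattern, outside ggml, might look like the following; it assumes a POSIX system with gdb on PATH and ptrace permissions that allow a child to attach to its parent.

    // Minimal sketch of the gdb-attach backtrace pattern used by the new ggml_print_backtrace.
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static void print_backtrace_via_gdb(void) {
        char attach[32];
        snprintf(attach, sizeof(attach), "attach %d", getpid()); // built before fork, so it holds the parent's pid

        pid_t pid = fork();
        if (pid == 0) {
            // child: attach gdb to the parent and dump a source-annotated backtrace
            execlp("gdb", "gdb", "--batch",
                "-ex", attach,
                "-ex", "bt -frame-info source-and-location",
                "-ex", "detach",
                "-ex", "quit",
                (char *) NULL);
            _exit(127); // execlp failed (e.g. gdb not installed)
        } else if (pid > 0) {
            // parent: wait for gdb to finish before continuing
            waitpid(pid, NULL, 0);
        }
    }

    int main(void) {
        print_backtrace_via_gdb();
        return 0;
    }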
@@ -228,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
228
271
  // floating point type used to accumulate sums
229
272
  typedef double ggml_float;
230
273
 
274
+ #undef MIN
275
+ #undef MAX
276
+
277
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
278
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
279
+
231
280
  //
232
281
  // global data
233
282
  //
@@ -561,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
561
610
  // simd mappings
562
611
  //
563
612
 
613
+ #if defined(__ARM_NEON)
614
+ #if !defined(__aarch64__)
615
+
616
+ // 64-bit compatibility
617
+
618
+ inline static float vaddvq_f32(float32x4_t v) {
619
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
620
+ }
621
+
622
+ #endif
623
+ #endif
624
+
564
625
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
565
626
  // we then implement the fundamental computation operations below using only these macros
566
627
  // adding support for new architectures requires to define the corresponding SIMD macros
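Note: on 32-bit ARM NEON builds, the AArch64-only vaddvq_f32 intrinsic is emulated by summing the four lanes explicitly, as added above. A small guarded sketch checking the fallback against a plain scalar sum (the test values are arbitrary):

    // Compiles only where NEON is available but AArch64 is not; on AArch64 the real intrinsic exists.
    #if defined(__ARM_NEON) && !defined(__aarch64__)
    #include <arm_neon.h>
    #include <assert.h>

    static inline float my_vaddvq_f32(float32x4_t v) {
        return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) +
               vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
    }

    static void check_horizontal_add(void) {
        const float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        float32x4_t v = vld1q_f32(x);
        float scalar = x[0] + x[1] + x[2] + x[3];
        assert(my_vaddvq_f32(v) == scalar); // exact here: small integer values
    }
    #endif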
@@ -1352,6 +1413,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1352
1413
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1353
1414
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1354
1415
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1416
+ inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
1355
1417
 
1356
1418
  static const float GELU_COEF_A = 0.044715f;
1357
1419
  static const float GELU_QUICK_COEF = -1.702f;
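Note: ggml_vec_leaky_f32 is a leaky ReLU with the negative slope hard-coded to 0.1; the slope is not an op parameter. A plain-C illustration of the element-wise behaviour (standalone, not the ggml kernel itself):

    // Element-wise leaky ReLU as implemented by ggml_vec_leaky_f32 (slope fixed at 0.1).
    #include <stdio.h>

    static void leaky_f32(int n, float * y, const float * x) {
        for (int i = 0; i < n; ++i) {
            y[i] = (x[i] > 0.0f) ? x[i] : 0.1f * x[i];
        }
    }

    int main(void) {
        const float x[4] = { -2.0f, -0.5f, 0.0f, 3.0f };
        float y[4];
        leaky_f32(4, y, x);
        for (int i = 0; i < 4; ++i) {
            printf("%g -> %g\n", x[i], y[i]); // -2 -> -0.2, -0.5 -> -0.05, 0 -> 0, 3 -> 3
        }
        return 0;
    }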
@@ -1572,13 +1634,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1572
1634
  "ROPE_BACK",
1573
1635
  "ALIBI",
1574
1636
  "CLAMP",
1575
- "CONV_1D",
1576
- "CONV_1D_STAGE_0",
1577
- "CONV_1D_STAGE_1",
1578
1637
  "CONV_TRANSPOSE_1D",
1579
- "CONV_2D",
1580
- "CONV_2D_STAGE_0",
1581
- "CONV_2D_STAGE_1",
1638
+ "IM2COL",
1582
1639
  "CONV_TRANSPOSE_2D",
1583
1640
  "POOL_1D",
1584
1641
  "POOL_2D",
@@ -1609,7 +1666,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1609
1666
  "CROSS_ENTROPY_LOSS_BACK",
1610
1667
  };
1611
1668
 
1612
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1669
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1613
1670
 
1614
1671
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1615
1672
  "none",
@@ -1659,13 +1716,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1659
1716
  "rope_back(x)",
1660
1717
  "alibi(x)",
1661
1718
  "clamp(x)",
1662
- "conv_1d(x)",
1663
- "conv_1d_stage_0(x)",
1664
- "conv_1d_stage_1(x)",
1665
1719
  "conv_transpose_1d(x)",
1666
- "conv_2d(x)",
1667
- "conv_2d_stage_0(x)",
1668
- "conv_2d_stage_1(x)",
1720
+ "im2col(x)",
1669
1721
  "conv_transpose_2d(x)",
1670
1722
  "pool_1d(x)",
1671
1723
  "pool_2d(x)",
@@ -1696,7 +1748,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1696
1748
  "cross_entropy_loss_back(x,y)",
1697
1749
  };
1698
1750
 
1699
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1751
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1700
1752
 
1701
1753
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1702
1754
 
@@ -1724,13 +1776,7 @@ static void ggml_setup_op_has_task_pass(void) {
1724
1776
  p[GGML_OP_GET_ROWS_BACK ] = true;
1725
1777
  p[GGML_OP_DIAG_MASK_INF ] = true;
1726
1778
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
1727
- p[GGML_OP_CONV_1D ] = true;
1728
- p[GGML_OP_CONV_1D_STAGE_0 ] = true;
1729
- p[GGML_OP_CONV_1D_STAGE_1 ] = true;
1730
1779
  p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
1731
- p[GGML_OP_CONV_2D ] = true;
1732
- p[GGML_OP_CONV_2D_STAGE_0 ] = true;
1733
- p[GGML_OP_CONV_2D_STAGE_1 ] = true;
1734
1780
  p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
1735
1781
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
1736
1782
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
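Note: the two static_asserts drop from 73 to 68 ops because six convolution ops are removed and one is added: 73 - (CONV_1D + CONV_1D_STAGE_0 + CONV_1D_STAGE_1 + CONV_2D + CONV_2D_STAGE_0 + CONV_2D_STAGE_1) + IM2COL = 73 - 6 + 1 = 68, matching the entries removed from GGML_OP_NAME, GGML_OP_SYMBOL, and the task-pass table above.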
@@ -3769,6 +3815,14 @@ struct ggml_tensor * ggml_relu_inplace(
3769
3815
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3770
3816
  }
3771
3817
 
3818
+ // ggml_leaky
3819
+
3820
+ struct ggml_tensor * ggml_leaky(
3821
+ struct ggml_context * ctx,
3822
+ struct ggml_tensor * a) {
3823
+ return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3824
+ }
3825
+
3772
3826
  // ggml_gelu
3773
3827
 
3774
3828
  struct ggml_tensor * ggml_gelu(
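Note: ggml_leaky is routed through ggml_unary with GGML_UNARY_OP_LEAKY, so it is built like any other unary node. A minimal sketch of creating the node, assuming the vendored ggml.h of this version (the context size is arbitrary; graph build/compute is omitted):

    // Sketch only: create a leaky-ReLU node on a small F32 tensor.
    #include "ggml.h"

    void build_leaky_node(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024, // assumption: enough scratch for this toy example
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * y = ggml_leaky(ctx, x); // unary op, negative slope fixed at 0.1

        (void) y; // evaluate with the usual ggml graph build/compute helpers

        ggml_free(ctx);
    }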
@@ -4970,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
4970
5024
  int n_dims,
4971
5025
  int mode,
4972
5026
  int n_ctx,
5027
+ int n_orig_ctx,
4973
5028
  float freq_base,
4974
5029
  float freq_scale,
5030
+ float ext_factor,
5031
+ float attn_factor,
5032
+ float beta_fast,
5033
+ float beta_slow,
4975
5034
  float xpos_base,
4976
5035
  bool xpos_down) {
4977
5036
  GGML_ASSERT(ggml_is_vector(b));
@@ -4988,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
4988
5047
 
4989
5048
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
4990
5049
 
4991
- int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
4992
- memcpy(params + 4, &freq_base, sizeof(float));
4993
- memcpy(params + 5, &freq_scale, sizeof(float));
4994
- memcpy(params + 6, &xpos_base, sizeof(float));
4995
- memcpy(params + 7, &xpos_down, sizeof(bool));
5050
+ int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
5051
+ memcpy(params + 5, &freq_base, sizeof(float));
5052
+ memcpy(params + 6, &freq_scale, sizeof(float));
5053
+ memcpy(params + 7, &ext_factor, sizeof(float));
5054
+ memcpy(params + 8, &attn_factor, sizeof(float));
5055
+ memcpy(params + 9, &beta_fast, sizeof(float));
5056
+ memcpy(params + 10, &beta_slow, sizeof(float));
5057
+ memcpy(params + 11, &xpos_base, sizeof(float));
5058
+ memcpy(params + 12, &xpos_down, sizeof(bool));
4996
5059
  ggml_set_op_params(result, params, sizeof(params));
4997
5060
 
4998
5061
  result->op = GGML_OP_ROPE_BACK;
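Note: ggml_rope_back now packs 13 int32 slots into op_params: five integer fields followed by seven floats and a bool, each copied into its own 4-byte slot with memcpy. A tiny standalone sketch of that pack/unpack pattern (not the ggml helpers themselves):

    // Packing a float into an int32 parameter slot via memcpy, as ggml's op_params does.
    #include <stdint.h>
    #include <string.h>
    #include <assert.h>

    int main(void) {
        int32_t params[13] = { 0 };
        float freq_base = 10000.0f;
        memcpy(params + 5, &freq_base, sizeof(float)); // pack (slot 5, as in ggml_rope_back)

        float out;
        memcpy(&out, params + 5, sizeof(float));       // unpack
        assert(out == freq_base);
        return 0;
    }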
@@ -5067,82 +5130,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
5067
5130
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
5068
5131
  }
5069
5132
 
5070
- // im2col: [N, IC, IL] => [N, OL, IC*K]
5071
- // a: [OC,IC, K]
5072
- // b: [N, IC, IL]
5073
- // result: [N, OL, IC*K]
5074
- static struct ggml_tensor * ggml_conv_1d_stage_0(
5075
- struct ggml_context * ctx,
5076
- struct ggml_tensor * a,
5077
- struct ggml_tensor * b,
5078
- int s0,
5079
- int p0,
5080
- int d0) {
5081
- GGML_ASSERT(a->ne[1] == b->ne[1]);
5082
- bool is_node = false;
5083
-
5084
- if (a->grad || b->grad) {
5085
- GGML_ASSERT(false); // TODO: implement backward
5086
- is_node = true;
5087
- }
5088
-
5089
- const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5090
-
5091
- const int64_t ne[4] = {
5092
- a->ne[1] * a->ne[0],
5093
- OL,
5094
- b->ne[2],
5095
- 1,
5096
- };
5097
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5098
-
5099
- int32_t params[] = { s0, p0, d0 };
5100
- ggml_set_op_params(result, params, sizeof(params));
5101
-
5102
- result->op = GGML_OP_CONV_1D_STAGE_0;
5103
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5104
- result->src[0] = a;
5105
- result->src[1] = b;
5106
-
5107
- return result;
5108
- }
5109
-
5110
- // ggml_conv_1d_stage_1
5111
-
5112
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
5113
- // a: [OC, IC, K]
5114
- // b: [N, OL, IC * K]
5115
- // result: [N, OC, OL]
5116
- static struct ggml_tensor * ggml_conv_1d_stage_1(
5117
- struct ggml_context * ctx,
5118
- struct ggml_tensor * a,
5119
- struct ggml_tensor * b) {
5120
-
5121
- bool is_node = false;
5122
-
5123
- if (a->grad || b->grad) {
5124
- GGML_ASSERT(false); // TODO: implement backward
5125
- is_node = true;
5126
- }
5127
-
5128
- const int64_t ne[4] = {
5129
- b->ne[1],
5130
- a->ne[2],
5131
- b->ne[2],
5132
- 1,
5133
- };
5134
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5135
-
5136
- result->op = GGML_OP_CONV_1D_STAGE_1;
5137
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5138
- result->src[0] = a;
5139
- result->src[1] = b;
5140
-
5141
- return result;
5142
- }
5143
-
5144
- // ggml_conv_1d
5145
-
5146
5133
  GGML_API struct ggml_tensor * ggml_conv_1d(
5147
5134
  struct ggml_context * ctx,
5148
5135
  struct ggml_tensor * a,
@@ -5150,43 +5137,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5150
5137
  int s0,
5151
5138
  int p0,
5152
5139
  int d0) {
5153
- struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
5154
- result = ggml_conv_1d_stage_1(ctx, a, result);
5155
- return result;
5156
- }
5140
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5157
5141
 
5158
- // GGML_API struct ggml_tensor * ggml_conv_1d(
5159
- // struct ggml_context * ctx,
5160
- // struct ggml_tensor * a,
5161
- // struct ggml_tensor * b,
5162
- // int s0,
5163
- // int p0,
5164
- // int d0) {
5165
- // GGML_ASSERT(ggml_is_matrix(b));
5166
- // GGML_ASSERT(a->ne[1] == b->ne[1]);
5167
- // bool is_node = false;
5142
+ struct ggml_tensor * result =
5143
+ ggml_mul_mat(ctx,
5144
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
5145
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
5168
5146
 
5169
- // if (a->grad || b->grad) {
5170
- // GGML_ASSERT(false); // TODO: implement backward
5171
- // is_node = true;
5172
- // }
5147
+ result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
5173
5148
 
5174
- // const int64_t ne[4] = {
5175
- // ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
5176
- // a->ne[2], 1, 1,
5177
- // };
5178
- // struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5179
-
5180
- // int32_t params[] = { s0, p0, d0 };
5181
- // ggml_set_op_params(result, params, sizeof(params));
5182
-
5183
- // result->op = GGML_OP_CONV_1D;
5184
- // result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5185
- // result->src[0] = a;
5186
- // result->src[1] = b;
5187
-
5188
- // return result;
5189
- // }
5149
+ return result;
5150
+ }
5190
5151
 
5191
5152
  // ggml_conv_1d_ph
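Note: ggml_conv_1d is now expressed as ggml_im2col followed by a plain matrix multiplication: im2col turns the signal [N, IC, IL] into [N, OL, IC*K], which is flattened to an [N*OL, IC*K] matrix, multiplied against the kernel reshaped to [OC, IC*K], and reshaped back to [N, OC, OL]. A small self-contained reference of the same decomposition in plain C for a single batch (sizes and the identity-style kernel are illustrative only):

    // Reference 1-D convolution via explicit im2col + GEMM (single batch, zero padding).
    #include <stdio.h>

    // Same formula as ggml_calc_conv_output_size.
    static int conv_out_size(int il, int k, int s, int p, int d) {
        return (il + 2*p - d*(k - 1) - 1)/s + 1;
    }

    #define IC 2   // input channels
    #define OC 3   // output channels
    #define IL 5   // input length
    #define K  3   // kernel size

    int main(void) {
        const int s = 1, p = 1, d = 1;
        const int ol = conv_out_size(IL, K, s, p, d);    // 5 here

        float x[IC][IL]    = {{1,2,3,4,5},{5,4,3,2,1}};
        float w[OC][IC][K] = {0};
        w[0][0][1] = 1.0f;                               // identity-style kernel for output channel 0

        // im2col: [OL, IC*K], out-of-range taps are zero (the padding)
        float cols[IL + 2][IC*K];
        for (int o = 0; o < ol; ++o) {
            for (int ic = 0; ic < IC; ++ic) {
                for (int ik = 0; ik < K; ++ik) {
                    int i = o*s + ik*d - p;
                    cols[o][ic*K + ik] = (i < 0 || i >= IL) ? 0.0f : x[ic][i];
                }
            }
        }

        // GEMM: out[oc][o] = sum_k w_flat[oc][k] * cols[o][k]  => [OC, OL]
        float out[OC][IL + 2] = {0};
        for (int oc = 0; oc < OC; ++oc) {
            for (int o = 0; o < ol; ++o) {
                for (int k = 0; k < IC*K; ++k) {
                    out[oc][o] += ((float *) w[oc])[k] * cols[o][k];
                }
            }
        }

        for (int o = 0; o < ol; ++o) {
            printf("%g ", out[0][o]);                    // prints 1 2 3 4 5 with the identity-style kernel
        }
        printf("\n");
        return 0;
    }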
5192
5153
 
@@ -5249,7 +5210,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
5249
5210
  // a: [OC,IC, KH, KW]
5250
5211
  // b: [N, IC, IH, IW]
5251
5212
  // result: [N, OH, OW, IC*KH*KW]
5252
- static struct ggml_tensor * ggml_conv_2d_stage_0(
5213
+ struct ggml_tensor * ggml_im2col(
5253
5214
  struct ggml_context * ctx,
5254
5215
  struct ggml_tensor * a,
5255
5216
  struct ggml_tensor * b,
@@ -5258,9 +5219,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5258
5219
  int p0,
5259
5220
  int p1,
5260
5221
  int d0,
5261
- int d1) {
5222
+ int d1,
5223
+ bool is_2D) {
5262
5224
 
5263
- GGML_ASSERT(a->ne[2] == b->ne[2]);
5225
+ if(is_2D) {
5226
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
5227
+ } else {
5228
+ GGML_ASSERT(a->ne[1] == b->ne[1]);
5229
+ }
5264
5230
  bool is_node = false;
5265
5231
 
5266
5232
  if (a->grad || b->grad) {
@@ -5268,81 +5234,51 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5268
5234
  is_node = true;
5269
5235
  }
5270
5236
 
5271
- const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
5272
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5237
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
5238
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5273
5239
 
5274
5240
  const int64_t ne[4] = {
5275
- a->ne[2] * a->ne[1] * a->ne[0],
5241
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
5276
5242
  OW,
5277
- OH,
5278
- b->ne[3],
5243
+ is_2D ? OH : b->ne[2],
5244
+ is_2D ? b->ne[3] : 1,
5279
5245
  };
5280
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5281
5246
 
5282
- int32_t params[] = { s0, s1, p0, p1, d0, d1 };
5247
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5248
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5283
5249
  ggml_set_op_params(result, params, sizeof(params));
5284
5250
 
5285
- result->op = GGML_OP_CONV_2D_STAGE_0;
5286
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5287
- result->src[0] = a;
5288
- result->src[1] = b;
5289
-
5290
- return result;
5291
-
5292
- }
5293
-
5294
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
5295
- // a: [OC, IC, KH, KW]
5296
- // b: [N, OH, OW, IC * KH * KW]
5297
- // result: [N, OC, OH, OW]
5298
- static struct ggml_tensor * ggml_conv_2d_stage_1(
5299
- struct ggml_context * ctx,
5300
- struct ggml_tensor * a,
5301
- struct ggml_tensor * b) {
5302
-
5303
- bool is_node = false;
5304
-
5305
- if (a->grad || b->grad) {
5306
- GGML_ASSERT(false); // TODO: implement backward
5307
- is_node = true;
5308
- }
5309
-
5310
- const int64_t ne[4] = {
5311
- b->ne[1],
5312
- b->ne[2],
5313
- a->ne[3],
5314
- b->ne[3],
5315
- };
5316
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5317
-
5318
- result->op = GGML_OP_CONV_2D_STAGE_1;
5251
+ result->op = GGML_OP_IM2COL;
5319
5252
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5320
5253
  result->src[0] = a;
5321
5254
  result->src[1] = b;
5322
5255
 
5323
5256
  return result;
5324
-
5325
5257
  }
5326
5258
 
5327
5259
  // a: [OC,IC, KH, KW]
5328
5260
  // b: [N, IC, IH, IW]
5329
5261
  // result: [N, OC, OH, OW]
5330
5262
  struct ggml_tensor * ggml_conv_2d(
5331
- struct ggml_context * ctx,
5332
- struct ggml_tensor * a,
5333
- struct ggml_tensor * b,
5334
- int s0,
5335
- int s1,
5336
- int p0,
5337
- int p1,
5338
- int d0,
5339
- int d1) {
5263
+ struct ggml_context * ctx,
5264
+ struct ggml_tensor * a,
5265
+ struct ggml_tensor * b,
5266
+ int s0,
5267
+ int s1,
5268
+ int p0,
5269
+ int p1,
5270
+ int d0,
5271
+ int d1) {
5272
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5340
5273
 
5341
- struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
5342
- result = ggml_conv_2d_stage_1(ctx, a, result);
5274
+ struct ggml_tensor * result =
5275
+ ggml_mul_mat(ctx,
5276
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
5277
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
5343
5278
 
5344
- return result;
5279
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
5345
5280
 
5281
+ return result;
5346
5282
  }
5347
5283
 
5348
5284
  // ggml_conv_2d_sk_p0
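Note: with the generalized ggml_im2col, ggml_conv_2d follows the same recipe as the 1-D case. As a concrete shape walk (numbers are illustrative): kernel a = [OC=8, IC=3, KH=3, KW=3], image b = [N=1, IC=3, IH=5, IW=5], s0=s1=1, p0=p1=1, d0=d1=1 gives OH = OW = (5 + 2*1 - 1*(3-1) - 1)/1 + 1 = 5; im2col produces [N=1, OH=5, OW=5, IC*KH*KW=27], which is reshaped to a 25x27 matrix, multiplied against the kernel reshaped to 8x27, and the [25, 8] result is reshaped back to [N=1, OC=8, OH=5, OW=5].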
@@ -5402,7 +5338,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
5402
5338
 
5403
5339
  // ggml_pool_*
5404
5340
 
5405
- static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
5341
+ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
5406
5342
  return (ins + 2 * p - ks) / s + 1;
5407
5343
  }
5408
5344
 
@@ -5449,8 +5385,8 @@ struct ggml_tensor * ggml_pool_2d(
5449
5385
  int k1,
5450
5386
  int s0,
5451
5387
  int s1,
5452
- int p0,
5453
- int p1) {
5388
+ float p0,
5389
+ float p1) {
5454
5390
 
5455
5391
  bool is_node = false;
5456
5392
 
@@ -8912,6 +8848,48 @@ static void ggml_compute_forward_silu(
8912
8848
  }
8913
8849
  }
8914
8850
 
8851
+ // ggml_compute_forward_leaky
8852
+
8853
+ static void ggml_compute_forward_leaky_f32(
8854
+ const struct ggml_compute_params * params,
8855
+ const struct ggml_tensor * src0,
8856
+ struct ggml_tensor * dst) {
8857
+ assert(params->ith == 0);
8858
+ assert(ggml_are_same_shape(src0, dst));
8859
+
8860
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8861
+ return;
8862
+ }
8863
+
8864
+ const int n = ggml_nrows(src0);
8865
+ const int nc = src0->ne[0];
8866
+
8867
+ assert(dst->nb[0] == sizeof(float));
8868
+ assert(src0->nb[0] == sizeof(float));
8869
+
8870
+ for (int i = 0; i < n; i++) {
8871
+ ggml_vec_leaky_f32(nc,
8872
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
8873
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
8874
+ }
8875
+ }
8876
+
8877
+ static void ggml_compute_forward_leaky(
8878
+ const struct ggml_compute_params * params,
8879
+ const struct ggml_tensor * src0,
8880
+ struct ggml_tensor * dst) {
8881
+ switch (src0->type) {
8882
+ case GGML_TYPE_F32:
8883
+ {
8884
+ ggml_compute_forward_leaky_f32(params, src0, dst);
8885
+ } break;
8886
+ default:
8887
+ {
8888
+ GGML_ASSERT(false);
8889
+ } break;
8890
+ }
8891
+ }
8892
+
8915
8893
  // ggml_compute_forward_silu_back
8916
8894
 
8917
8895
  static void ggml_compute_forward_silu_back_f32(
@@ -9395,6 +9373,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9395
9373
  // TODO: find the optimal values for these
9396
9374
  if (ggml_is_contiguous(src0) &&
9397
9375
  ggml_is_contiguous(src1) &&
9376
+ src0->type == GGML_TYPE_F32 &&
9377
+ src1->type == GGML_TYPE_F32 &&
9398
9378
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
9399
9379
 
9400
9380
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9433,7 +9413,7 @@ static void ggml_compute_forward_mul_mat(
9433
9413
 
9434
9414
  // we don't support permuted src0 or src1
9435
9415
  GGML_ASSERT(nb00 == ggml_type_size(type));
9436
- GGML_ASSERT(nb10 == sizeof(float));
9416
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
9437
9417
 
9438
9418
  // dst cannot be transposed or permuted
9439
9419
  GGML_ASSERT(nb0 == sizeof(float));
@@ -10974,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
10974
10954
  const struct ggml_compute_params * params,
10975
10955
  const struct ggml_tensor * src0,
10976
10956
  const struct ggml_tensor * src1,
10977
- struct ggml_tensor * dst) {
10957
+ struct ggml_tensor * dst,
10958
+ const bool forward) {
10978
10959
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10979
10960
  return;
10980
10961
  }
@@ -11033,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
11033
11014
  const bool is_neox = mode & 2;
11034
11015
  const bool is_glm = mode & 4;
11035
11016
 
11017
+ // backward process uses inverse rotation by cos and sin.
11018
+ // cos and sin build a rotation matrix, where the inverse is the transpose.
11019
+ // this essentially just switches the sign of sin.
11020
+ const float sin_sign = forward ? 1.0f : -1.0f;
11021
+
11036
11022
  const int32_t * pos = (const int32_t *) src1->data;
11037
11023
 
11038
11024
  for (int64_t i3 = 0; i3 < ne3; i3++) {
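Note: collapsing the forward and backward RoPE kernels into one works because a 2-D rotation's inverse is its transpose, i.e. the same rotation with sin negated, so the backward pass only needs sin_sign = -1. A tiny standalone check of that identity:

    // Rotate a pair (x0, x1) by theta, then apply the same rotation with sin negated;
    // the original values come back (up to floating-point rounding).
    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float theta = 0.73f;
        const float c = cosf(theta), s = sinf(theta);

        float x0 = 1.5f, x1 = -2.0f;

        // forward rotation (as in ggml_compute_forward_rope_*)
        float y0 = x0*c - x1*s;
        float y1 = x0*s + x1*c;

        // backward: same formula with sin_sign = -1
        float z0 = y0*c - y1*(-s);
        float z1 = y0*(-s) + y1*c;

        printf("%f %f\n", z0, z1); // ~1.5 and ~-2.0
        return 0;
    }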
@@ -11049,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
11049
11035
  float block_theta = MAX(p - (n_ctx - 2), 0);
11050
11036
  for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
11051
11037
  const float cos_theta = cosf(theta_base);
11052
- const float sin_theta = sinf(theta_base);
11038
+ const float sin_theta = sinf(theta_base) * sin_sign;
11053
11039
  const float cos_block_theta = cosf(block_theta);
11054
- const float sin_block_theta = sinf(block_theta);
11040
+ const float sin_block_theta = sinf(block_theta) * sin_sign;
11055
11041
 
11056
11042
  theta_base *= theta_scale;
11057
11043
  block_theta *= theta_scale;
@@ -11075,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
11075
11061
  rope_yarn(
11076
11062
  theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11077
11063
  );
11064
+ sin_theta *= sin_sign;
11078
11065
 
11079
11066
  // zeta scaling for xPos only:
11080
11067
  float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11105,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
11105
11092
  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
11106
11093
  &cos_theta, &sin_theta
11107
11094
  );
11095
+ sin_theta *= sin_sign;
11108
11096
 
11109
11097
  theta_base *= theta_scale;
11110
11098
 
@@ -11130,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
11130
11118
  const struct ggml_compute_params * params,
11131
11119
  const struct ggml_tensor * src0,
11132
11120
  const struct ggml_tensor * src1,
11133
- struct ggml_tensor * dst) {
11121
+ struct ggml_tensor * dst,
11122
+ const bool forward) {
11134
11123
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11135
11124
  return;
11136
11125
  }
@@ -11182,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
11182
11171
  const bool is_neox = mode & 2;
11183
11172
  const bool is_glm = mode & 4;
11184
11173
 
11174
+ // backward process uses inverse rotation by cos and sin.
11175
+ // cos and sin build a rotation matrix, where the inverse is the transpose.
11176
+ // this essentially just switches the sign of sin.
11177
+ const float sin_sign = forward ? 1.0f : -1.0f;
11178
+
11185
11179
  const int32_t * pos = (const int32_t *) src1->data;
11186
11180
 
11187
11181
  for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11198,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
11198
11192
  float block_theta = MAX(p - (n_ctx - 2), 0);
11199
11193
  for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
11200
11194
  const float cos_theta = cosf(theta_base);
11201
- const float sin_theta = sinf(theta_base);
11195
+ const float sin_theta = sinf(theta_base) * sin_sign;
11202
11196
  const float cos_block_theta = cosf(block_theta);
11203
- const float sin_block_theta = sinf(block_theta);
11197
+ const float sin_block_theta = sinf(block_theta) * sin_sign;
11204
11198
 
11205
11199
  theta_base *= theta_scale;
11206
11200
  block_theta *= theta_scale;
@@ -11224,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
11224
11218
  rope_yarn(
11225
11219
  theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11226
11220
  );
11221
+ sin_theta *= sin_sign;
11227
11222
 
11228
11223
  theta_base *= theta_scale;
11229
11224
 
@@ -11250,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
11250
11245
  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
11251
11246
  &cos_theta, &sin_theta
11252
11247
  );
11248
+ sin_theta *= sin_sign;
11253
11249
 
11254
11250
  theta_base *= theta_scale;
11255
11251
 
@@ -11279,11 +11275,11 @@ static void ggml_compute_forward_rope(
11279
11275
  switch (src0->type) {
11280
11276
  case GGML_TYPE_F16:
11281
11277
  {
11282
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
11278
+ ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
11283
11279
  } break;
11284
11280
  case GGML_TYPE_F32:
11285
11281
  {
11286
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
11282
+ ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
11287
11283
  } break;
11288
11284
  default:
11289
11285
  {
@@ -11294,693 +11290,73 @@ static void ggml_compute_forward_rope(
11294
11290
 
11295
11291
  // ggml_compute_forward_rope_back
11296
11292
 
11297
- static void ggml_compute_forward_rope_back_f32(
11293
+ static void ggml_compute_forward_rope_back(
11298
11294
  const struct ggml_compute_params * params,
11299
11295
  const struct ggml_tensor * src0,
11300
11296
  const struct ggml_tensor * src1,
11301
11297
  struct ggml_tensor * dst) {
11302
-
11303
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11304
- return;
11298
+ switch (src0->type) {
11299
+ case GGML_TYPE_F16:
11300
+ {
11301
+ ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
11302
+ } break;
11303
+ case GGML_TYPE_F32:
11304
+ {
11305
+ ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
11306
+ } break;
11307
+ default:
11308
+ {
11309
+ GGML_ASSERT(false);
11310
+ } break;
11305
11311
  }
11312
+ }
11306
11313
 
11307
- // y = rope(x, src1)
11308
- // dx = rope_back(dy, src1)
11309
- // src0 is dy, src1 contains options
11310
-
11311
- float freq_base;
11312
- float freq_scale;
11313
-
11314
- // these two only relevant for xPos RoPE:
11315
- float xpos_base;
11316
- bool xpos_down;
11317
-
11318
- //const int n_past = ((int32_t *) dst->op_params)[0];
11319
- const int n_dims = ((int32_t *) dst->op_params)[1];
11320
- const int mode = ((int32_t *) dst->op_params)[2];
11321
- const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
11322
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11323
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
11324
- memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
11325
- memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
11314
+ // ggml_compute_forward_conv_transpose_1d
11326
11315
 
11327
- GGML_TENSOR_UNARY_OP_LOCALS
11316
+ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11317
+ const struct ggml_compute_params * params,
11318
+ const struct ggml_tensor * src0,
11319
+ const struct ggml_tensor * src1,
11320
+ struct ggml_tensor * dst) {
11321
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
11322
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
11323
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
11328
11324
 
11329
- //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
11330
- //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
11325
+ int64_t t0 = ggml_perf_time_us();
11326
+ UNUSED(t0);
11331
11327
 
11332
- assert(nb0 == sizeof(float));
11328
+ GGML_TENSOR_BINARY_OP_LOCALS
11333
11329
 
11334
11330
  const int ith = params->ith;
11335
11331
  const int nth = params->nth;
11336
11332
 
11337
- const int nr = ggml_nrows(dst);
11333
+ const int nk = ne00*ne01*ne02;
11338
11334
 
11339
- // rows per thread
11340
- const int dr = (nr + nth - 1)/nth;
11335
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11336
+ GGML_ASSERT(nb10 == sizeof(float));
11341
11337
 
11342
- // row range for this thread
11343
- const int ir0 = dr*ith;
11344
- const int ir1 = MIN(ir0 + dr, nr);
11338
+ if (params->type == GGML_TASK_INIT) {
11339
+ memset(params->wdata, 0, params->wsize);
11345
11340
 
11346
- // row index used to determine which thread to use
11347
- int ir = 0;
11341
+ // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11342
+ {
11343
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11348
11344
 
11349
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
11345
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
11346
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11347
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11348
+ ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11349
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11350
+ dst_data[i00*ne02 + i02] = src[i00];
11351
+ }
11352
+ }
11353
+ }
11354
+ }
11350
11355
 
11351
- const bool is_neox = mode & 2;
11352
-
11353
- const int32_t * pos = (const int32_t *) src1->data;
11354
-
11355
- for (int64_t i3 = 0; i3 < ne3; i3++) {
11356
- for (int64_t i2 = 0; i2 < ne2; i2++) {
11357
- const int64_t p = pos[i2];
11358
- for (int64_t i1 = 0; i1 < ne1; i1++) {
11359
- if (ir++ < ir0) continue;
11360
- if (ir > ir1) break;
11361
-
11362
- float theta_base = freq_scale * (float)p;
11363
-
11364
- if (!is_neox) {
11365
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11366
- const float cos_theta = cosf(theta_base);
11367
- const float sin_theta = sinf(theta_base);
11368
-
11369
- // zeta scaling for xPos only:
11370
- float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
11371
- if (xpos_down) zeta = 1.0f / zeta;
11372
-
11373
- theta_base *= theta_scale;
11374
-
11375
- const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11376
- float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11377
-
11378
- const float dy0 = dy[0];
11379
- const float dy1 = dy[1];
11380
-
11381
- dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
11382
- dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
11383
- }
11384
- } else {
11385
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11386
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
11387
- const float cos_theta = cosf(theta_base);
11388
- const float sin_theta = sinf(theta_base);
11389
-
11390
- theta_base *= theta_scale;
11391
-
11392
- const int64_t i0 = ib*n_dims + ic/2;
11393
-
11394
- const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11395
- float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11396
-
11397
- const float dy0 = dy[0];
11398
- const float dy1 = dy[n_dims/2];
11399
-
11400
- dx[0] = dy0*cos_theta + dy1*sin_theta;
11401
- dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
11402
- }
11403
- }
11404
- }
11405
- }
11406
- }
11407
- }
11408
- }
11409
-
11410
- static void ggml_compute_forward_rope_back_f16(
11411
- const struct ggml_compute_params * params,
11412
- const struct ggml_tensor * src0,
11413
- const struct ggml_tensor * src1,
11414
- struct ggml_tensor * dst) {
11415
-
11416
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11417
- return;
11418
- }
11419
-
11420
- // y = rope(x, src1)
11421
- // dx = rope_back(dy, src1)
11422
- // src0 is dy, src1 contains options
11423
-
11424
- //const int n_past = ((int32_t *) dst->op_params)[0];
11425
- const int n_dims = ((int32_t *) dst->op_params)[1];
11426
- const int mode = ((int32_t *) dst->op_params)[2];
11427
-
11428
- GGML_TENSOR_UNARY_OP_LOCALS
11429
-
11430
- //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
11431
- //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
11432
-
11433
- assert(nb0 == sizeof(ggml_fp16_t));
11434
-
11435
- const int ith = params->ith;
11436
- const int nth = params->nth;
11437
-
11438
- const int nr = ggml_nrows(dst);
11439
-
11440
- // rows per thread
11441
- const int dr = (nr + nth - 1)/nth;
11442
-
11443
- // row range for this thread
11444
- const int ir0 = dr*ith;
11445
- const int ir1 = MIN(ir0 + dr, nr);
11446
-
11447
- // row index used to determine which thread to use
11448
- int ir = 0;
11449
-
11450
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
11451
-
11452
- const bool is_neox = mode & 2;
11453
-
11454
- const int32_t * pos = (const int32_t *) src1->data;
11455
-
11456
- for (int64_t i3 = 0; i3 < ne3; i3++) {
11457
- for (int64_t i2 = 0; i2 < ne2; i2++) {
11458
- const int64_t p = pos[i2];
11459
- for (int64_t i1 = 0; i1 < ne1; i1++) {
11460
- if (ir++ < ir0) continue;
11461
- if (ir > ir1) break;
11462
-
11463
- float theta_base = (float)p;
11464
-
11465
- if (!is_neox) {
11466
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11467
- const float cos_theta = cosf(theta_base);
11468
- const float sin_theta = sinf(theta_base);
11469
-
11470
- theta_base *= theta_scale;
11471
-
11472
- const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11473
- ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11474
-
11475
- const float dy0 = GGML_FP16_TO_FP32(dy[0]);
11476
- const float dy1 = GGML_FP16_TO_FP32(dy[1]);
11477
-
11478
- dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
11479
- dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
11480
- }
11481
- } else {
11482
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11483
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
11484
- const float cos_theta = cosf(theta_base);
11485
- const float sin_theta = sinf(theta_base);
11486
-
11487
- theta_base *= theta_scale;
11488
-
11489
- const int64_t i0 = ib*n_dims + ic/2;
11490
-
11491
- const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11492
- ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11493
-
11494
- const float dy0 = GGML_FP16_TO_FP32(dy[0]);
11495
- const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
11496
-
11497
- dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
11498
- dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
11499
- }
11500
- }
11501
- }
11502
- }
11503
- }
11504
- }
11505
- }
11506
-
11507
- static void ggml_compute_forward_rope_back(
11508
- const struct ggml_compute_params * params,
11509
- const struct ggml_tensor * src0,
11510
- const struct ggml_tensor * src1,
11511
- struct ggml_tensor * dst) {
11512
- switch (src0->type) {
11513
- case GGML_TYPE_F16:
11514
- {
11515
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
11516
- } break;
11517
- case GGML_TYPE_F32:
11518
- {
11519
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
11520
- } break;
11521
- default:
11522
- {
11523
- GGML_ASSERT(false);
11524
- } break;
11525
- }
11526
- }
11527
-
11528
- // ggml_compute_forward_conv_1d
11529
-
11530
- static void ggml_compute_forward_conv_1d_f16_f32(
11531
- const struct ggml_compute_params * params,
11532
- const struct ggml_tensor * src0,
11533
- const struct ggml_tensor * src1,
11534
- struct ggml_tensor * dst) {
11535
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11536
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11537
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11538
-
11539
- int64_t t0 = ggml_perf_time_us();
11540
- UNUSED(t0);
11541
-
11542
- GGML_TENSOR_BINARY_OP_LOCALS
11543
-
11544
- const int ith = params->ith;
11545
- const int nth = params->nth;
11546
-
11547
- const int nk = ne00;
11548
-
11549
- // size of the convolution row - the kernel size unrolled across all input channels
11550
- const int ew0 = nk*ne01;
11551
-
11552
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11553
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11554
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11555
-
11556
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11557
- GGML_ASSERT(nb10 == sizeof(float));
11558
-
11559
- if (params->type == GGML_TASK_INIT) {
11560
- memset(params->wdata, 0, params->wsize);
11561
-
11562
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11563
-
11564
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11565
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11566
- ggml_fp16_t * dst_data = wdata;
11567
-
11568
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11569
- for (int64_t ik = 0; ik < nk; ik++) {
11570
- const int idx0 = i0*s0 + ik*d0 - p0;
11571
-
11572
- if(!(idx0 < 0 || idx0 >= ne10)) {
11573
- dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
11574
- }
11575
- }
11576
- }
11577
- }
11578
-
11579
- return;
11580
- }
11581
-
11582
- if (params->type == GGML_TASK_FINALIZE) {
11583
- return;
11584
- }
11585
-
11586
- // total rows in dst
11587
- const int nr = ne2;
11588
-
11589
- // rows per thread
11590
- const int dr = (nr + nth - 1)/nth;
11591
-
11592
- // row range for this thread
11593
- const int ir0 = dr*ith;
11594
- const int ir1 = MIN(ir0 + dr, nr);
11595
-
11596
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11597
-
11598
- for (int i2 = 0; i2 < ne2; i2++) {
11599
- for (int i1 = ir0; i1 < ir1; i1++) {
11600
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11601
-
11602
- for (int i0 = 0; i0 < ne0; i0++) {
11603
- ggml_vec_dot_f16(ew0, dst_data + i0,
11604
- (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
11605
- (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
11606
- }
11607
- }
11608
- }
11609
- }
11610
-
11611
- static void ggml_compute_forward_conv_1d_f32(
11612
- const struct ggml_compute_params * params,
11613
- const struct ggml_tensor * src0,
11614
- const struct ggml_tensor * src1,
11615
- struct ggml_tensor * dst) {
11616
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
11617
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11618
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11619
-
11620
- int64_t t0 = ggml_perf_time_us();
11621
- UNUSED(t0);
11622
-
11623
- GGML_TENSOR_BINARY_OP_LOCALS
11624
-
11625
- const int ith = params->ith;
11626
- const int nth = params->nth;
11627
-
11628
- const int nk = ne00;
11629
-
11630
- const int ew0 = nk*ne01;
11631
-
11632
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11633
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11634
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11635
-
11636
- GGML_ASSERT(nb00 == sizeof(float));
11637
- GGML_ASSERT(nb10 == sizeof(float));
11638
-
11639
- if (params->type == GGML_TASK_INIT) {
11640
- memset(params->wdata, 0, params->wsize);
11641
-
11642
- float * const wdata = (float *) params->wdata + 0;
11643
-
11644
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11645
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11646
- float * dst_data = wdata;
11647
-
11648
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11649
- for (int64_t ik = 0; ik < nk; ik++) {
11650
- const int idx0 = i0*s0 + ik*d0 - p0;
11651
-
11652
- if(!(idx0 < 0 || idx0 >= ne10)) {
11653
- dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
11654
- }
11655
- }
11656
- }
11657
- }
11658
-
11659
- return;
11660
- }
11661
-
11662
- if (params->type == GGML_TASK_FINALIZE) {
11663
- return;
11664
- }
11665
-
11666
- // total rows in dst
11667
- const int nr = ne02;
11668
-
11669
- // rows per thread
11670
- const int dr = (nr + nth - 1)/nth;
11671
-
11672
- // row range for this thread
11673
- const int ir0 = dr*ith;
11674
- const int ir1 = MIN(ir0 + dr, nr);
11675
-
11676
- float * const wdata = (float *) params->wdata + 0;
11677
-
11678
- for (int i2 = 0; i2 < ne2; i2++) {
11679
- for (int i1 = ir0; i1 < ir1; i1++) {
11680
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11681
-
11682
- for (int i0 = 0; i0 < ne0; i0++) {
11683
- ggml_vec_dot_f32(ew0, dst_data + i0,
11684
- (float *) ((char *) src0->data + i1*nb02),
11685
- (float *) wdata + i2*nb2 + i0*ew0);
11686
- }
11687
- }
11688
- }
11689
- }
11690
-
11691
- // TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
11692
- static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
11693
- ggml_fp16_t * A,
11694
- ggml_fp16_t * B,
11695
- float * C,
11696
- const int ith, const int nth) {
11697
- // does not seem to make a difference
11698
- int64_t m0, m1, n0, n1;
11699
- // patches per thread
11700
- if (m > n) {
11701
- n0 = 0;
11702
- n1 = n;
11703
-
11704
- // total patches in dst
11705
- const int np = m;
11706
-
11707
- // patches per thread
11708
- const int dp = (np + nth - 1)/nth;
11709
-
11710
- // patch range for this thread
11711
- m0 = dp*ith;
11712
- m1 = MIN(m0 + dp, np);
11713
- } else {
11714
- m0 = 0;
11715
- m1 = m;
11716
-
11717
- // total patches in dst
11718
- const int np = n;
11719
-
11720
- // patches per thread
11721
- const int dp = (np + nth - 1)/nth;
11722
-
11723
- // patch range for this thread
11724
- n0 = dp*ith;
11725
- n1 = MIN(n0 + dp, np);
11726
- }
11727
-
11728
- // block-tiling attempt
11729
- int64_t blck_n = 16;
11730
- int64_t blck_m = 16;
11731
-
11732
- // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
11733
- // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
11734
- // if (blck_size > 0) {
11735
- // blck_0 = 4;
11736
- // blck_1 = blck_size / blck_0;
11737
- // if (blck_1 < 0) {
11738
- // blck_1 = 1;
11739
- // }
11740
- // // blck_0 = (int64_t)sqrt(blck_size);
11741
- // // blck_1 = blck_0;
11742
- // }
11743
- // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
11744
-
11745
- for (int j = n0; j < n1; j+=blck_n) {
11746
- for (int i = m0; i < m1; i+=blck_m) {
11747
- // printf("i j k => %d %d %d\n", i, j, K);
11748
- for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
11749
- for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
11750
- ggml_vec_dot_f16(k,
11751
- C + ii*n + jj,
11752
- A + ii * k,
11753
- B + jj * k);
11754
- }
11755
- }
11756
- }
11757
- }
11758
- }
11759
-
11760
- // src0: kernel [OC, IC, K]
11761
- // src1: signal [N, IC, IL]
11762
- // dst: result [N, OL, IC*K]
11763
- static void ggml_compute_forward_conv_1d_stage_0_f32(
11764
- const struct ggml_compute_params * params,
11765
- const struct ggml_tensor * src0,
11766
- const struct ggml_tensor * src1,
11767
- struct ggml_tensor * dst) {
11768
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11769
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11770
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
11771
-
11772
- int64_t t0 = ggml_perf_time_us();
11773
- UNUSED(t0);
11774
-
11775
- GGML_TENSOR_BINARY_OP_LOCALS;
11776
-
11777
- const int64_t N = ne12;
11778
- const int64_t IC = ne11;
11779
- const int64_t IL = ne10;
11780
-
11781
- const int64_t K = ne00;
11782
-
11783
- const int64_t OL = ne1;
11784
-
11785
- const int ith = params->ith;
11786
- const int nth = params->nth;
11787
-
11788
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11789
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11790
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11791
-
11792
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11793
- GGML_ASSERT(nb10 == sizeof(float));
11794
-
11795
- if (params->type == GGML_TASK_INIT) {
11796
- memset(dst->data, 0, ggml_nbytes(dst));
11797
- return;
11798
- }
11799
-
11800
- if (params->type == GGML_TASK_FINALIZE) {
11801
- return;
11802
- }
11803
-
11804
- // im2col: [N, IC, IL] => [N, OL, IC*K]
11805
- {
11806
- ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
11807
-
11808
- for (int64_t in = 0; in < N; in++) {
11809
- for (int64_t iol = 0; iol < OL; iol++) {
11810
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11811
-
11812
- // micro kernel
11813
- ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
11814
- const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
11815
-
11816
- for (int64_t ik = 0; ik < K; ik++) {
11817
- const int64_t iil = iol*s0 + ik*d0 - p0;
11818
-
11819
- if (!(iil < 0 || iil >= IL)) {
11820
- dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
11821
- }
11822
- }
11823
- }
11824
- }
11825
- }
11826
- }
11827
- }
11828
-
11829
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11830
- // src0: [OC, IC, K]
11831
- // src1: [N, OL, IC * K]
11832
- // result: [N, OC, OL]
11833
- static void ggml_compute_forward_conv_1d_stage_1_f16(
11834
- const struct ggml_compute_params * params,
11835
- const struct ggml_tensor * src0,
11836
- const struct ggml_tensor * src1,
11837
- struct ggml_tensor * dst) {
11838
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11839
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
11840
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11841
-
11842
- int64_t t0 = ggml_perf_time_us();
11843
- UNUSED(t0);
11844
-
11845
- if (params->type == GGML_TASK_INIT) {
11846
- return;
11847
- }
11848
-
11849
- if (params->type == GGML_TASK_FINALIZE) {
11850
- return;
11851
- }
11852
-
11853
- GGML_TENSOR_BINARY_OP_LOCALS;
11854
-
11855
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11856
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
11857
- GGML_ASSERT(nb0 == sizeof(float));
11858
-
11859
- const int N = ne12;
11860
- const int OL = ne11;
11861
-
11862
- const int OC = ne02;
11863
- const int IC = ne01;
11864
- const int K = ne00;
11865
-
11866
- const int ith = params->ith;
11867
- const int nth = params->nth;
11868
-
11869
- int64_t m = OC;
11870
- int64_t n = OL;
11871
- int64_t k = IC * K;
11872
-
11873
- // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11874
- for (int i = 0; i < N; i++) {
11875
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
11876
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
11877
- float * C = (float *)dst->data + i * m * n; // [m, n]
11878
-
11879
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
11880
- }
11881
- }
11882
-
11883
- static void ggml_compute_forward_conv_1d(
11884
- const struct ggml_compute_params * params,
11885
- const struct ggml_tensor * src0,
11886
- const struct ggml_tensor * src1,
11887
- struct ggml_tensor * dst) {
11888
- switch(src0->type) {
11889
- case GGML_TYPE_F16:
11890
- {
11891
- ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
11892
- } break;
11893
- case GGML_TYPE_F32:
11894
- {
11895
- ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
11896
- } break;
11897
- default:
11898
- {
11899
- GGML_ASSERT(false);
11900
- } break;
11901
- }
11902
- }
11903
-
11904
- static void ggml_compute_forward_conv_1d_stage_0(
11905
- const struct ggml_compute_params * params,
11906
- const struct ggml_tensor * src0,
11907
- const struct ggml_tensor * src1,
11908
- struct ggml_tensor * dst) {
11909
- switch(src0->type) {
11910
- case GGML_TYPE_F16:
11911
- {
11912
- ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
11913
- } break;
11914
- default:
11915
- {
11916
- GGML_ASSERT(false);
11917
- } break;
11918
- }
11919
- }
11920
-
11921
- static void ggml_compute_forward_conv_1d_stage_1(
11922
- const struct ggml_compute_params * params,
11923
- const struct ggml_tensor * src0,
11924
- const struct ggml_tensor * src1,
11925
- struct ggml_tensor * dst) {
11926
- switch(src0->type) {
11927
- case GGML_TYPE_F16:
11928
- {
11929
- ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
11930
- } break;
11931
- default:
11932
- {
11933
- GGML_ASSERT(false);
11934
- } break;
11935
- }
11936
- }
11937
-
11938
- // ggml_compute_forward_conv_transpose_1d
11939
-
11940
- static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11941
- const struct ggml_compute_params * params,
11942
- const struct ggml_tensor * src0,
11943
- const struct ggml_tensor * src1,
11944
- struct ggml_tensor * dst) {
11945
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11946
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11947
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11948
-
11949
- int64_t t0 = ggml_perf_time_us();
11950
- UNUSED(t0);
11951
-
11952
- GGML_TENSOR_BINARY_OP_LOCALS
11953
-
11954
- const int ith = params->ith;
11955
- const int nth = params->nth;
11956
-
11957
- const int nk = ne00*ne01*ne02;
11958
-
11959
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11960
- GGML_ASSERT(nb10 == sizeof(float));
11961
-
11962
- if (params->type == GGML_TASK_INIT) {
11963
- memset(params->wdata, 0, params->wsize);
11964
-
11965
- // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11966
- {
11967
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11968
-
11969
- for (int64_t i02 = 0; i02 < ne02; i02++) {
11970
- for (int64_t i01 = 0; i01 < ne01; i01++) {
11971
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11972
- ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11973
- for (int64_t i00 = 0; i00 < ne00; i00++) {
11974
- dst_data[i00*ne02 + i02] = src[i00];
11975
- }
11976
- }
11977
- }
11978
- }
11979
-
11980
- // permute source data (src1) from (L x Cin) to (Cin x L)
11981
- {
11982
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11983
- ggml_fp16_t * dst_data = wdata;
11356
+ // permute source data (src1) from (L x Cin) to (Cin x L)
11357
+ {
11358
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11359
+ ggml_fp16_t * dst_data = wdata;
11984
11360
 
11985
11361
  for (int64_t i11 = 0; i11 < ne11; i11++) {
11986
11362
  const float * const src = (float *)((char *) src1->data + i11*nb11);
@@ -12146,12 +11522,10 @@ static void ggml_compute_forward_conv_transpose_1d(
12146
11522
  }
12147
11523
  }
12148
11524
 
12149
- // ggml_compute_forward_conv_2d
12150
-
12151
11525
  // src0: kernel [OC, IC, KH, KW]
12152
11526
  // src1: image [N, IC, IH, IW]
12153
11527
  // dst: result [N, OH, OW, IC*KH*KW]
12154
- static void ggml_compute_forward_conv_2d_stage_0_f32(
11528
+ static void ggml_compute_forward_im2col_f16(
12155
11529
  const struct ggml_compute_params * params,
12156
11530
  const struct ggml_tensor * src0,
12157
11531
  const struct ggml_tensor * src1,
@@ -12165,34 +11539,35 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12165
11539
 
12166
11540
  GGML_TENSOR_BINARY_OP_LOCALS;
12167
11541
 
12168
- const int64_t N = ne13;
12169
- const int64_t IC = ne12;
12170
- const int64_t IH = ne11;
11542
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
11543
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
11544
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
11545
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
11546
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
11547
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
11548
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
11549
+
11550
+ const int ith = params->ith;
11551
+ const int nth = params->nth;
11552
+
11553
+ const int64_t N = is_2D ? ne13 : ne12;
11554
+ const int64_t IC = is_2D ? ne12 : ne11;
11555
+ const int64_t IH = is_2D ? ne11 : 1;
12171
11556
  const int64_t IW = ne10;
12172
11557
 
12173
- // const int64_t OC = ne03;
12174
- // const int64_t IC = ne02;
12175
- const int64_t KH = ne01;
11558
+ const int64_t KH = is_2D ? ne01 : 1;
12176
11559
  const int64_t KW = ne00;
12177
11560
 
12178
- const int64_t OH = ne2;
11561
+ const int64_t OH = is_2D ? ne2 : 1;
12179
11562
  const int64_t OW = ne1;
12180
11563
 
12181
- const int ith = params->ith;
12182
- const int nth = params->nth;
12183
-
12184
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12185
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12186
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12187
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12188
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12189
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
11564
+ int ofs0 = is_2D ? nb13 : nb12;
11565
+ int ofs1 = is_2D ? nb12 : nb11;
12190
11566
 
12191
11567
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12192
11568
  GGML_ASSERT(nb10 == sizeof(float));
12193
11569
 
12194
11570
  if (params->type == GGML_TASK_INIT) {
12195
- memset(dst->data, 0, ggml_nbytes(dst));
12196
11571
  return;
12197
11572
  }
12198
11573
 
@@ -12205,20 +11580,22 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12205
11580
  ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
12206
11581
 
12207
11582
  for (int64_t in = 0; in < N; in++) {
12208
- for (int64_t ioh = 0; ioh < OH; ioh++) {
11583
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12209
11584
  for (int64_t iow = 0; iow < OW; iow++) {
12210
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11585
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12211
11586
 
12212
11587
  // micro kernel
12213
11588
  ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12214
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
11589
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12215
11590
 
12216
- for (int64_t ikh = 0; ikh < KH; ikh++) {
11591
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12217
11592
  for (int64_t ikw = 0; ikw < KW; ikw++) {
12218
11593
  const int64_t iiw = iow*s0 + ikw*d0 - p0;
12219
11594
  const int64_t iih = ioh*s1 + ikh*d1 - p1;
12220
11595
 
12221
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
11596
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
11597
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
11598
+ } else {
12222
11599
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12223
11600
  }
12224
11601
  }
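Note: the unified im2col kernel now writes zeros explicitly for taps that fall into the padding, rather than relying on the memset of dst that the old stage-0 kernel performed during GGML_TASK_INIT (that memset is removed above). For example, with s0 = d0 = 1 and p0 = 1, the first output column iow = 0 and first kernel tap ikw = 0 give iiw = 0*1 + 0*1 - 1 = -1 < 0, so dst_data[...] is set to 0 instead of being left to whatever was already in the buffer.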
@@ -12230,223 +11607,7 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12230
11607
  }
12231
11608
  }
12232
11609
 
12233
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12234
- // src0: [OC, IC, KH, KW]
12235
- // src1: [N, OH, OW, IC * KH * KW]
12236
- // result: [N, OC, OH, OW]
12237
- static void ggml_compute_forward_conv_2d_stage_1_f16(
12238
- const struct ggml_compute_params * params,
12239
- const struct ggml_tensor * src0,
12240
- const struct ggml_tensor * src1,
12241
- struct ggml_tensor * dst) {
12242
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12243
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
12244
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12245
-
12246
- int64_t t0 = ggml_perf_time_us();
12247
- UNUSED(t0);
12248
-
12249
- if (params->type == GGML_TASK_INIT) {
12250
- return;
12251
- }
12252
-
12253
- if (params->type == GGML_TASK_FINALIZE) {
12254
- return;
12255
- }
12256
-
12257
- GGML_TENSOR_BINARY_OP_LOCALS;
12258
-
12259
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12260
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
12261
- GGML_ASSERT(nb0 == sizeof(float));
12262
-
12263
- const int N = ne13;
12264
- const int OH = ne12;
12265
- const int OW = ne11;
12266
-
12267
- const int OC = ne03;
12268
- const int IC = ne02;
12269
- const int KH = ne01;
12270
- const int KW = ne00;
12271
-
12272
- const int ith = params->ith;
12273
- const int nth = params->nth;
12274
-
12275
- int64_t m = OC;
12276
- int64_t n = OH * OW;
12277
- int64_t k = IC * KH * KW;
12278
-
12279
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12280
- for (int i = 0; i < N; i++) {
12281
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12282
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
12283
- float * C = (float *)dst->data + i * m * n; // [m, n]
12284
-
12285
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12286
- }
12287
- }
12288
-
12289
- static void ggml_compute_forward_conv_2d_f16_f32(
12290
- const struct ggml_compute_params * params,
12291
- const struct ggml_tensor * src0,
12292
- const struct ggml_tensor * src1,
12293
- struct ggml_tensor * dst) {
12294
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12295
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12296
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12297
-
12298
- int64_t t0 = ggml_perf_time_us();
12299
- UNUSED(t0);
12300
-
12301
- GGML_TENSOR_BINARY_OP_LOCALS
12302
-
12303
- // src1: image [N, IC, IH, IW]
12304
- // src0: kernel [OC, IC, KH, KW]
12305
- // dst: result [N, OC, OH, OW]
12306
- // ne12: IC
12307
- // ne0: OW
12308
- // ne1: OH
12309
- // nk0: KW
12310
- // nk1: KH
12311
- // ne13: N
12312
-
12313
- const int N = ne13;
12314
- const int IC = ne12;
12315
- const int IH = ne11;
12316
- const int IW = ne10;
12317
-
12318
- const int OC = ne03;
12319
- // const int IC = ne02;
12320
- const int KH = ne01;
12321
- const int KW = ne00;
12322
-
12323
- const int OH = ne1;
12324
- const int OW = ne0;
12325
-
12326
- const int ith = params->ith;
12327
- const int nth = params->nth;
12328
-
12329
- // const int nk0 = ne00;
12330
- // const int nk1 = ne01;
12331
-
12332
- // size of the convolution row - the kernel size unrolled across all channels
12333
- // const int ew0 = nk0*nk1*ne02;
12334
- // ew0: IC*KH*KW
12335
-
12336
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12337
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12338
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12339
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12340
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12341
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
12342
-
12343
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12344
- GGML_ASSERT(nb10 == sizeof(float));
12345
-
12346
- if (params->type == GGML_TASK_INIT) {
12347
- memset(params->wdata, 0, params->wsize);
12348
-
12349
- // prepare source data (src1)
12350
- // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
12351
-
12352
- {
12353
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12354
-
12355
- for (int in = 0; in < N; in++) {
12356
- for (int iic = 0; iic < IC; iic++) {
12357
- for (int ioh = 0; ioh < OH; ioh++) {
12358
- for (int iow = 0; iow < OW; iow++) {
12359
-
12360
- // micro kernel
12361
- ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12362
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
12363
-
12364
- for (int ikh = 0; ikh < KH; ikh++) {
12365
- for (int ikw = 0; ikw < KW; ikw++) {
12366
- const int iiw = iow*s0 + ikw*d0 - p0;
12367
- const int iih = ioh*s1 + ikh*d1 - p1;
12368
-
12369
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
12370
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12371
- }
12372
- }
12373
- }
12374
- }
12375
- }
12376
- }
12377
- }
12378
- }
12379
-
12380
- return;
12381
- }
12382
-
12383
- if (params->type == GGML_TASK_FINALIZE) {
12384
- return;
12385
- }
12386
-
12387
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12388
- // wdata: [N*OH*OW, IC*KH*KW]
12389
- // dst: result [N, OC, OH, OW]
12390
- // src0: kernel [OC, IC, KH, KW]
12391
-
12392
- int64_t m = OC;
12393
- int64_t n = OH * OW;
12394
- int64_t k = IC * KH * KW;
12395
-
12396
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12397
- for (int i = 0; i < N; i++) {
12398
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12399
- ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
12400
- float * C = (float *)dst->data + i * m * n; // [m * k]
12401
-
12402
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12403
- }
12404
- }
12405
-
12406
- static void ggml_compute_forward_conv_2d(
12407
- const struct ggml_compute_params * params,
12408
- const struct ggml_tensor * src0,
12409
- const struct ggml_tensor * src1,
12410
- struct ggml_tensor * dst) {
12411
- switch (src0->type) {
12412
- case GGML_TYPE_F16:
12413
- {
12414
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
12415
- } break;
12416
- case GGML_TYPE_F32:
12417
- {
12418
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
12419
- GGML_ASSERT(false);
12420
- } break;
12421
- default:
12422
- {
12423
- GGML_ASSERT(false);
12424
- } break;
12425
- }
12426
- }
12427
-
12428
- static void ggml_compute_forward_conv_2d_stage_0(
12429
- const struct ggml_compute_params * params,
12430
- const struct ggml_tensor * src0,
12431
- const struct ggml_tensor * src1,
12432
- struct ggml_tensor * dst) {
12433
- switch (src0->type) {
12434
- case GGML_TYPE_F16:
12435
- {
12436
- ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
12437
- } break;
12438
- case GGML_TYPE_F32:
12439
- {
12440
- GGML_ASSERT(false);
12441
- } break;
12442
- default:
12443
- {
12444
- GGML_ASSERT(false);
12445
- } break;
12446
- }
12447
- }
12448
-
12449
- static void ggml_compute_forward_conv_2d_stage_1(
11610
+ static void ggml_compute_forward_im2col(
12450
11611
  const struct ggml_compute_params * params,
12451
11612
  const struct ggml_tensor * src0,
12452
11613
  const struct ggml_tensor * src1,
@@ -12454,7 +11615,7 @@ static void ggml_compute_forward_conv_2d_stage_1(
12454
11615
  switch (src0->type) {
12455
11616
  case GGML_TYPE_F16:
12456
11617
  {
12457
- ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
11618
+ ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12458
11619
  } break;
12459
11620
  case GGML_TYPE_F32:
12460
11621
  {
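With this release the monolithic CONV_2D path is replaced by a single IM2COL op: the unfold step above writes the input into an [N*OH*OW, IC*KH*KW] scratch matrix, and the former stage-1 kernel becomes an ordinary f16 GEMM over that buffer. A minimal sketch of the shape bookkeeping, assuming the standard convolution output-size rule (the helper name is illustrative, not part of ggml):

    // Output extent along one axis; consistent with the index mapping
    // iiw = iow*s0 + ikw*d0 - p0 used by the im2col kernel above.
    static inline int conv_out_size(int in, int k, int s, int p, int d) {
        return (in + 2*p - d*(k - 1) - 1) / s + 1;
    }
    // After im2col, each batch element reduces to one matrix product:
    //   [N, OC, OH, OW] = [OC, IC*KH*KW] x [N*OH*OW, IC*KH*KW]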
@@ -12639,14 +11800,11 @@ static void ggml_compute_forward_pool_1d(
12639
11800
  ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
12640
11801
  }
12641
11802
 
12642
- // ggml_compute_forward_pool_2d_sk_p0
11803
+ // ggml_compute_forward_pool_2d
12643
11804
 
12644
- static void ggml_compute_forward_pool_2d_sk_p0(
11805
+ static void ggml_compute_forward_pool_2d(
12645
11806
  const struct ggml_compute_params * params,
12646
- const enum ggml_op_pool op,
12647
11807
  const struct ggml_tensor * src,
12648
- const int k0,
12649
- const int k1,
12650
11808
  struct ggml_tensor * dst) {
12651
11809
  assert(src->type == GGML_TYPE_F32);
12652
11810
  assert(params->ith == 0);
@@ -12655,6 +11813,14 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12655
11813
  return;
12656
11814
  }
12657
11815
 
11816
+ const int32_t * opts = (const int32_t *)dst->op_params;
11817
+ enum ggml_op_pool op = opts[0];
11818
+ const int k0 = opts[1];
11819
+ const int k1 = opts[2];
11820
+ const int s0 = opts[3];
11821
+ const int s1 = opts[4];
11822
+ const int p0 = opts[5];
11823
+ const int p1 = opts[6];
12658
11824
  const char * cdata = (const char*)src->data;
12659
11825
  const char * const data_end = cdata + ggml_nbytes(src);
12660
11826
 
@@ -12665,6 +11831,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12665
11831
  float * dplane = (float *)dst->data;
12666
11832
 
12667
11833
  const int ka = k0 * k1;
11834
+ const int offset0 = -p0;
11835
+ const int offset1 = -p1;
12668
11836
 
12669
11837
  while (cdata < data_end) {
12670
11838
  for (int oy = 0; oy < py; ++oy) {
@@ -12677,13 +11845,15 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12677
11845
  case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
12678
11846
  }
12679
11847
 
12680
- const int ix = ox * k0;
12681
- const int iy = oy * k1;
11848
+ const int ix = offset0 + ox * s0;
11849
+ const int iy = offset1 + oy * s1;
12682
11850
 
12683
11851
  for (int ky = 0; ky < k1; ++ky) {
11852
+ if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
12684
11853
  const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
12685
11854
  for (int kx = 0; kx < k0; ++kx) {
12686
11855
  int j = ix + kx;
11856
+ if (j < 0 || j >= src->ne[0]) continue;
12687
11857
  switch (op) {
12688
11858
  case GGML_OP_POOL_AVG: *out += srow[j]; break;
12689
11859
  case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
@@ -12700,31 +11870,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12700
11870
  }
12701
11871
 
12702
11872
  cdata += src->nb[2];
12703
- dplane += pa;
12704
- }
12705
- }
12706
-
12707
- // ggml_compute_forward_pool_2d
12708
-
12709
- static void ggml_compute_forward_pool_2d(
12710
- const struct ggml_compute_params * params,
12711
- const struct ggml_tensor * src0,
12712
- struct ggml_tensor * dst) {
12713
-
12714
- const int32_t * opts = (const int32_t *)dst->op_params;
12715
- enum ggml_op_pool op = opts[0];
12716
- const int k0 = opts[1];
12717
- const int k1 = opts[2];
12718
- const int s0 = opts[3];
12719
- const int s1 = opts[4];
12720
- const int p0 = opts[5];
12721
- const int p1 = opts[6];
12722
- GGML_ASSERT(p0 == 0);
12723
- GGML_ASSERT(p1 == 0); // padding not supported
12724
- GGML_ASSERT(k0 == s0);
12725
- GGML_ASSERT(k1 == s1); // only s = k supported
12726
-
12727
- ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
11873
+ dplane += pa;
11874
+ }
12728
11875
  }
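ggml_compute_forward_pool_2d now reads all seven pooling parameters from dst->op_params and supports stride and padding directly, instead of asserting s == k, p == 0 and forwarding to the old _sk_p0 kernel. The parameter layout and index mapping, summarized from the code above (a sketch, not additional API):

    // opts[0]    = op  (GGML_OP_POOL_AVG or GGML_OP_POOL_MAX)
    // opts[1..2] = k0, k1   kernel size
    // opts[3..4] = s0, s1   stride
    // opts[5..6] = p0, p1   padding
    //
    // A window tap (kx, ky) of output cell (ox, oy) reads input element
    //   (ox*s0 - p0 + kx, oy*s1 - p1 + ky)
    // and is skipped entirely when that position falls outside the source plane.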
12729
11876
 
12730
11877
  // ggml_compute_forward_upscale
@@ -13928,6 +13075,10 @@ static void ggml_compute_forward_unary(
13928
13075
  {
13929
13076
  ggml_compute_forward_silu(params, src0, dst);
13930
13077
  } break;
13078
+ case GGML_UNARY_OP_LEAKY:
13079
+ {
13080
+ ggml_compute_forward_leaky(params, src0, dst);
13081
+ } break;
13931
13082
  default:
13932
13083
  {
13933
13084
  GGML_ASSERT(false);
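The new GGML_UNARY_OP_LEAKY case is routed through ggml_compute_forward_unary here. For reference, a leaky ReLU passes positive inputs through unchanged and scales negative inputs by a small fixed slope (0.1 in this release's vector kernel); a standalone sketch, not the ggml implementation:

    static inline float leaky_ref(float x, float slope) {
        return x > 0.0f ? x : slope * x;   // e.g. leaky_ref(-2.0f, 0.1f) == -0.2f
    }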
@@ -14681,33 +13832,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14681
13832
  {
14682
13833
  ggml_compute_forward_clamp(params, tensor->src[0], tensor);
14683
13834
  } break;
14684
- case GGML_OP_CONV_1D:
14685
- {
14686
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
14687
- } break;
14688
- case GGML_OP_CONV_1D_STAGE_0:
14689
- {
14690
- ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14691
- } break;
14692
- case GGML_OP_CONV_1D_STAGE_1:
14693
- {
14694
- ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
14695
- } break;
14696
13835
  case GGML_OP_CONV_TRANSPOSE_1D:
14697
13836
  {
14698
13837
  ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
14699
13838
  } break;
14700
- case GGML_OP_CONV_2D:
14701
- {
14702
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14703
- } break;
14704
- case GGML_OP_CONV_2D_STAGE_0:
13839
+ case GGML_OP_IM2COL:
14705
13840
  {
14706
- ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14707
- } break;
14708
- case GGML_OP_CONV_2D_STAGE_1:
14709
- {
14710
- ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
13841
+ ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
14711
13842
  } break;
14712
13843
  case GGML_OP_CONV_TRANSPOSE_2D:
14713
13844
  {
@@ -14836,62 +13967,109 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14836
13967
 
14837
13968
  ////////////////////////////////////////////////////////////////////////////////
14838
13969
 
14839
- static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
13970
+ static size_t ggml_hash_size(size_t min_sz) {
13971
+ // next primes after powers of two
13972
+ static const size_t primes[] = {
13973
+ 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
13974
+ 2053, 4099, 8209, 16411, 32771, 65537, 131101,
13975
+ 262147, 524309, 1048583, 2097169, 4194319, 8388617,
13976
+ 16777259, 33554467, 67108879, 134217757, 268435459,
13977
+ 536870923, 1073741827, 2147483659
13978
+ };
13979
+ static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
13980
+
13981
+ // find the smallest prime that is larger or equal to min_sz
13982
+ size_t l = 0;
13983
+ size_t r = n_primes;
13984
+ while (l < r) {
13985
+ size_t m = (l + r)/2;
13986
+ if (primes[m] < min_sz) {
13987
+ l = m + 1;
13988
+ } else {
13989
+ r = m;
13990
+ }
13991
+ }
13992
+ size_t sz = l < n_primes ? primes[l] : min_sz | 1;
13993
+ return sz;
13994
+ }
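ggml_hash_size rounds a requested capacity up to the next prime in the table, which gives the open-addressing set below some headroom against clustering; requests beyond the last entry fall back to min_sz | 1. A few worked values, assuming the static function were callable from a test:

    assert(ggml_hash_size(32)   == 37);    // smallest listed prime >= 32
    assert(ggml_hash_size(2048) == 2053);
    assert(ggml_hash_size(2054) == 4099);  // rounds up to the next table entry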
14840
13995
 
14841
- static size_t hash(void * p) {
14842
- return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
13996
+ static size_t ggml_hash(const void * p) {
13997
+ return (size_t)p;
14843
13998
  }
14844
13999
 
14845
- static size_t hash_find(void * hash_table[], void * p) {
14846
- size_t h = hash(p);
14000
+ size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14001
+ size_t h = ggml_hash(key) % hash_set.size;
14847
14002
 
14848
14003
  // linear probing
14849
14004
  size_t i = h;
14850
- while (hash_table[i] != NULL && hash_table[i] != p) {
14851
- i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
14005
+ while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
14006
+ i = (i + 1) % hash_set.size;
14852
14007
  if (i == h) {
14853
14008
  // visited all hash table entries -> not found
14854
- return GGML_GRAPH_HASHTABLE_SIZE;
14009
+ return GGML_HASHTABLE_FULL;
14855
14010
  }
14856
14011
  }
14857
14012
  return i;
14858
14013
  }
14859
14014
 
14860
- static bool hash_insert(void * hash_table[], void * p) {
14861
- size_t i = hash_find(hash_table, p);
14015
+ bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14016
+ size_t i = ggml_hash_find(hash_set, key);
14017
+ return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
14018
+ }
14019
+
14020
+ size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14021
+ size_t i = ggml_hash_find(hash_set, key);
14862
14022
 
14863
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14023
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14864
14024
 
14865
- if (hash_table[i] == p) {
14866
- return true;
14025
+ if (hash_set.keys[i] == key) {
14026
+ return GGML_HASHTABLE_ALREADY_EXISTS;
14867
14027
  }
14868
14028
 
14869
14029
  // insert
14870
- GGML_ASSERT(hash_table[i] == NULL);
14871
- hash_table[i] = p;
14872
- return false;
14030
+ GGML_ASSERT(hash_set.keys[i] == NULL);
14031
+ hash_set.keys[i] = key;
14032
+ return i;
14033
+ }
14034
+
14035
+ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14036
+ size_t i = ggml_hash_find(hash_set, key);
14037
+
14038
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14039
+
14040
+ hash_set.keys[i] = key;
14041
+ return i;
14042
+ }
14043
+
14044
+ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
14045
+ size = ggml_hash_size(size);
14046
+ struct ggml_hash_set result;
14047
+ result.size = size;
14048
+ result.keys = malloc(sizeof(struct ggml_tensor *) * size);
14049
+ memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
14050
+ return result;
14873
14051
  }
14874
14052
 
14875
- static bool hash_contains(void * hash_table[], void * p) {
14876
- size_t i = hash_find(hash_table, p);
14877
- return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
14053
+ static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
14054
+ free(hash_set.keys);
14878
14055
  }
14879
14056
 
14880
14057
  struct hash_map {
14881
- void * keys[GGML_GRAPH_HASHTABLE_SIZE];
14882
- void * vals[GGML_GRAPH_HASHTABLE_SIZE];
14058
+ struct ggml_hash_set set;
14059
+ struct ggml_tensor ** vals;
14883
14060
  };
14884
14061
 
14885
- static struct hash_map * new_hash_map(void) {
14062
+ static struct hash_map * ggml_new_hash_map(size_t size) {
14886
14063
  struct hash_map * result = malloc(sizeof(struct hash_map));
14887
- for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
14888
- result->keys[i] = NULL;
14889
- result->vals[i] = NULL;
14890
- }
14064
+ result->set = ggml_hash_set_new(size);
14065
+ result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
14066
+ memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
14891
14067
  return result;
14892
14068
  }
14893
14069
 
14894
- static void free_hash_map(struct hash_map * map) {
14070
+ static void ggml_hash_map_free(struct hash_map * map) {
14071
+ ggml_hash_set_free(map->set);
14072
+ free(map->vals);
14895
14073
  free(map);
14896
14074
  }
14897
14075
 
@@ -14911,7 +14089,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14911
14089
  return node;
14912
14090
  }
14913
14091
 
14914
- if (!hash_contains(graph->visited_hash_table, node)) {
14092
+ if (!ggml_hash_contains(graph->visited_hash_table, node)) {
14915
14093
  return node;
14916
14094
  }
14917
14095
 
@@ -14926,17 +14104,17 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14926
14104
  return node;
14927
14105
  }
14928
14106
 
14929
- size_t i = hash_find(replacements->keys, node);
14930
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14931
- if (replacements->keys[i] == node) {
14932
- return (struct ggml_tensor *) replacements->vals[i];
14107
+ size_t i = ggml_hash_find(replacements->set, node);
14108
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
14109
+ if (replacements->set.keys[i] == node) {
14110
+ return replacements->vals[i];
14933
14111
  }
14934
14112
 
14935
14113
  struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14936
14114
 
14937
14115
  // insert clone into replacements
14938
- GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
14939
- replacements->keys[i] = node;
14116
+ GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
14117
+ replacements->set.keys[i] = node;
14940
14118
  replacements->vals[i] = clone;
14941
14119
 
14942
14120
  clone->op = node->op;
@@ -14973,26 +14151,26 @@ void ggml_build_backward_gradient_checkpointing(
14973
14151
  struct ggml_cgraph * gb_tmp,
14974
14152
  struct ggml_tensor * * checkpoints,
14975
14153
  int n_checkpoints) {
14976
- *gb_tmp = *gf;
14154
+ ggml_graph_cpy(gf, gb_tmp);
14977
14155
  ggml_build_backward_expand(ctx, gf, gb_tmp, true);
14978
14156
 
14979
14157
  if (n_checkpoints <= 0) {
14980
- *gb = *gb_tmp;
14158
+ ggml_graph_cpy(gb_tmp, gb);
14981
14159
  return;
14982
14160
  }
14983
14161
 
14984
- struct hash_map * replacements = new_hash_map();
14162
+ struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
14985
14163
 
14986
14164
  // insert checkpoints in replacements
14987
14165
  for (int i = 0; i < n_checkpoints; ++i) {
14988
- size_t k = hash_find(replacements->keys, checkpoints[i]);
14989
- GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14990
- GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
14991
- replacements->keys[k] = checkpoints[i];
14992
- replacements->vals[k] = checkpoints[i];
14166
+ size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
14167
+ GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
14168
+ GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
14169
+ replacements->set.keys[k] = checkpoints[i];
14170
+ replacements->vals[k] = checkpoints[i];
14993
14171
  }
14994
14172
 
14995
- *gb = *gf;
14173
+ ggml_graph_cpy(gf, gb);
14996
14174
  // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
14997
14175
  // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
14998
14176
  // by recomputing them from checkpoints
@@ -15009,21 +14187,21 @@ void ggml_build_backward_gradient_checkpointing(
15009
14187
  ggml_build_forward_expand(gb, node);
15010
14188
  }
15011
14189
 
15012
- free_hash_map(replacements);
14190
+ ggml_hash_map_free(replacements);
15013
14191
  }
15014
14192
 
15015
14193
  // functions to change gradients considering the case that input a might be initial gradient with zero value
15016
14194
 
15017
- static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
15018
- if (hash_contains(zero_table, a)) {
14195
+ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14196
+ if (ggml_hash_contains(zero_table, a)) {
15019
14197
  return b;
15020
14198
  } else {
15021
14199
  return ggml_add_impl(ctx, a, b, false);
15022
14200
  }
15023
14201
  }
15024
14202
 
15025
- static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) {
15026
- if (hash_contains(zero_table, a)) {
14203
+ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14204
+ if (ggml_hash_contains(zero_table, a)) {
15027
14205
  struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
15028
14206
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
15029
14207
  } else {
@@ -15031,23 +14209,23 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
15031
14209
  }
15032
14210
  }
15033
14211
 
15034
- static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
15035
- if (hash_contains(zero_table, a)) {
14212
+ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14213
+ if (ggml_hash_contains(zero_table, a)) {
15036
14214
  return ggml_repeat(ctx, b, a);
15037
14215
  } else {
15038
14216
  return ggml_add1_impl(ctx, a, b, false);
15039
14217
  }
15040
14218
  }
15041
14219
 
15042
- static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
15043
- if (hash_contains(zero_table, a)) {
14220
+ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14221
+ if (ggml_hash_contains(zero_table, a)) {
15044
14222
  return ggml_neg(ctx, b);
15045
14223
  } else {
15046
14224
  return ggml_sub_impl(ctx, a, b, false);
15047
14225
  }
15048
14226
  }
15049
14227
 
15050
- static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) {
14228
+ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
15051
14229
  struct ggml_tensor * src0 = tensor->src[0];
15052
14230
  struct ggml_tensor * src1 = tensor->src[1];
15053
14231
 
@@ -15559,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15559
14737
  // necessary for llama
15560
14738
  if (src0->grad) {
15561
14739
  //const int n_past = ((int32_t *) tensor->op_params)[0];
15562
- const int n_dims = ((int32_t *) tensor->op_params)[1];
15563
- const int mode = ((int32_t *) tensor->op_params)[2];
15564
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
15565
- float freq_base;
15566
- float freq_scale;
15567
- float xpos_base;
15568
- bool xpos_down;
15569
- memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
15570
- memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
15571
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
15572
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
14740
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
14741
+ const int mode = ((int32_t *) tensor->op_params)[2];
14742
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
14743
+ const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
14744
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
14745
+
14746
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
14747
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
14748
+ memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
14749
+ memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
14750
+ memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
14751
+ memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
14752
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
14753
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
15573
14754
 
15574
14755
  src0->grad = ggml_add_or_set(ctx,
15575
14756
  src0->grad,
@@ -15579,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15579
14760
  n_dims,
15580
14761
  mode,
15581
14762
  n_ctx,
14763
+ n_orig_ctx,
15582
14764
  freq_base,
15583
14765
  freq_scale,
14766
+ ext_factor,
14767
+ attn_factor,
14768
+ beta_fast,
14769
+ beta_slow,
15584
14770
  xpos_base,
15585
14771
  xpos_down),
15586
14772
  zero_table);
@@ -15590,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15590
14776
  {
15591
14777
  if (src0->grad) {
15592
14778
  //const int n_past = ((int32_t *) tensor->op_params)[0];
15593
- const int n_dims = ((int32_t *) tensor->op_params)[1];
15594
- const int mode = ((int32_t *) tensor->op_params)[2];
15595
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
15596
- float freq_base;
15597
- float freq_scale;
15598
- float xpos_base;
15599
- bool xpos_down;
15600
- memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
15601
- memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
15602
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
15603
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
14779
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
14780
+ const int mode = ((int32_t *) tensor->op_params)[2];
14781
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
14782
+ const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
14783
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
14784
+
14785
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
14786
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
14787
+ memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
14788
+ memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
14789
+ memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
14790
+ memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
14791
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
14792
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
15604
14793
 
15605
14794
  src0->grad = ggml_add_or_set(ctx,
15606
14795
  src0->grad,
@@ -15609,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15609
14798
  src1,
15610
14799
  n_dims,
15611
14800
  mode,
15612
- 0,
15613
14801
  n_ctx,
14802
+ n_orig_ctx,
15614
14803
  freq_base,
15615
14804
  freq_scale,
15616
- 0.0f,
15617
- 1.0f,
15618
- 0.0f,
15619
- 0.0f,
14805
+ ext_factor,
14806
+ attn_factor,
14807
+ beta_fast,
14808
+ beta_slow,
15620
14809
  xpos_base,
15621
14810
  xpos_down,
15622
14811
  false),
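Both rope backward branches now recover the full parameter set, including n_orig_ctx and the four YaRN-style scaling factors, instead of hard-coding defaults. The op_params slot layout implied by the reads above (each slot is int32-sized; the float values are copied out with memcpy rather than type-punned):

    //  [0] n_past (unused here)  [1] n_dims      [2] mode        [3] n_ctx
    //  [4] n_orig_ctx            [5] freq_base   [6] freq_scale  [7] ext_factor
    //  [8] attn_factor           [9] beta_fast  [10] beta_slow  [11] xpos_base
    // [12] xpos_down (stored as a bool)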
@@ -15631,31 +14820,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15631
14820
  {
15632
14821
  GGML_ASSERT(false); // TODO: not implemented
15633
14822
  } break;
15634
- case GGML_OP_CONV_1D:
15635
- {
15636
- GGML_ASSERT(false); // TODO: not implemented
15637
- } break;
15638
- case GGML_OP_CONV_1D_STAGE_0:
15639
- {
15640
- GGML_ASSERT(false); // TODO: not implemented
15641
- } break;
15642
- case GGML_OP_CONV_1D_STAGE_1:
15643
- {
15644
- GGML_ASSERT(false); // TODO: not implemented
15645
- } break;
15646
14823
  case GGML_OP_CONV_TRANSPOSE_1D:
15647
14824
  {
15648
14825
  GGML_ASSERT(false); // TODO: not implemented
15649
14826
  } break;
15650
- case GGML_OP_CONV_2D:
15651
- {
15652
- GGML_ASSERT(false); // TODO: not implemented
15653
- } break;
15654
- case GGML_OP_CONV_2D_STAGE_0:
15655
- {
15656
- GGML_ASSERT(false); // TODO: not implemented
15657
- } break;
15658
- case GGML_OP_CONV_2D_STAGE_1:
14827
+ case GGML_OP_IM2COL:
15659
14828
  {
15660
14829
  GGML_ASSERT(false); // TODO: not implemented
15661
14830
  } break;
@@ -15869,7 +15038,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15869
15038
  }
15870
15039
 
15871
15040
  // check if already visited
15872
- if (hash_insert(cgraph->visited_hash_table, node)) {
15041
+ if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
15873
15042
  return;
15874
15043
  }
15875
15044
 
@@ -15885,7 +15054,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15885
15054
 
15886
15055
  if (node->op == GGML_OP_NONE && node->grad == NULL) {
15887
15056
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
15888
- GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15057
+ GGML_ASSERT(cgraph->n_leafs < cgraph->size);
15889
15058
 
15890
15059
  if (strlen(node->name) == 0) {
15891
15060
  ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
@@ -15894,22 +15063,24 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15894
15063
  cgraph->leafs[cgraph->n_leafs] = node;
15895
15064
  cgraph->n_leafs++;
15896
15065
  } else {
15897
- GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15066
+ GGML_ASSERT(cgraph->n_nodes < cgraph->size);
15898
15067
 
15899
15068
  if (strlen(node->name) == 0) {
15900
15069
  ggml_format_name(node, "node_%d", cgraph->n_nodes);
15901
15070
  }
15902
15071
 
15903
15072
  cgraph->nodes[cgraph->n_nodes] = node;
15904
- cgraph->grads[cgraph->n_nodes] = node->grad;
15073
+ if (cgraph->grads) {
15074
+ cgraph->grads[cgraph->n_nodes] = node->grad;
15075
+ }
15905
15076
  cgraph->n_nodes++;
15906
15077
  }
15907
15078
  }
15908
15079
 
15909
15080
  static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
15910
15081
  if (!expand) {
15911
- cgraph->n_nodes = 0;
15912
- cgraph->n_leafs = 0;
15082
+ // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
15083
+ ggml_graph_clear(cgraph);
15913
15084
  }
15914
15085
 
15915
15086
  const int n0 = cgraph->n_nodes;
@@ -15930,25 +15101,6 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
15930
15101
  ggml_build_forward_impl(cgraph, tensor, true);
15931
15102
  }
15932
15103
 
15933
- struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
15934
- struct ggml_cgraph result = {
15935
- /*.n_nodes =*/ 0,
15936
- /*.n_leafs =*/ 0,
15937
- /*.nodes =*/ { NULL },
15938
- /*.grads =*/ { NULL },
15939
- /*.leafs =*/ { NULL },
15940
- /*.hash_table =*/ { NULL },
15941
- /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15942
- /*.perf_runs =*/ 0,
15943
- /*.perf_cycles =*/ 0,
15944
- /*.perf_time_us =*/ 0,
15945
- };
15946
-
15947
- ggml_build_forward_impl(&result, tensor, false);
15948
-
15949
- return result;
15950
- }
15951
-
15952
15104
  void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
15953
15105
  GGML_ASSERT(gf->n_nodes > 0);
15954
15106
 
@@ -15965,11 +15117,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15965
15117
  }
15966
15118
 
15967
15119
  // remember original gradients which start with zero values
15968
- void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE);
15969
- memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE);
15120
+ struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
15970
15121
  for (int i = 0; i < gf->n_nodes; i++) {
15971
15122
  if (gf->grads[i]) {
15972
- hash_insert(zero_table, gf->grads[i]);
15123
+ ggml_hash_insert(zero_table, gf->grads[i]);
15973
15124
  }
15974
15125
  }
15975
15126
 
@@ -15992,26 +15143,54 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15992
15143
  }
15993
15144
  }
15994
15145
 
15995
- free(zero_table);
15146
+ ggml_hash_set_free(zero_table);
15996
15147
  }
15997
15148
 
15998
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
15999
- struct ggml_cgraph result = *gf;
16000
- ggml_build_backward_expand(ctx, gf, &result, keep);
16001
- return result;
15149
+ static size_t ggml_graph_nbytes(size_t size, bool grads) {
15150
+ size_t nbytes = sizeof(struct ggml_cgraph);
15151
+ nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
15152
+ if (grads) {
15153
+ nbytes += size * sizeof(struct ggml_tensor *); // grads
15154
+ }
15155
+ nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
15156
+ return nbytes;
16002
15157
  }
16003
15158
 
16004
- struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
16005
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15159
+ size_t ggml_graph_overhead_custom(size_t size, bool grads) {
15160
+ return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
15161
+ }
15162
+
15163
+ size_t ggml_graph_overhead(void) {
15164
+ return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
15165
+ }
15166
+
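Graph storage is now sized up front: ggml_graph_nbytes packs the node, leaf, hash-key and optional gradient arrays into one allocation behind the ggml_cgraph header, and ggml_graph_overhead_custom adds the object header plus alignment padding. A sketch of the arithmetic (illustrative numbers, 8-byte pointers assumed):

    // size = 2048, grads enabled:
    //   sizeof(struct ggml_cgraph)
    // + 2048 * 8 * 2                 // nodes + leafs
    // + 2048 * 8                     // grads
    // + ggml_hash_size(2*2048) * 8   // hash keys -> 4099 slots
    // wrapped as GGML_OBJECT_SIZE + GGML_PAD(total, GGML_MEM_ALIGN).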
15167
+ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
15168
+ const size_t obj_size = ggml_graph_nbytes(size, grads);
15169
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
16006
15170
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
16007
15171
 
15172
+ struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
15173
+
15174
+ size_t hash_size = ggml_hash_size(size * 2);
15175
+ struct ggml_tensor ** nodes_ptr = data_start;
15176
+ struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
15177
+ struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
15178
+ struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
15179
+
15180
+ // check that we allocated the correct amount of memory
15181
+ assert(obj_size == (size_t) (
15182
+ (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
15183
+
15184
+ memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
15185
+
16008
15186
  *cgraph = (struct ggml_cgraph) {
15187
+ /*.size =*/ size,
16009
15188
  /*.n_nodes =*/ 0,
16010
15189
  /*.n_leafs =*/ 0,
16011
- /*.nodes =*/ { NULL },
16012
- /*.grads =*/ { NULL },
16013
- /*.leafs =*/ { NULL },
16014
- /*.hash_table =*/ { NULL },
15190
+ /*.nodes =*/ nodes_ptr,
15191
+ /*.grads =*/ grads_ptr,
15192
+ /*.leafs =*/ leafs_ptr,
15193
+ /*.hash_table =*/ { hash_size, hash_keys_ptr },
16015
15194
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
16016
15195
  /*.perf_runs =*/ 0,
16017
15196
  /*.perf_cycles =*/ 0,
@@ -16021,14 +15200,85 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
16021
15200
  return cgraph;
16022
15201
  }
16023
15202
 
16024
- struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
16025
- struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
16026
- ggml_build_forward_impl(cgraph, tensor, false);
15203
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15204
+ return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15205
+ }
15206
+
15207
+ struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15208
+ const size_t obj_size = sizeof(struct ggml_cgraph);
15209
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15210
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15211
+
15212
+ *cgraph = (struct ggml_cgraph) {
15213
+ /*.size =*/ 0,
15214
+ /*.n_nodes =*/ i1 - i0,
15215
+ /*.n_leafs =*/ 0,
15216
+ /*.nodes =*/ cgraph0->nodes + i0,
15217
+ /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
15218
+ /*.leafs =*/ NULL,
15219
+ /*.hash_table =*/ { 0, NULL },
15220
+ /*.order =*/ cgraph0->order,
15221
+ /*.perf_runs =*/ 0,
15222
+ /*.perf_cycles =*/ 0,
15223
+ /*.perf_time_us =*/ 0,
15224
+ };
15225
+
16027
15226
  return cgraph;
16028
15227
  }
16029
15228
 
16030
- size_t ggml_graph_overhead(void) {
16031
- return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15229
+ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
15230
+ GGML_ASSERT(dst->size >= src->n_leafs);
15231
+ GGML_ASSERT(dst->size >= src->n_nodes);
15232
+ GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
15233
+
15234
+ dst->n_leafs = src->n_leafs;
15235
+ dst->n_nodes = src->n_nodes;
15236
+ dst->order = src->order;
15237
+
15238
+ for (int i = 0; i < src->n_leafs; ++i) {
15239
+ dst->leafs[i] = src->leafs[i];
15240
+ }
15241
+
15242
+ for (int i = 0; i < src->n_nodes; ++i) {
15243
+ dst->nodes[i] = src->nodes[i];
15244
+ }
15245
+
15246
+ if (src->grads) {
15247
+ GGML_ASSERT(dst->grads != NULL);
15248
+ for (int i = 0; i < src->n_nodes; ++i) {
15249
+ dst->grads[i] = src->grads[i];
15250
+ }
15251
+ }
15252
+
15253
+ for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
15254
+ if (src->visited_hash_table.keys[i]) {
15255
+ ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
15256
+ }
15257
+ }
15258
+ }
15259
+
15260
+ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
15261
+ struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
15262
+ ggml_graph_cpy(cgraph, result);
15263
+ return result;
15264
+ }
15265
+
15266
+ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
15267
+ GGML_ASSERT(cgraph->grads != NULL);
15268
+
15269
+ for (int i = 0; i < cgraph->n_nodes; i++) {
15270
+ struct ggml_tensor * grad = cgraph->grads[i];
15271
+
15272
+ if (grad) {
15273
+ ggml_set_zero(grad);
15274
+ }
15275
+ }
15276
+ }
15277
+
15278
+ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
15279
+ cgraph->n_leafs = 0;
15280
+ cgraph->n_nodes = 0;
15281
+ memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
16032
15282
  }
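Together with the removal of the stack-allocated ggml_build_forward / ggml_build_backward entry points, graphs are now always created, copied, reset and cleared through this API. A typical sequence, as a sketch only (ctx and loss are assumed to exist; the size is illustrative):

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 4096, /*grads =*/ true);
    ggml_build_forward_expand(gf, loss);                 // fills nodes, leafs, visited set
    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);   // same size, grads carried over
    ggml_build_backward_expand(ctx, gf, gb, /*keep =*/ true);
    ggml_graph_reset(gb);                                // zero every gradient tensor
    ggml_graph_clear(gf);                                // reuse the buffer for a new graph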
16033
15283
 
16034
15284
  //
@@ -16140,45 +15390,266 @@ static void clear_numa_thread_affinity(void) {
16140
15390
  strerror(rv));
16141
15391
  }
16142
15392
 
16143
- CPU_FREE(cpus);
16144
- }
16145
- #else
16146
- // TODO: Windows etc.
16147
- // (the linux implementation may also work on BSD, someone should test)
16148
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16149
- static void clear_numa_thread_affinity(void) {}
16150
- #endif
16151
-
16152
- struct ggml_compute_state_shared {
16153
- const struct ggml_cgraph * cgraph;
16154
- const struct ggml_cplan * cplan;
16155
-
16156
- int64_t perf_node_start_cycles;
16157
- int64_t perf_node_start_time_us;
16158
-
16159
- const int n_threads;
16160
-
16161
- // synchronization primitives
16162
- atomic_int n_active; // num active threads
16163
- atomic_int node_n; // active graph node
16164
-
16165
- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
16166
- void * abort_callback_data;
16167
- };
16168
-
16169
- struct ggml_compute_state {
16170
- ggml_thread_t thrd;
16171
- int ith;
16172
- struct ggml_compute_state_shared * shared;
16173
- };
16174
-
16175
- static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16176
- int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16177
- int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15393
+ CPU_FREE(cpus);
15394
+ }
15395
+ #else
15396
+ // TODO: Windows etc.
15397
+ // (the linux implementation may also work on BSD, someone should test)
15398
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15399
+ static void clear_numa_thread_affinity(void) {}
15400
+ #endif
15401
+
15402
+ struct ggml_compute_state_shared {
15403
+ const struct ggml_cgraph * cgraph;
15404
+ const struct ggml_cplan * cplan;
15405
+
15406
+ int64_t perf_node_start_cycles;
15407
+ int64_t perf_node_start_time_us;
15408
+
15409
+ const int n_threads;
15410
+
15411
+ // synchronization primitives
15412
+ atomic_int n_active; // num active threads
15413
+ atomic_int node_n; // active graph node
15414
+
15415
+ bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15416
+ void * abort_callback_data;
15417
+ };
15418
+
15419
+ struct ggml_compute_state {
15420
+ ggml_thread_t thrd;
15421
+ int ith;
15422
+ struct ggml_compute_state_shared * shared;
15423
+ };
15424
+
15425
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
15426
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
15427
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15428
+
15429
+ node->perf_runs++;
15430
+ node->perf_cycles += cycles_cur;
15431
+ node->perf_time_us += time_us_cur;
15432
+ }
15433
+
15434
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15435
+ int n_tasks = 0;
15436
+
15437
+ switch (node->op) {
15438
+ case GGML_OP_CPY:
15439
+ case GGML_OP_DUP:
15440
+ case GGML_OP_ADD:
15441
+ case GGML_OP_ADD1:
15442
+ case GGML_OP_ACC:
15443
+ {
15444
+ n_tasks = n_threads;
15445
+ } break;
15446
+ case GGML_OP_SUB:
15447
+ case GGML_OP_DIV:
15448
+ case GGML_OP_SQR:
15449
+ case GGML_OP_SQRT:
15450
+ case GGML_OP_LOG:
15451
+ case GGML_OP_SUM:
15452
+ case GGML_OP_SUM_ROWS:
15453
+ case GGML_OP_MEAN:
15454
+ case GGML_OP_ARGMAX:
15455
+ case GGML_OP_REPEAT:
15456
+ case GGML_OP_REPEAT_BACK:
15457
+ {
15458
+ n_tasks = 1;
15459
+ } break;
15460
+ case GGML_OP_UNARY:
15461
+ switch (ggml_get_unary_op(node)) {
15462
+ case GGML_UNARY_OP_ABS:
15463
+ case GGML_UNARY_OP_SGN:
15464
+ case GGML_UNARY_OP_NEG:
15465
+ case GGML_UNARY_OP_STEP:
15466
+ case GGML_UNARY_OP_TANH:
15467
+ case GGML_UNARY_OP_ELU:
15468
+ case GGML_UNARY_OP_RELU:
15469
+ case GGML_UNARY_OP_LEAKY:
15470
+ {
15471
+ n_tasks = 1;
15472
+ } break;
15473
+
15474
+ case GGML_UNARY_OP_GELU:
15475
+ case GGML_UNARY_OP_GELU_QUICK:
15476
+ case GGML_UNARY_OP_SILU:
15477
+ {
15478
+ n_tasks = n_threads;
15479
+ } break;
15480
+ }
15481
+ break;
15482
+ case GGML_OP_SILU_BACK:
15483
+ case GGML_OP_MUL:
15484
+ case GGML_OP_NORM:
15485
+ case GGML_OP_RMS_NORM:
15486
+ case GGML_OP_RMS_NORM_BACK:
15487
+ case GGML_OP_GROUP_NORM:
15488
+ case GGML_OP_CONCAT:
15489
+ {
15490
+ n_tasks = n_threads;
15491
+ } break;
15492
+ case GGML_OP_MUL_MAT:
15493
+ {
15494
+ n_tasks = n_threads;
15495
+
15496
+ // TODO: use different scheduling for different matrix sizes
15497
+ //const int nr0 = ggml_nrows(node->src[0]);
15498
+ //const int nr1 = ggml_nrows(node->src[1]);
15499
+
15500
+ //n_tasks = MIN(n_threads, MAX(1, nr0/128));
15501
+ //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
15502
+
15503
+ #if defined(GGML_USE_CUBLAS)
15504
+ if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
15505
+ n_tasks = 1; // TODO: this actually is doing nothing
15506
+ // the threads are still spinning
15507
+ }
15508
+ #elif defined(GGML_USE_CLBLAST)
15509
+ if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
15510
+ n_tasks = 1; // TODO: this actually is doing nothing
15511
+ // the threads are still spinning
15512
+ }
15513
+ #endif
15514
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
15515
+ if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
15516
+ n_tasks = 1; // TODO: this actually is doing nothing
15517
+ // the threads are still spinning
15518
+ }
15519
+ #endif
15520
+ } break;
15521
+ case GGML_OP_OUT_PROD:
15522
+ {
15523
+ n_tasks = n_threads;
15524
+ } break;
15525
+ case GGML_OP_SCALE:
15526
+ case GGML_OP_SET:
15527
+ case GGML_OP_CONT:
15528
+ case GGML_OP_RESHAPE:
15529
+ case GGML_OP_VIEW:
15530
+ case GGML_OP_PERMUTE:
15531
+ case GGML_OP_TRANSPOSE:
15532
+ case GGML_OP_GET_ROWS:
15533
+ case GGML_OP_GET_ROWS_BACK:
15534
+ case GGML_OP_DIAG:
15535
+ {
15536
+ n_tasks = 1;
15537
+ } break;
15538
+ case GGML_OP_DIAG_MASK_ZERO:
15539
+ case GGML_OP_DIAG_MASK_INF:
15540
+ case GGML_OP_SOFT_MAX:
15541
+ case GGML_OP_SOFT_MAX_BACK:
15542
+ case GGML_OP_ROPE:
15543
+ case GGML_OP_ROPE_BACK:
15544
+ case GGML_OP_ADD_REL_POS:
15545
+ {
15546
+ n_tasks = n_threads;
15547
+ } break;
15548
+ case GGML_OP_ALIBI:
15549
+ {
15550
+ n_tasks = 1; //TODO
15551
+ } break;
15552
+ case GGML_OP_CLAMP:
15553
+ {
15554
+ n_tasks = 1; //TODO
15555
+ } break;
15556
+ case GGML_OP_CONV_TRANSPOSE_1D:
15557
+ {
15558
+ n_tasks = n_threads;
15559
+ } break;
15560
+ case GGML_OP_IM2COL:
15561
+ {
15562
+ n_tasks = n_threads;
15563
+ } break;
15564
+ case GGML_OP_CONV_TRANSPOSE_2D:
15565
+ {
15566
+ n_tasks = n_threads;
15567
+ } break;
15568
+ case GGML_OP_POOL_1D:
15569
+ case GGML_OP_POOL_2D:
15570
+ {
15571
+ n_tasks = 1;
15572
+ } break;
15573
+ case GGML_OP_UPSCALE:
15574
+ {
15575
+ n_tasks = n_threads;
15576
+ } break;
15577
+ case GGML_OP_FLASH_ATTN:
15578
+ {
15579
+ n_tasks = n_threads;
15580
+ } break;
15581
+ case GGML_OP_FLASH_FF:
15582
+ {
15583
+ n_tasks = n_threads;
15584
+ } break;
15585
+ case GGML_OP_FLASH_ATTN_BACK:
15586
+ {
15587
+ n_tasks = n_threads;
15588
+ } break;
15589
+ case GGML_OP_WIN_PART:
15590
+ case GGML_OP_WIN_UNPART:
15591
+ case GGML_OP_GET_REL_POS:
15592
+ case GGML_OP_MAP_UNARY:
15593
+ case GGML_OP_MAP_BINARY:
15594
+ case GGML_OP_MAP_CUSTOM1_F32:
15595
+ case GGML_OP_MAP_CUSTOM2_F32:
15596
+ case GGML_OP_MAP_CUSTOM3_F32:
15597
+ {
15598
+ n_tasks = 1;
15599
+ } break;
15600
+ case GGML_OP_MAP_CUSTOM1:
15601
+ {
15602
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
15603
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15604
+ n_tasks = n_threads;
15605
+ } else {
15606
+ n_tasks = MIN(p->n_tasks, n_threads);
15607
+ }
15608
+ } break;
15609
+ case GGML_OP_MAP_CUSTOM2:
15610
+ {
15611
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
15612
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15613
+ n_tasks = n_threads;
15614
+ } else {
15615
+ n_tasks = MIN(p->n_tasks, n_threads);
15616
+ }
15617
+ } break;
15618
+ case GGML_OP_MAP_CUSTOM3:
15619
+ {
15620
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
15621
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15622
+ n_tasks = n_threads;
15623
+ } else {
15624
+ n_tasks = MIN(p->n_tasks, n_threads);
15625
+ }
15626
+ } break;
15627
+ case GGML_OP_CROSS_ENTROPY_LOSS:
15628
+ {
15629
+ n_tasks = n_threads;
15630
+ } break;
15631
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
15632
+ {
15633
+ n_tasks = n_threads;
15634
+ } break;
15635
+ case GGML_OP_NONE:
15636
+ {
15637
+ n_tasks = 1;
15638
+ } break;
15639
+ case GGML_OP_COUNT:
15640
+ {
15641
+ GGML_ASSERT(false);
15642
+ } break;
15643
+ default:
15644
+ {
15645
+ printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
15646
+ GGML_ASSERT(false);
15647
+ } break;
15648
+ }
15649
+
15650
+ assert(n_tasks > 0);
16178
15651
 
16179
- node->perf_runs++;
16180
- node->perf_cycles += cycles_cur;
16181
- node->perf_time_us += time_us_cur;
15652
+ return n_tasks;
16182
15653
  }
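Task counts are no longer precomputed into cplan->n_tasks: ggml_get_n_tasks derives them per node, and the worker loop below calls it at both the FINALIZE and COMPUTE steps. For the custom map ops the requested count is clamped to the thread pool; the rule above works out to, for example:

    // with n_threads == 8:
    //   p->n_tasks == GGML_N_TASKS_MAX  ->  n_tasks == 8
    //   p->n_tasks == 4                 ->  n_tasks == MIN(4, 8) == 4
    //   p->n_tasks == 32                ->  n_tasks == MIN(32, 8) == 8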
16183
15654
 
16184
15655
  static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -16187,7 +15658,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16187
15658
  const struct ggml_cgraph * cgraph = state->shared->cgraph;
16188
15659
  const struct ggml_cplan * cplan = state->shared->cplan;
16189
15660
 
16190
- const int * n_tasks_arr = cplan->n_tasks;
16191
15661
  const int n_threads = state->shared->n_threads;
16192
15662
 
16193
15663
  set_numa_thread_affinity(state->ith, n_threads);
@@ -16212,9 +15682,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16212
15682
 
16213
15683
  if (node_n != -1) {
16214
15684
  /* FINALIZE */
16215
- struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
15685
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16216
15686
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16217
- params.nth = n_tasks_arr[node_n];
15687
+ params.nth = ggml_get_n_tasks(node, n_threads);
16218
15688
  ggml_compute_forward(&params, node);
16219
15689
  }
16220
15690
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -16225,7 +15695,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16225
15695
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16226
15696
 
16227
15697
  struct ggml_tensor * node = cgraph->nodes[node_n];
16228
- const int n_tasks = n_tasks_arr[node_n];
15698
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16229
15699
 
16230
15700
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16231
15701
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -16283,7 +15753,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16283
15753
 
16284
15754
  /* COMPUTE */
16285
15755
  struct ggml_tensor * node = cgraph->nodes[node_n];
16286
- const int n_tasks = n_tasks_arr[node_n];
15756
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16287
15757
 
16288
15758
  struct ggml_compute_params params = {
16289
15759
  /*.type =*/ GGML_TASK_COMPUTE,
@@ -16317,121 +15787,46 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16317
15787
 
16318
15788
  struct ggml_tensor * node = cgraph->nodes[i];
16319
15789
 
15790
+ size_t cur = 0;
15791
+
16320
15792
  switch (node->op) {
16321
15793
  case GGML_OP_CPY:
16322
15794
  case GGML_OP_DUP:
16323
15795
  {
16324
15796
  n_tasks = n_threads;
16325
15797
 
16326
- size_t cur = 0;
16327
15798
  if (ggml_is_quantized(node->type)) {
16328
15799
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16329
15800
  }
16330
-
16331
- work_size = MAX(work_size, cur);
16332
15801
  } break;
16333
15802
  case GGML_OP_ADD:
16334
15803
  case GGML_OP_ADD1:
16335
15804
  {
16336
15805
  n_tasks = n_threads;
16337
15806
 
16338
- size_t cur = 0;
16339
-
16340
15807
  if (ggml_is_quantized(node->src[0]->type)) {
16341
15808
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16342
15809
  }
16343
-
16344
- work_size = MAX(work_size, cur);
16345
15810
  } break;
16346
15811
  case GGML_OP_ACC:
16347
15812
  {
16348
15813
  n_tasks = n_threads;
16349
15814
 
16350
- size_t cur = 0;
16351
-
16352
15815
  if (ggml_is_quantized(node->src[0]->type)) {
16353
15816
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16354
15817
  }
16355
-
16356
- work_size = MAX(work_size, cur);
16357
- } break;
16358
- case GGML_OP_SUB:
16359
- case GGML_OP_DIV:
16360
- case GGML_OP_SQR:
16361
- case GGML_OP_SQRT:
16362
- case GGML_OP_LOG:
16363
- case GGML_OP_SUM:
16364
- case GGML_OP_SUM_ROWS:
16365
- case GGML_OP_MEAN:
16366
- case GGML_OP_ARGMAX:
16367
- case GGML_OP_REPEAT:
16368
- case GGML_OP_REPEAT_BACK:
16369
- {
16370
- n_tasks = 1;
16371
- } break;
16372
-
16373
- case GGML_OP_UNARY:
16374
- {
16375
- switch (ggml_get_unary_op(node)) {
16376
- case GGML_UNARY_OP_ABS:
16377
- case GGML_UNARY_OP_SGN:
16378
- case GGML_UNARY_OP_NEG:
16379
- case GGML_UNARY_OP_STEP:
16380
- case GGML_UNARY_OP_TANH:
16381
- case GGML_UNARY_OP_ELU:
16382
- case GGML_UNARY_OP_RELU:
16383
- {
16384
- n_tasks = 1;
16385
- } break;
16386
-
16387
- case GGML_UNARY_OP_GELU:
16388
- case GGML_UNARY_OP_GELU_QUICK:
16389
- case GGML_UNARY_OP_SILU:
16390
- {
16391
- n_tasks = n_threads;
16392
- } break;
16393
- }
16394
15818
  } break;
16395
- case GGML_OP_SILU_BACK:
16396
- case GGML_OP_MUL:
16397
- case GGML_OP_NORM:
16398
- case GGML_OP_RMS_NORM:
16399
- case GGML_OP_RMS_NORM_BACK:
16400
- case GGML_OP_GROUP_NORM:
16401
- {
16402
- n_tasks = n_threads;
16403
- } break;
16404
- case GGML_OP_CONCAT:
16405
15819
  case GGML_OP_MUL_MAT:
16406
15820
  {
16407
- n_tasks = n_threads;
16408
-
16409
- // TODO: use different scheduling for different matrix sizes
16410
- //const int nr0 = ggml_nrows(node->src[0]);
16411
- //const int nr1 = ggml_nrows(node->src[1]);
16412
-
16413
- //n_tasks = MIN(n_threads, MAX(1, nr0/128));
16414
- //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
16415
-
16416
- size_t cur = 0;
16417
15821
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
16418
15822
 
16419
- #if defined(GGML_USE_CUBLAS)
16420
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
16421
- n_tasks = 1; // TODO: this actually is doing nothing
16422
- // the threads are still spinning
16423
- } else
16424
- #elif defined(GGML_USE_CLBLAST)
15823
+ #if defined(GGML_USE_CLBLAST)
16425
15824
  if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
16426
- n_tasks = 1; // TODO: this actually is doing nothing
16427
- // the threads are still spinning
16428
15825
  cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
16429
15826
  } else
16430
15827
  #endif
16431
15828
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16432
15829
  if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16433
- n_tasks = 1; // TODO: this actually is doing nothing
16434
- // the threads are still spinning
16435
15830
  if (node->src[0]->type != GGML_TYPE_F32) {
16436
15831
  // here we need memory just for single 2D matrix from src0
16437
15832
  cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -16440,108 +15835,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16440
15835
  #endif
16441
15836
  if (node->src[1]->type != vec_dot_type) {
16442
15837
  cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16443
- } else {
16444
- cur = 0;
16445
15838
  }
16446
-
16447
- work_size = MAX(work_size, cur);
16448
15839
  } break;
16449
15840
  case GGML_OP_OUT_PROD:
16450
15841
  {
16451
15842
  n_tasks = n_threads;
16452
15843
 
16453
- size_t cur = 0;
16454
-
16455
15844
  if (ggml_is_quantized(node->src[0]->type)) {
16456
15845
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16457
15846
  }
16458
-
16459
- work_size = MAX(work_size, cur);
16460
- } break;
16461
- case GGML_OP_SCALE:
16462
- {
16463
- n_tasks = 1;
16464
- } break;
16465
- case GGML_OP_SET:
16466
- case GGML_OP_CONT:
16467
- case GGML_OP_RESHAPE:
16468
- case GGML_OP_VIEW:
16469
- case GGML_OP_PERMUTE:
16470
- case GGML_OP_TRANSPOSE:
16471
- case GGML_OP_GET_ROWS:
16472
- case GGML_OP_GET_ROWS_BACK:
16473
- case GGML_OP_DIAG:
16474
- {
16475
- n_tasks = 1;
16476
- } break;
16477
- case GGML_OP_DIAG_MASK_ZERO:
16478
- case GGML_OP_DIAG_MASK_INF:
16479
- case GGML_OP_SOFT_MAX:
16480
- case GGML_OP_SOFT_MAX_BACK:
16481
- case GGML_OP_ROPE:
16482
- case GGML_OP_ROPE_BACK:
16483
- case GGML_OP_ADD_REL_POS:
16484
- {
16485
- n_tasks = n_threads;
16486
- } break;
16487
- case GGML_OP_ALIBI:
16488
- {
16489
- n_tasks = 1; //TODO
16490
- } break;
16491
- case GGML_OP_CLAMP:
16492
- {
16493
- n_tasks = 1; //TODO
16494
- } break;
16495
- case GGML_OP_CONV_1D:
16496
- {
16497
- n_tasks = n_threads;
16498
-
16499
- GGML_ASSERT(node->src[0]->ne[3] == 1);
16500
- GGML_ASSERT(node->src[1]->ne[2] == 1);
16501
- GGML_ASSERT(node->src[1]->ne[3] == 1);
16502
-
16503
- const int64_t ne00 = node->src[0]->ne[0];
16504
- const int64_t ne01 = node->src[0]->ne[1];
16505
- const int64_t ne02 = node->src[0]->ne[2];
16506
-
16507
- const int64_t ne10 = node->src[1]->ne[0];
16508
- const int64_t ne11 = node->src[1]->ne[1];
16509
-
16510
- const int64_t ne0 = node->ne[0];
16511
- const int64_t ne1 = node->ne[1];
16512
- const int64_t nk = ne00;
16513
- const int64_t ew0 = nk * ne01;
16514
-
16515
- UNUSED(ne02);
16516
- UNUSED(ne10);
16517
- UNUSED(ne11);
16518
-
16519
- size_t cur = 0;
16520
-
16521
- if (node->src[0]->type == GGML_TYPE_F16 &&
16522
- node->src[1]->type == GGML_TYPE_F32) {
16523
- cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
16524
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16525
- node->src[1]->type == GGML_TYPE_F32) {
16526
- cur = sizeof(float)*(ne0*ne1*ew0);
16527
- } else {
16528
- GGML_ASSERT(false);
16529
- }
16530
-
16531
- work_size = MAX(work_size, cur);
16532
- } break;
16533
- case GGML_OP_CONV_1D_STAGE_0:
16534
- {
16535
- n_tasks = n_threads;
16536
- } break;
16537
- case GGML_OP_CONV_1D_STAGE_1:
16538
- {
16539
- n_tasks = n_threads;
16540
15847
  } break;
16541
15848
  case GGML_OP_CONV_TRANSPOSE_1D:
16542
15849
  {
16543
- n_tasks = n_threads;
16544
-
16545
15850
  GGML_ASSERT(node->src[0]->ne[3] == 1);
16546
15851
  GGML_ASSERT(node->src[1]->ne[2] == 1);
16547
15852
  GGML_ASSERT(node->src[1]->ne[3] == 1);
@@ -16553,7 +15858,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16553
15858
  const int64_t ne10 = node->src[1]->ne[0]; // L
16554
15859
  const int64_t ne11 = node->src[1]->ne[1]; // Cin
16555
15860
 
16556
- size_t cur = 0;
16557
15861
  if (node->src[0]->type == GGML_TYPE_F16 &&
16558
15862
  node->src[1]->type == GGML_TYPE_F32) {
16559
15863
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
@@ -16565,59 +15869,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16565
15869
  } else {
16566
15870
  GGML_ASSERT(false);
16567
15871
  }
16568
-
16569
- work_size = MAX(work_size, cur);
16570
- } break;
16571
- case GGML_OP_CONV_2D:
16572
- {
16573
- n_tasks = n_threads;
16574
-
16575
- const int64_t ne00 = node->src[0]->ne[0]; // W
16576
- const int64_t ne01 = node->src[0]->ne[1]; // H
16577
- const int64_t ne02 = node->src[0]->ne[2]; // C
16578
- const int64_t ne03 = node->src[0]->ne[3]; // N
16579
-
16580
- const int64_t ne10 = node->src[1]->ne[0]; // W
16581
- const int64_t ne11 = node->src[1]->ne[1]; // H
16582
- const int64_t ne12 = node->src[1]->ne[2]; // C
16583
-
16584
- const int64_t ne0 = node->ne[0];
16585
- const int64_t ne1 = node->ne[1];
16586
- const int64_t ne2 = node->ne[2];
16587
- const int64_t ne3 = node->ne[3];
16588
- const int64_t nk = ne00*ne01;
16589
- const int64_t ew0 = nk * ne02;
16590
-
16591
- UNUSED(ne03);
16592
- UNUSED(ne2);
16593
-
16594
- size_t cur = 0;
16595
-
16596
- if (node->src[0]->type == GGML_TYPE_F16 &&
16597
- node->src[1]->type == GGML_TYPE_F32) {
16598
- // im2col: [N*OH*OW, IC*KH*KW]
16599
- cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
16600
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16601
- node->src[1]->type == GGML_TYPE_F32) {
16602
- cur = sizeof(float)* (ne10*ne11*ne12);
16603
- } else {
16604
- GGML_ASSERT(false);
16605
- }
16606
-
16607
- work_size = MAX(work_size, cur);
16608
- } break;
16609
- case GGML_OP_CONV_2D_STAGE_0:
16610
- {
16611
- n_tasks = n_threads;
16612
15872
  } break;
16613
- case GGML_OP_CONV_2D_STAGE_1:
15873
+ case GGML_OP_IM2COL:
16614
15874
  {
16615
15875
  n_tasks = n_threads;
16616
15876
  } break;
16617
15877
  case GGML_OP_CONV_TRANSPOSE_2D:
16618
15878
  {
16619
- n_tasks = n_threads;
16620
-
16621
15879
  const int64_t ne00 = node->src[0]->ne[0]; // W
16622
15880
  const int64_t ne01 = node->src[0]->ne[1]; // H
16623
15881
  const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
@@ -16627,141 +15885,66 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16627
15885
  const int64_t ne11 = node->src[1]->ne[1]; // H
16628
15886
  const int64_t ne12 = node->src[1]->ne[2]; // Channels In
16629
15887
 
16630
- size_t cur = 0;
16631
15888
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
16632
15889
  cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
16633
-
16634
- work_size = MAX(work_size, cur);
16635
- } break;
16636
- case GGML_OP_POOL_1D:
16637
- case GGML_OP_POOL_2D:
16638
- {
16639
- n_tasks = 1;
16640
- } break;
16641
- case GGML_OP_UPSCALE:
16642
- {
16643
- n_tasks = n_threads;
16644
15890
  } break;
16645
15891
  case GGML_OP_FLASH_ATTN:
16646
15892
  {
16647
15893
  n_tasks = n_threads;
16648
15894
 
16649
- size_t cur = 0;
16650
-
16651
15895
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16652
15896
 
16653
15897
  if (node->src[1]->type == GGML_TYPE_F32) {
16654
15898
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16655
15899
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16656
- }
16657
-
16658
- if (node->src[1]->type == GGML_TYPE_F16) {
15900
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16659
15901
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16660
15902
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16661
15903
  }
16662
-
16663
- work_size = MAX(work_size, cur);
16664
15904
  } break;
16665
15905
  case GGML_OP_FLASH_FF:
16666
15906
  {
16667
15907
  n_tasks = n_threads;
16668
15908
 
16669
- size_t cur = 0;
16670
-
16671
15909
  if (node->src[1]->type == GGML_TYPE_F32) {
16672
15910
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16673
15911
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16674
- }
16675
-
16676
- if (node->src[1]->type == GGML_TYPE_F16) {
15912
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16677
15913
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16678
15914
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16679
15915
  }
16680
-
16681
- work_size = MAX(work_size, cur);
16682
15916
  } break;
16683
15917
  case GGML_OP_FLASH_ATTN_BACK:
16684
15918
  {
16685
15919
  n_tasks = n_threads;
16686
15920
 
16687
- size_t cur = 0;
16688
-
16689
15921
  const int64_t D = node->src[0]->ne[0];
16690
15922
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16691
15923
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
16692
15924
  if (node->src[1]->type == GGML_TYPE_F32) {
16693
15925
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16694
15926
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16695
- }
16696
-
16697
- if (node->src[1]->type == GGML_TYPE_F16) {
15927
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16698
15928
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16699
15929
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16700
15930
  }
16701
-
16702
- work_size = MAX(work_size, cur);
16703
- } break;
16704
- case GGML_OP_WIN_PART:
16705
- case GGML_OP_WIN_UNPART:
16706
- case GGML_OP_GET_REL_POS:
16707
- case GGML_OP_MAP_UNARY:
16708
- case GGML_OP_MAP_BINARY:
16709
- case GGML_OP_MAP_CUSTOM1_F32:
16710
- case GGML_OP_MAP_CUSTOM2_F32:
16711
- case GGML_OP_MAP_CUSTOM3_F32:
16712
- {
16713
- n_tasks = 1;
16714
- } break;
16715
- case GGML_OP_MAP_CUSTOM1:
16716
- {
16717
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
16718
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16719
- n_tasks = n_threads;
16720
- } else {
16721
- n_tasks = MIN(p->n_tasks, n_threads);
16722
- }
16723
- } break;
16724
- case GGML_OP_MAP_CUSTOM2:
16725
- {
16726
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
16727
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16728
- n_tasks = n_threads;
16729
- } else {
16730
- n_tasks = MIN(p->n_tasks, n_threads);
16731
- }
16732
- } break;
16733
- case GGML_OP_MAP_CUSTOM3:
16734
- {
16735
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
16736
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16737
- n_tasks = n_threads;
16738
- } else {
16739
- n_tasks = MIN(p->n_tasks, n_threads);
16740
- }
16741
15931
  } break;
15932
+
16742
15933
  case GGML_OP_CROSS_ENTROPY_LOSS:
16743
15934
  {
16744
15935
  n_tasks = n_threads;
16745
15936
 
16746
- size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16747
-
16748
- work_size = MAX(work_size, cur);
16749
- } break;
16750
- case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16751
- {
16752
- n_tasks = n_threads;
16753
- } break;
16754
- case GGML_OP_NONE:
16755
- {
16756
- n_tasks = 1;
15937
+ cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16757
15938
  } break;
16758
15939
  case GGML_OP_COUNT:
16759
15940
  {
16760
15941
  GGML_ASSERT(false);
16761
15942
  } break;
15943
+ default:
15944
+ break;
16762
15945
  }
16763
15946
 
16764
- cplan.n_tasks[i] = n_tasks;
15947
+ work_size = MAX(work_size, cur);
16765
15948
  }
16766
15949
 
16767
15950
  if (work_size > 0) {
@@ -16783,12 +15966,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16783
15966
  if (cplan->work_size > 0) {
16784
15967
  GGML_ASSERT(cplan->work_data);
16785
15968
  }
16786
-
16787
- for (int i = 0; i < cgraph->n_nodes; ++i) {
16788
- if (cgraph->nodes[i]->op != GGML_OP_NONE) {
16789
- GGML_ASSERT(cplan->n_tasks[i] > 0);
16790
- }
16791
- }
16792
15969
  }
16793
15970
 
16794
15971
  const int n_threads = cplan->n_threads;
@@ -16861,16 +16038,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16861
16038
  return compute_status;
16862
16039
  }
16863
16040
 
16864
- void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16865
- for (int i = 0; i < cgraph->n_nodes; i++) {
16866
- struct ggml_tensor * grad = cgraph->grads[i];
16867
-
16868
- if (grad) {
16869
- ggml_set_zero(grad);
16870
- }
16871
- }
16872
- }
16873
-
16874
16041
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16875
16042
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16876
16043
 
@@ -16997,12 +16164,12 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16997
16164
  const uint32_t magic = GGML_FILE_MAGIC;
16998
16165
  const uint32_t version = GGML_FILE_VERSION;
16999
16166
  const uint32_t n_leafs = cgraph->n_leafs;
17000
- const uint32_t nodes = cgraph->n_nodes;
16167
+ const uint32_t n_nodes = cgraph->n_nodes;
17001
16168
 
17002
16169
  fwrite(&magic, sizeof(uint32_t), 1, fout);
17003
16170
  fwrite(&version, sizeof(uint32_t), 1, fout);
17004
16171
  fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
17005
- fwrite(&nodes, sizeof(uint32_t), 1, fout);
16172
+ fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
17006
16173
  fwrite(&size_eval, sizeof(uint64_t), 1, fout);
17007
16174
  }
17008
16175
 
@@ -17090,7 +16257,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17090
16257
  if (idx == -1) {
17091
16258
  for (int k = 0; k < cgraph->n_nodes; ++k) {
17092
16259
  if (args[j] == cgraph->nodes[k]) {
17093
- idx = GGML_MAX_NODES + k;
16260
+ idx = cgraph->n_leafs + k;
17094
16261
  break;
17095
16262
  }
17096
16263
  }
@@ -17117,11 +16284,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17117
16284
  }
17118
16285
  }
17119
16286
 
17120
- struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16287
+ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
17121
16288
  assert(*ctx_data == NULL);
17122
16289
  assert(*ctx_eval == NULL);
17123
16290
 
17124
- struct ggml_cgraph result = { 0 };
16291
+ struct ggml_cgraph * result = NULL;
17125
16292
 
17126
16293
  struct ggml_tensor * data = NULL;
17127
16294
 
@@ -17193,13 +16360,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17193
16360
  const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
17194
16361
  const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
17195
16362
  const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
17196
-
17197
- result.n_leafs = n_leafs;
17198
- result.n_nodes = n_nodes;
16363
+ const int graph_size = MAX(n_leafs, n_nodes);
17199
16364
 
17200
16365
  // create the data context
17201
16366
  {
17202
- const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
16367
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
17203
16368
 
17204
16369
  struct ggml_init_params params = {
17205
16370
  .mem_size = size_eval + overhead,
@@ -17215,6 +16380,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17215
16380
  }
17216
16381
  }
17217
16382
 
16383
+ result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
16384
+
16385
+ result->n_leafs = n_leafs;
16386
+ result->n_nodes = n_nodes;
16387
+
16388
+
17218
16389
  // leafs
17219
16390
  {
17220
16391
  uint32_t type;
@@ -17253,7 +16424,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17253
16424
  tensor->nb[j] = nb[j];
17254
16425
  }
17255
16426
 
17256
- result.leafs[i] = tensor;
16427
+ result->leafs[i] = tensor;
17257
16428
 
17258
16429
  ptr += ggml_nbytes(tensor);
17259
16430
 
@@ -17305,10 +16476,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17305
16476
  continue;
17306
16477
  }
17307
16478
 
17308
- if (arg_idx < GGML_MAX_NODES) {
17309
- args[j] = result.leafs[arg_idx];
16479
+ if (arg_idx < result->n_leafs) {
16480
+ args[j] = result->leafs[arg_idx];
17310
16481
  } else {
17311
- args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
16482
+ args[j] = result->nodes[arg_idx - result->n_leafs];
17312
16483
  }
17313
16484
  }
17314
16485
 
@@ -17360,7 +16531,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17360
16531
  tensor->src[j] = args[j];
17361
16532
  }
17362
16533
 
17363
- result.nodes[i] = tensor;
16534
+ result->nodes[i] = tensor;
17364
16535
 
17365
16536
  fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17366
16537
  }
@@ -18265,10 +17436,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18265
17436
  case GGML_OPT_ADAM:
18266
17437
  {
18267
17438
  result = (struct ggml_opt_params) {
18268
- .type = GGML_OPT_ADAM,
18269
- .n_threads = 1,
18270
- .past = 0,
18271
- .delta = 1e-5f,
17439
+ .type = GGML_OPT_ADAM,
17440
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17441
+ .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
17442
+ .past = 0,
17443
+ .delta = 1e-5f,
18272
17444
 
18273
17445
  .max_no_improvement = 100,
18274
17446
 
@@ -18295,10 +17467,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18295
17467
  case GGML_OPT_LBFGS:
18296
17468
  {
18297
17469
  result = (struct ggml_opt_params) {
18298
- .type = GGML_OPT_LBFGS,
18299
- .n_threads = 1,
18300
- .past = 0,
18301
- .delta = 1e-5f,
17470
+ .type = GGML_OPT_LBFGS,
17471
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17472
+ .n_threads = 1,
17473
+ .past = 0,
17474
+ .delta = 1e-5f,
18302
17475
 
18303
17476
  .max_no_improvement = 0,
18304
17477
 
@@ -18440,14 +17613,11 @@ enum ggml_opt_result ggml_opt_resume(
18440
17613
  struct ggml_tensor * f) {
18441
17614
 
18442
17615
  // build forward + backward compute graphs
18443
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18444
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18445
-
18446
- struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
18447
- struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
17616
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
17617
+ ggml_build_forward_expand(gf, f);
18448
17618
 
18449
- *gf = ggml_build_forward (f);
18450
- *gb = ggml_build_backward(ctx, gf, true);
17619
+ struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
17620
+ ggml_build_backward_expand(ctx, gf, gb, true);
18451
17621
 
18452
17622
  return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
18453
17623
  }
@@ -18903,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18903
18073
  {
18904
18074
  ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
18905
18075
 
18906
- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
18076
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
18907
18077
  struct gguf_kv * kv = &ctx->kv[i];
18908
18078
 
18909
18079
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18950,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18950
18120
  case GGUF_TYPE_STRING:
18951
18121
  {
18952
18122
  kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
18953
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
18123
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
18954
18124
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
18955
18125
  }
18956
18126
  } break;
@@ -18978,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18978
18148
  {
18979
18149
  ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
18980
18150
 
18981
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18151
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18982
18152
  struct gguf_tensor_info * info = &ctx->infos[i];
18983
18153
 
18984
18154
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -19025,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19025
18195
  // compute the total size of the data section, taking into account the alignment
19026
18196
  {
19027
18197
  ctx->size = 0;
19028
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18198
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
19029
18199
  struct gguf_tensor_info * info = &ctx->infos[i];
19030
18200
 
19031
18201
  const int64_t ne =
@@ -19094,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19094
18264
  ggml_set_no_alloc(ctx_data, true);
19095
18265
 
19096
18266
  // create the tensors
19097
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18267
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
19098
18268
  const int64_t ne[GGML_MAX_DIMS] = {
19099
18269
  ctx->infos[i].ne[0],
19100
18270
  ctx->infos[i].ne[1],