llama_cpp 0.9.2 → 0.9.3

@@ -100,6 +100,49 @@ typedef void * thread_ret_t;
100
100
  #include <hbwmalloc.h>
101
101
  #endif
102
102
 
103
+ #if defined(__APPLE__)
104
+ #include <TargetConditionals.h>
105
+ #endif
106
+
107
+ #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
108
+ (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
109
+
110
+ #include <sys/wait.h>
111
+
112
+ void ggml_print_backtrace(void) {
113
+ /*
114
+ #include <execinfo.h>
115
+ #include <dlfcn.h>
116
+
117
+ void * trace[100];
118
+
119
+ int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
120
+
121
+ backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
122
+ */
123
+
124
+ // backtrace_symbols does not show line numbers, use gdb instead
125
+ char attach[32];
126
+ snprintf(attach, sizeof(attach), "attach %d", getpid());
127
+ int pid = fork();
128
+ if (pid == 0) {
129
+ execlp("gdb", "gdb", "--batch",
130
+ "-ex", "set style enabled on",
131
+ "-ex", attach,
132
+ "-ex", "bt -frame-info source-and-location",
133
+ "-ex", "detach",
134
+ "-ex", "quit",
135
+ NULL);
136
+ } else {
137
+ waitpid(pid, NULL, 0);
138
+ }
139
+ }
140
+ #else
141
+ void ggml_print_backtrace(void) {
142
+ // platform not supported
143
+ }
144
+ #endif
145
+
103
146
  /*#define GGML_PERF*/
104
147
  #define GGML_DEBUG 0
105
148
  #define GGML_GELU_FP16
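The new ggml_print_backtrace() forks and attaches gdb to the current process to print a source-annotated backtrace. A minimal sketch of how a caller might wire it to a fatal signal is shown below; the handler name and signal choice are illustrative assumptions, not part of ggml, and fork/exec from a signal handler is not strictly async-signal-safe, so this is debug-only usage.

#include <signal.h>
#include <stdlib.h>

void ggml_print_backtrace(void); // provided by ggml.c above

// hypothetical crash hook: dump a gdb backtrace, then exit
static void crash_handler(int sig) {
    (void) sig;
    ggml_print_backtrace(); // requires gdb on PATH and ptrace permission
    _exit(1);
}

int main(void) {
    signal(SIGSEGV, crash_handler);
    // ... run model code ...
    return 0;
}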
@@ -228,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
228
271
  // floating point type used to accumulate sums
229
272
  typedef double ggml_float;
230
273
 
274
+ #undef MIN
275
+ #undef MAX
276
+
277
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
278
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
279
+
231
280
  //
232
281
  // global data
233
282
  //
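The MIN/MAX macros above are first #undef'd, presumably to avoid clashing with definitions pulled in by system headers. As with any function-like macro, each argument appears twice in the expansion, so they should only be fed side-effect-free expressions; a hypothetical misuse for illustration:

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int i = 0;
int m = MAX(i++, 5); // expands to ((i++) > (5) ? (i++) : (5)) - i++ can be evaluated twice
int n = MAX(i, 5);   // fine: plain values may be read twice without harm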
@@ -561,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
561
610
  // simd mappings
562
611
  //
563
612
 
613
+ #if defined(__ARM_NEON)
614
+ #if !defined(__aarch64__)
615
+
616
+ // 64-bit compatibility
617
+
618
+ inline static float vaddvq_f32(float32x4_t v) {
619
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
620
+ }
621
+
622
+ #endif
623
+ #endif
624
+
564
625
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
565
626
  // we then implement the fundamental computation operations below using only these macros
566
627
  // adding support for new architectures requires to define the corresponding SIMD macros
@@ -1352,6 +1413,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1352
1413
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1353
1414
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1354
1415
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1416
+ inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
1355
1417
 
1356
1418
  static const float GELU_COEF_A = 0.044715f;
1357
1419
  static const float GELU_QUICK_COEF = -1.702f;
@@ -1572,13 +1634,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1572
1634
  "ROPE_BACK",
1573
1635
  "ALIBI",
1574
1636
  "CLAMP",
1575
- "CONV_1D",
1576
- "CONV_1D_STAGE_0",
1577
- "CONV_1D_STAGE_1",
1578
1637
  "CONV_TRANSPOSE_1D",
1579
- "CONV_2D",
1580
- "CONV_2D_STAGE_0",
1581
- "CONV_2D_STAGE_1",
1638
+ "IM2COL",
1582
1639
  "CONV_TRANSPOSE_2D",
1583
1640
  "POOL_1D",
1584
1641
  "POOL_2D",
@@ -1609,7 +1666,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1609
1666
  "CROSS_ENTROPY_LOSS_BACK",
1610
1667
  };
1611
1668
 
1612
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1669
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1613
1670
 
1614
1671
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1615
1672
  "none",
@@ -1659,13 +1716,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1659
1716
  "rope_back(x)",
1660
1717
  "alibi(x)",
1661
1718
  "clamp(x)",
1662
- "conv_1d(x)",
1663
- "conv_1d_stage_0(x)",
1664
- "conv_1d_stage_1(x)",
1665
1719
  "conv_transpose_1d(x)",
1666
- "conv_2d(x)",
1667
- "conv_2d_stage_0(x)",
1668
- "conv_2d_stage_1(x)",
1720
+ "im2col(x)",
1669
1721
  "conv_transpose_2d(x)",
1670
1722
  "pool_1d(x)",
1671
1723
  "pool_2d(x)",
@@ -1696,7 +1748,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1696
1748
  "cross_entropy_loss_back(x,y)",
1697
1749
  };
1698
1750
 
1699
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1751
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1700
1752
 
1701
1753
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1702
1754
 
@@ -1724,13 +1776,7 @@ static void ggml_setup_op_has_task_pass(void) {
1724
1776
  p[GGML_OP_GET_ROWS_BACK ] = true;
1725
1777
  p[GGML_OP_DIAG_MASK_INF ] = true;
1726
1778
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
1727
- p[GGML_OP_CONV_1D ] = true;
1728
- p[GGML_OP_CONV_1D_STAGE_0 ] = true;
1729
- p[GGML_OP_CONV_1D_STAGE_1 ] = true;
1730
1779
  p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
1731
- p[GGML_OP_CONV_2D ] = true;
1732
- p[GGML_OP_CONV_2D_STAGE_0 ] = true;
1733
- p[GGML_OP_CONV_2D_STAGE_1 ] = true;
1734
1780
  p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
1735
1781
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
1736
1782
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
@@ -3769,6 +3815,14 @@ struct ggml_tensor * ggml_relu_inplace(
3769
3815
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3770
3816
  }
3771
3817
 
3818
+ // ggml_leaky
3819
+
3820
+ struct ggml_tensor * ggml_leaky(
3821
+ struct ggml_context * ctx,
3822
+ struct ggml_tensor * a) {
3823
+ return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3824
+ }
3825
+
3772
3826
  // ggml_gelu
3773
3827
 
3774
3828
  struct ggml_tensor * ggml_gelu(
@@ -5076,82 +5130,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
5076
5130
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
5077
5131
  }
5078
5132
 
5079
- // im2col: [N, IC, IL] => [N, OL, IC*K]
5080
- // a: [OC,IC, K]
5081
- // b: [N, IC, IL]
5082
- // result: [N, OL, IC*K]
5083
- static struct ggml_tensor * ggml_conv_1d_stage_0(
5084
- struct ggml_context * ctx,
5085
- struct ggml_tensor * a,
5086
- struct ggml_tensor * b,
5087
- int s0,
5088
- int p0,
5089
- int d0) {
5090
- GGML_ASSERT(a->ne[1] == b->ne[1]);
5091
- bool is_node = false;
5092
-
5093
- if (a->grad || b->grad) {
5094
- GGML_ASSERT(false); // TODO: implement backward
5095
- is_node = true;
5096
- }
5097
-
5098
- const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5099
-
5100
- const int64_t ne[4] = {
5101
- a->ne[1] * a->ne[0],
5102
- OL,
5103
- b->ne[2],
5104
- 1,
5105
- };
5106
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5107
-
5108
- int32_t params[] = { s0, p0, d0 };
5109
- ggml_set_op_params(result, params, sizeof(params));
5110
-
5111
- result->op = GGML_OP_CONV_1D_STAGE_0;
5112
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5113
- result->src[0] = a;
5114
- result->src[1] = b;
5115
-
5116
- return result;
5117
- }
5118
-
5119
- // ggml_conv_1d_stage_1
5120
-
5121
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
5122
- // a: [OC, IC, K]
5123
- // b: [N, OL, IC * K]
5124
- // result: [N, OC, OL]
5125
- static struct ggml_tensor * ggml_conv_1d_stage_1(
5126
- struct ggml_context * ctx,
5127
- struct ggml_tensor * a,
5128
- struct ggml_tensor * b) {
5129
-
5130
- bool is_node = false;
5131
-
5132
- if (a->grad || b->grad) {
5133
- GGML_ASSERT(false); // TODO: implement backward
5134
- is_node = true;
5135
- }
5136
-
5137
- const int64_t ne[4] = {
5138
- b->ne[1],
5139
- a->ne[2],
5140
- b->ne[2],
5141
- 1,
5142
- };
5143
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5144
-
5145
- result->op = GGML_OP_CONV_1D_STAGE_1;
5146
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5147
- result->src[0] = a;
5148
- result->src[1] = b;
5149
-
5150
- return result;
5151
- }
5152
-
5153
- // ggml_conv_1d
5154
-
5155
5133
  GGML_API struct ggml_tensor * ggml_conv_1d(
5156
5134
  struct ggml_context * ctx,
5157
5135
  struct ggml_tensor * a,
@@ -5159,43 +5137,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5159
5137
  int s0,
5160
5138
  int p0,
5161
5139
  int d0) {
5162
- struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
5163
- result = ggml_conv_1d_stage_1(ctx, a, result);
5164
- return result;
5165
- }
5166
-
5167
- // GGML_API struct ggml_tensor * ggml_conv_1d(
5168
- // struct ggml_context * ctx,
5169
- // struct ggml_tensor * a,
5170
- // struct ggml_tensor * b,
5171
- // int s0,
5172
- // int p0,
5173
- // int d0) {
5174
- // GGML_ASSERT(ggml_is_matrix(b));
5175
- // GGML_ASSERT(a->ne[1] == b->ne[1]);
5176
- // bool is_node = false;
5177
-
5178
- // if (a->grad || b->grad) {
5179
- // GGML_ASSERT(false); // TODO: implement backward
5180
- // is_node = true;
5181
- // }
5140
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5182
5141
 
5183
- // const int64_t ne[4] = {
5184
- // ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
5185
- // a->ne[2], 1, 1,
5186
- // };
5187
- // struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5142
+ struct ggml_tensor * result =
5143
+ ggml_mul_mat(ctx,
5144
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
5145
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
5188
5146
 
5189
- // int32_t params[] = { s0, p0, d0 };
5190
- // ggml_set_op_params(result, params, sizeof(params));
5147
+ result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
5191
5148
 
5192
- // result->op = GGML_OP_CONV_1D;
5193
- // result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5194
- // result->src[0] = a;
5195
- // result->src[1] = b;
5196
-
5197
- // return result;
5198
- // }
5149
+ return result;
5150
+ }
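ggml_conv_1d is now expressed as im2col followed by a matrix multiplication: the input [IC, IL] is unrolled into one row of length IC*K per output position, and that matrix is multiplied by the kernel reshaped to [OC, IC*K]. A minimal single-batch scalar reference of the same idea (plain float C, not ggml's code, which keeps the im2col buffer in F16) looks like this:

#include <stdlib.h>

// naive conv1d via im2col + GEMM, single batch (sketch only)
// kernel: [OC][IC][K], input: [IC][IL], output: [OC][OL]
static void conv1d_im2col_ref(const float * kernel, const float * input, float * output,
                              int OC, int IC, int K, int IL, int s0, int p0, int d0) {
    const int OL = (IL + 2*p0 - d0*(K - 1) - 1)/s0 + 1;

    // im2col: [OL][IC*K], zero for taps that fall into the padding
    float * cols = calloc((size_t) OL * IC * K, sizeof(float));
    for (int ol = 0; ol < OL; ol++) {
        for (int ic = 0; ic < IC; ic++) {
            for (int k = 0; k < K; k++) {
                const int il = ol*s0 + k*d0 - p0;
                if (il >= 0 && il < IL) {
                    cols[(size_t) ol*IC*K + ic*K + k] = input[(size_t) ic*IL + il];
                }
            }
        }
    }

    // GEMM: output[oc][ol] = dot of kernel row oc and im2col row ol, both length IC*K
    for (int oc = 0; oc < OC; oc++) {
        for (int ol = 0; ol < OL; ol++) {
            float sum = 0.0f;
            for (int j = 0; j < IC*K; j++) {
                sum += kernel[(size_t) oc*IC*K + j] * cols[(size_t) ol*IC*K + j];
            }
            output[(size_t) oc*OL + ol] = sum;
        }
    }

    free(cols);
}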
5199
5151
 
5200
5152
  // ggml_conv_1d_ph
5201
5153
 
@@ -5258,7 +5210,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
5258
5210
  // a: [OC,IC, KH, KW]
5259
5211
  // b: [N, IC, IH, IW]
5260
5212
  // result: [N, OH, OW, IC*KH*KW]
5261
- static struct ggml_tensor * ggml_conv_2d_stage_0(
5213
+ struct ggml_tensor * ggml_im2col(
5262
5214
  struct ggml_context * ctx,
5263
5215
  struct ggml_tensor * a,
5264
5216
  struct ggml_tensor * b,
@@ -5267,9 +5219,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5267
5219
  int p0,
5268
5220
  int p1,
5269
5221
  int d0,
5270
- int d1) {
5222
+ int d1,
5223
+ bool is_2D) {
5271
5224
 
5272
- GGML_ASSERT(a->ne[2] == b->ne[2]);
5225
+ if(is_2D) {
5226
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
5227
+ } else {
5228
+ GGML_ASSERT(a->ne[1] == b->ne[1]);
5229
+ }
5273
5230
  bool is_node = false;
5274
5231
 
5275
5232
  if (a->grad || b->grad) {
@@ -5277,81 +5234,51 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5277
5234
  is_node = true;
5278
5235
  }
5279
5236
 
5280
- const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
5281
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5237
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
5238
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5282
5239
 
5283
5240
  const int64_t ne[4] = {
5284
- a->ne[2] * a->ne[1] * a->ne[0],
5241
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
5285
5242
  OW,
5286
- OH,
5287
- b->ne[3],
5243
+ is_2D ? OH : b->ne[2],
5244
+ is_2D ? b->ne[3] : 1,
5288
5245
  };
5289
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5290
5246
 
5291
- int32_t params[] = { s0, s1, p0, p1, d0, d1 };
5247
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5248
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5292
5249
  ggml_set_op_params(result, params, sizeof(params));
5293
5250
 
5294
- result->op = GGML_OP_CONV_2D_STAGE_0;
5295
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5296
- result->src[0] = a;
5297
- result->src[1] = b;
5298
-
5299
- return result;
5300
-
5301
- }
5302
-
5303
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
5304
- // a: [OC, IC, KH, KW]
5305
- // b: [N, OH, OW, IC * KH * KW]
5306
- // result: [N, OC, OH, OW]
5307
- static struct ggml_tensor * ggml_conv_2d_stage_1(
5308
- struct ggml_context * ctx,
5309
- struct ggml_tensor * a,
5310
- struct ggml_tensor * b) {
5311
-
5312
- bool is_node = false;
5313
-
5314
- if (a->grad || b->grad) {
5315
- GGML_ASSERT(false); // TODO: implement backward
5316
- is_node = true;
5317
- }
5318
-
5319
- const int64_t ne[4] = {
5320
- b->ne[1],
5321
- b->ne[2],
5322
- a->ne[3],
5323
- b->ne[3],
5324
- };
5325
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5326
-
5327
- result->op = GGML_OP_CONV_2D_STAGE_1;
5251
+ result->op = GGML_OP_IM2COL;
5328
5252
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5329
5253
  result->src[0] = a;
5330
5254
  result->src[1] = b;
5331
5255
 
5332
5256
  return result;
5333
-
5334
5257
  }
5335
5258
 
5336
5259
  // a: [OC,IC, KH, KW]
5337
5260
  // b: [N, IC, IH, IW]
5338
5261
  // result: [N, OC, OH, OW]
5339
5262
  struct ggml_tensor * ggml_conv_2d(
5340
- struct ggml_context * ctx,
5341
- struct ggml_tensor * a,
5342
- struct ggml_tensor * b,
5343
- int s0,
5344
- int s1,
5345
- int p0,
5346
- int p1,
5347
- int d0,
5348
- int d1) {
5263
+ struct ggml_context * ctx,
5264
+ struct ggml_tensor * a,
5265
+ struct ggml_tensor * b,
5266
+ int s0,
5267
+ int s1,
5268
+ int p0,
5269
+ int p1,
5270
+ int d0,
5271
+ int d1) {
5272
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5349
5273
 
5350
- struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
5351
- result = ggml_conv_2d_stage_1(ctx, a, result);
5274
+ struct ggml_tensor * result =
5275
+ ggml_mul_mat(ctx,
5276
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
5277
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
5352
5278
 
5353
- return result;
5279
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
5354
5280
 
5281
+ return result;
5355
5282
  }
5356
5283
 
5357
5284
  // ggml_conv_2d_sk_p0
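ggml_im2col now serves both paths: with is_2D the destination is [N, OH, OW, IC*KH*KW], otherwise the 1D layout [N, OL, IC*K] (with ne[3] = 1) is produced. The spatial extent per dimension follows ggml_calc_conv_output_size; a small standalone check of the arithmetic, with hypothetical sizes:

#include <stdio.h>

static int64_t calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2*p - d*(ks - 1) - 1)/s + 1;
}

int main(void) {
    // IL = 32, K = 3, stride 1, padding 1, dilation 1 -> "same" convolution
    printf("%lld\n", (long long) calc_conv_output_size(32, 3, 1, 1, 1)); // prints 32
    // IL = 32, K = 3, stride 2, padding 1, dilation 1 -> roughly halved
    printf("%lld\n", (long long) calc_conv_output_size(32, 3, 2, 1, 1)); // prints 16
    return 0;
}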
@@ -5411,7 +5338,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
5411
5338
 
5412
5339
  // ggml_pool_*
5413
5340
 
5414
- static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
5341
+ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
5415
5342
  return (ins + 2 * p - ks) / s + 1;
5416
5343
  }
5417
5344
 
@@ -5458,8 +5385,8 @@ struct ggml_tensor * ggml_pool_2d(
5458
5385
  int k1,
5459
5386
  int s0,
5460
5387
  int s1,
5461
- int p0,
5462
- int p1) {
5388
+ float p0,
5389
+ float p1) {
5463
5390
 
5464
5391
  bool is_node = false;
5465
5392
 
@@ -8921,6 +8848,48 @@ static void ggml_compute_forward_silu(
8921
8848
  }
8922
8849
  }
8923
8850
 
8851
+ // ggml_compute_forward_leaky
8852
+
8853
+ static void ggml_compute_forward_leaky_f32(
8854
+ const struct ggml_compute_params * params,
8855
+ const struct ggml_tensor * src0,
8856
+ struct ggml_tensor * dst) {
8857
+ assert(params->ith == 0);
8858
+ assert(ggml_are_same_shape(src0, dst));
8859
+
8860
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8861
+ return;
8862
+ }
8863
+
8864
+ const int n = ggml_nrows(src0);
8865
+ const int nc = src0->ne[0];
8866
+
8867
+ assert(dst->nb[0] == sizeof(float));
8868
+ assert(src0->nb[0] == sizeof(float));
8869
+
8870
+ for (int i = 0; i < n; i++) {
8871
+ ggml_vec_leaky_f32(nc,
8872
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
8873
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
8874
+ }
8875
+ }
8876
+
8877
+ static void ggml_compute_forward_leaky(
8878
+ const struct ggml_compute_params * params,
8879
+ const struct ggml_tensor * src0,
8880
+ struct ggml_tensor * dst) {
8881
+ switch (src0->type) {
8882
+ case GGML_TYPE_F32:
8883
+ {
8884
+ ggml_compute_forward_leaky_f32(params, src0, dst);
8885
+ } break;
8886
+ default:
8887
+ {
8888
+ GGML_ASSERT(false);
8889
+ } break;
8890
+ }
8891
+ }
8892
+
8924
8893
  // ggml_compute_forward_silu_back
8925
8894
 
8926
8895
  static void ggml_compute_forward_silu_back_f32(
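The new GGML_UNARY_OP_LEAKY path applies the rule implemented by ggml_vec_leaky_f32 above, with a fixed negative slope of 0.1 and no configurable parameter in this version. A standalone illustration of the element-wise behaviour, outside of ggml:

#include <stdio.h>

// same piecewise rule as ggml_vec_leaky_f32: x for x > 0, 0.1*x otherwise
static float leaky(float x) {
    return x > 0.0f ? x : 0.1f*x;
}

int main(void) {
    const float xs[] = { -2.0f, -0.5f, 0.0f, 0.5f, 2.0f };
    for (int i = 0; i < 5; i++) {
        printf("leaky(% .1f) = % .2f\n", xs[i], leaky(xs[i]));
    }
    // -2.0 -> -0.20, -0.5 -> -0.05, 0.0 -> 0.00, 0.5 -> 0.50, 2.0 -> 2.00
    return 0;
}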
@@ -9404,6 +9373,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9404
9373
  // TODO: find the optimal values for these
9405
9374
  if (ggml_is_contiguous(src0) &&
9406
9375
  ggml_is_contiguous(src1) &&
9376
+ src0->type == GGML_TYPE_F32 &&
9377
+ src1->type == GGML_TYPE_F32 &&
9407
9378
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
9408
9379
 
9409
9380
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9442,7 +9413,7 @@ static void ggml_compute_forward_mul_mat(
9442
9413
 
9443
9414
  // we don't support permuted src0 or src1
9444
9415
  GGML_ASSERT(nb00 == ggml_type_size(type));
9445
- GGML_ASSERT(nb10 == sizeof(float));
9416
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
9446
9417
 
9447
9418
  // dst cannot be transposed or permuted
9448
9419
  GGML_ASSERT(nb0 == sizeof(float));
@@ -11340,9 +11311,9 @@ static void ggml_compute_forward_rope_back(
11340
11311
  }
11341
11312
  }
11342
11313
 
11343
- // ggml_compute_forward_conv_1d
11314
+ // ggml_compute_forward_conv_transpose_1d
11344
11315
 
11345
- static void ggml_compute_forward_conv_1d_f16_f32(
11316
+ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11346
11317
  const struct ggml_compute_params * params,
11347
11318
  const struct ggml_tensor * src0,
11348
11319
  const struct ggml_tensor * src1,
@@ -11359,14 +11330,7 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11359
11330
  const int ith = params->ith;
11360
11331
  const int nth = params->nth;
11361
11332
 
11362
- const int nk = ne00;
11363
-
11364
- // size of the convolution row - the kernel size unrolled across all input channels
11365
- const int ew0 = nk*ne01;
11366
-
11367
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11368
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11369
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11333
+ const int nk = ne00*ne01*ne02;
11370
11334
 
11371
11335
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11372
11336
  GGML_ASSERT(nb10 == sizeof(float));
@@ -11374,23 +11338,37 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11374
11338
  if (params->type == GGML_TASK_INIT) {
11375
11339
  memset(params->wdata, 0, params->wsize);
11376
11340
 
11377
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11341
+ // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11342
+ {
11343
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11344
+
11345
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
11346
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11347
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11348
+ ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11349
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11350
+ dst_data[i00*ne02 + i02] = src[i00];
11351
+ }
11352
+ }
11353
+ }
11354
+ }
11378
11355
 
11379
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11380
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11356
+ // permute source data (src1) from (L x Cin) to (Cin x L)
11357
+ {
11358
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11381
11359
  ggml_fp16_t * dst_data = wdata;
11382
11360
 
11383
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11384
- for (int64_t ik = 0; ik < nk; ik++) {
11385
- const int idx0 = i0*s0 + ik*d0 - p0;
11386
-
11387
- if(!(idx0 < 0 || idx0 >= ne10)) {
11388
- dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
11389
- }
11361
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
11362
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
11363
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
11364
+ dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
11390
11365
  }
11391
11366
  }
11392
11367
  }
11393
11368
 
11369
+ // need to zero dst since we are accumulating into it
11370
+ memset(dst->data, 0, ggml_nbytes(dst));
11371
+
11394
11372
  return;
11395
11373
  }
11396
11374
 
@@ -11398,8 +11376,10 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11398
11376
  return;
11399
11377
  }
11400
11378
 
11379
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11380
+
11401
11381
  // total rows in dst
11402
- const int nr = ne2;
11382
+ const int nr = ne1;
11403
11383
 
11404
11384
  // rows per thread
11405
11385
  const int dr = (nr + nth - 1)/nth;
@@ -11408,22 +11388,26 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11408
11388
  const int ir0 = dr*ith;
11409
11389
  const int ir1 = MIN(ir0 + dr, nr);
11410
11390
 
11411
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11412
-
11413
- for (int i2 = 0; i2 < ne2; i2++) {
11414
- for (int i1 = ir0; i1 < ir1; i1++) {
11415
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11391
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11392
+ ggml_fp16_t * const wdata_src = wdata + nk;
11416
11393
 
11417
- for (int i0 = 0; i0 < ne0; i0++) {
11418
- ggml_vec_dot_f16(ew0, dst_data + i0,
11419
- (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
11420
- (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
11394
+ for (int i1 = ir0; i1 < ir1; i1++) {
11395
+ float * dst_data = (float *)((char *) dst->data + i1*nb1);
11396
+ ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
11397
+ for (int i10 = 0; i10 < ne10; i10++) {
11398
+ const int i1n = i10*ne11;
11399
+ for (int i00 = 0; i00 < ne00; i00++) {
11400
+ float v = 0;
11401
+ ggml_vec_dot_f16(ne02, &v,
11402
+ (ggml_fp16_t *) wdata_src + i1n,
11403
+ (ggml_fp16_t *) wdata_kernel + i00*ne02);
11404
+ dst_data[i10*s0 + i00] += v;
11421
11405
  }
11422
11406
  }
11423
11407
  }
11424
11408
  }
11425
11409
 
11426
- static void ggml_compute_forward_conv_1d_f32(
11410
+ static void ggml_compute_forward_conv_transpose_1d_f32(
11427
11411
  const struct ggml_compute_params * params,
11428
11412
  const struct ggml_tensor * src0,
11429
11413
  const struct ggml_tensor * src1,
@@ -11440,13 +11424,7 @@ static void ggml_compute_forward_conv_1d_f32(
11440
11424
  const int ith = params->ith;
11441
11425
  const int nth = params->nth;
11442
11426
 
11443
- const int nk = ne00;
11444
-
11445
- const int ew0 = nk*ne01;
11446
-
11447
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11448
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11449
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11427
+ const int nk = ne00*ne01*ne02;
11450
11428
 
11451
11429
  GGML_ASSERT(nb00 == sizeof(float));
11452
11430
  GGML_ASSERT(nb10 == sizeof(float));
@@ -11454,23 +11432,37 @@ static void ggml_compute_forward_conv_1d_f32(
11454
11432
  if (params->type == GGML_TASK_INIT) {
11455
11433
  memset(params->wdata, 0, params->wsize);
11456
11434
 
11457
- float * const wdata = (float *) params->wdata + 0;
11435
+ // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11436
+ {
11437
+ float * const wdata = (float *) params->wdata + 0;
11438
+
11439
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
11440
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11441
+ const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
11442
+ float * dst_data = wdata + i01*ne00*ne02;
11443
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11444
+ dst_data[i00*ne02 + i02] = src[i00];
11445
+ }
11446
+ }
11447
+ }
11448
+ }
11458
11449
 
11459
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11460
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11450
+ // prepare source data (src1)
11451
+ {
11452
+ float * const wdata = (float *) params->wdata + nk;
11461
11453
  float * dst_data = wdata;
11462
11454
 
11463
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11464
- for (int64_t ik = 0; ik < nk; ik++) {
11465
- const int idx0 = i0*s0 + ik*d0 - p0;
11466
-
11467
- if(!(idx0 < 0 || idx0 >= ne10)) {
11468
- dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
11469
- }
11455
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
11456
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
11457
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
11458
+ dst_data[i10*ne11 + i11] = src[i10];
11470
11459
  }
11471
11460
  }
11472
11461
  }
11473
11462
 
11463
+ // need to zero dst since we are accumulating into it
11464
+ memset(dst->data, 0, ggml_nbytes(dst));
11465
+
11474
11466
  return;
11475
11467
  }
11476
11468
 
@@ -11478,8 +11470,10 @@ static void ggml_compute_forward_conv_1d_f32(
11478
11470
  return;
11479
11471
  }
11480
11472
 
11473
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11474
+
11481
11475
  // total rows in dst
11482
- const int nr = ne02;
11476
+ const int nr = ne1;
11483
11477
 
11484
11478
  // rows per thread
11485
11479
  const int dr = (nr + nth - 1)/nth;
@@ -11488,441 +11482,8 @@ static void ggml_compute_forward_conv_1d_f32(
11488
11482
  const int ir0 = dr*ith;
11489
11483
  const int ir1 = MIN(ir0 + dr, nr);
11490
11484
 
11491
- float * const wdata = (float *) params->wdata + 0;
11492
-
11493
- for (int i2 = 0; i2 < ne2; i2++) {
11494
- for (int i1 = ir0; i1 < ir1; i1++) {
11495
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11496
-
11497
- for (int i0 = 0; i0 < ne0; i0++) {
11498
- ggml_vec_dot_f32(ew0, dst_data + i0,
11499
- (float *) ((char *) src0->data + i1*nb02),
11500
- (float *) wdata + i2*nb2 + i0*ew0);
11501
- }
11502
- }
11503
- }
11504
- }
11505
-
11506
- // TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
11507
- static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
11508
- ggml_fp16_t * A,
11509
- ggml_fp16_t * B,
11510
- float * C,
11511
- const int ith, const int nth) {
11512
- // does not seem to make a difference
11513
- int64_t m0, m1, n0, n1;
11514
- // patches per thread
11515
- if (m > n) {
11516
- n0 = 0;
11517
- n1 = n;
11518
-
11519
- // total patches in dst
11520
- const int np = m;
11521
-
11522
- // patches per thread
11523
- const int dp = (np + nth - 1)/nth;
11524
-
11525
- // patch range for this thread
11526
- m0 = dp*ith;
11527
- m1 = MIN(m0 + dp, np);
11528
- } else {
11529
- m0 = 0;
11530
- m1 = m;
11531
-
11532
- // total patches in dst
11533
- const int np = n;
11534
-
11535
- // patches per thread
11536
- const int dp = (np + nth - 1)/nth;
11537
-
11538
- // patch range for this thread
11539
- n0 = dp*ith;
11540
- n1 = MIN(n0 + dp, np);
11541
- }
11542
-
11543
- // block-tiling attempt
11544
- int64_t blck_n = 16;
11545
- int64_t blck_m = 16;
11546
-
11547
- // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
11548
- // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
11549
- // if (blck_size > 0) {
11550
- // blck_0 = 4;
11551
- // blck_1 = blck_size / blck_0;
11552
- // if (blck_1 < 0) {
11553
- // blck_1 = 1;
11554
- // }
11555
- // // blck_0 = (int64_t)sqrt(blck_size);
11556
- // // blck_1 = blck_0;
11557
- // }
11558
- // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
11559
-
11560
- for (int j = n0; j < n1; j+=blck_n) {
11561
- for (int i = m0; i < m1; i+=blck_m) {
11562
- // printf("i j k => %d %d %d\n", i, j, K);
11563
- for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
11564
- for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
11565
- ggml_vec_dot_f16(k,
11566
- C + ii*n + jj,
11567
- A + ii * k,
11568
- B + jj * k);
11569
- }
11570
- }
11571
- }
11572
- }
11573
- }
11574
-
11575
- // src0: kernel [OC, IC, K]
11576
- // src1: signal [N, IC, IL]
11577
- // dst: result [N, OL, IC*K]
11578
- static void ggml_compute_forward_conv_1d_stage_0_f32(
11579
- const struct ggml_compute_params * params,
11580
- const struct ggml_tensor * src0,
11581
- const struct ggml_tensor * src1,
11582
- struct ggml_tensor * dst) {
11583
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11584
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11585
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
11586
-
11587
- int64_t t0 = ggml_perf_time_us();
11588
- UNUSED(t0);
11589
-
11590
- GGML_TENSOR_BINARY_OP_LOCALS;
11591
-
11592
- const int64_t N = ne12;
11593
- const int64_t IC = ne11;
11594
- const int64_t IL = ne10;
11595
-
11596
- const int64_t K = ne00;
11597
-
11598
- const int64_t OL = ne1;
11599
-
11600
- const int ith = params->ith;
11601
- const int nth = params->nth;
11602
-
11603
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11604
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11605
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11606
-
11607
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11608
- GGML_ASSERT(nb10 == sizeof(float));
11609
-
11610
- if (params->type == GGML_TASK_INIT) {
11611
- memset(dst->data, 0, ggml_nbytes(dst));
11612
- return;
11613
- }
11614
-
11615
- if (params->type == GGML_TASK_FINALIZE) {
11616
- return;
11617
- }
11618
-
11619
- // im2col: [N, IC, IL] => [N, OL, IC*K]
11620
- {
11621
- ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
11622
-
11623
- for (int64_t in = 0; in < N; in++) {
11624
- for (int64_t iol = 0; iol < OL; iol++) {
11625
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11626
-
11627
- // micro kernel
11628
- ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
11629
- const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
11630
-
11631
- for (int64_t ik = 0; ik < K; ik++) {
11632
- const int64_t iil = iol*s0 + ik*d0 - p0;
11633
-
11634
- if (!(iil < 0 || iil >= IL)) {
11635
- dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
11636
- }
11637
- }
11638
- }
11639
- }
11640
- }
11641
- }
11642
- }
11643
-
11644
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11645
- // src0: [OC, IC, K]
11646
- // src1: [N, OL, IC * K]
11647
- // result: [N, OC, OL]
11648
- static void ggml_compute_forward_conv_1d_stage_1_f16(
11649
- const struct ggml_compute_params * params,
11650
- const struct ggml_tensor * src0,
11651
- const struct ggml_tensor * src1,
11652
- struct ggml_tensor * dst) {
11653
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11654
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
11655
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11656
-
11657
- int64_t t0 = ggml_perf_time_us();
11658
- UNUSED(t0);
11659
-
11660
- if (params->type == GGML_TASK_INIT) {
11661
- return;
11662
- }
11663
-
11664
- if (params->type == GGML_TASK_FINALIZE) {
11665
- return;
11666
- }
11667
-
11668
- GGML_TENSOR_BINARY_OP_LOCALS;
11669
-
11670
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11671
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
11672
- GGML_ASSERT(nb0 == sizeof(float));
11673
-
11674
- const int N = ne12;
11675
- const int OL = ne11;
11676
-
11677
- const int OC = ne02;
11678
- const int IC = ne01;
11679
- const int K = ne00;
11680
-
11681
- const int ith = params->ith;
11682
- const int nth = params->nth;
11683
-
11684
- int64_t m = OC;
11685
- int64_t n = OL;
11686
- int64_t k = IC * K;
11687
-
11688
- // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11689
- for (int i = 0; i < N; i++) {
11690
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
11691
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
11692
- float * C = (float *)dst->data + i * m * n; // [m, n]
11693
-
11694
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
11695
- }
11696
- }
11697
-
11698
- static void ggml_compute_forward_conv_1d(
11699
- const struct ggml_compute_params * params,
11700
- const struct ggml_tensor * src0,
11701
- const struct ggml_tensor * src1,
11702
- struct ggml_tensor * dst) {
11703
- switch(src0->type) {
11704
- case GGML_TYPE_F16:
11705
- {
11706
- ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
11707
- } break;
11708
- case GGML_TYPE_F32:
11709
- {
11710
- ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
11711
- } break;
11712
- default:
11713
- {
11714
- GGML_ASSERT(false);
11715
- } break;
11716
- }
11717
- }
11718
-
11719
- static void ggml_compute_forward_conv_1d_stage_0(
11720
- const struct ggml_compute_params * params,
11721
- const struct ggml_tensor * src0,
11722
- const struct ggml_tensor * src1,
11723
- struct ggml_tensor * dst) {
11724
- switch(src0->type) {
11725
- case GGML_TYPE_F16:
11726
- {
11727
- ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
11728
- } break;
11729
- default:
11730
- {
11731
- GGML_ASSERT(false);
11732
- } break;
11733
- }
11734
- }
11735
-
11736
- static void ggml_compute_forward_conv_1d_stage_1(
11737
- const struct ggml_compute_params * params,
11738
- const struct ggml_tensor * src0,
11739
- const struct ggml_tensor * src1,
11740
- struct ggml_tensor * dst) {
11741
- switch(src0->type) {
11742
- case GGML_TYPE_F16:
11743
- {
11744
- ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
11745
- } break;
11746
- default:
11747
- {
11748
- GGML_ASSERT(false);
11749
- } break;
11750
- }
11751
- }
11752
-
11753
- // ggml_compute_forward_conv_transpose_1d
11754
-
11755
- static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11756
- const struct ggml_compute_params * params,
11757
- const struct ggml_tensor * src0,
11758
- const struct ggml_tensor * src1,
11759
- struct ggml_tensor * dst) {
11760
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11761
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11762
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11763
-
11764
- int64_t t0 = ggml_perf_time_us();
11765
- UNUSED(t0);
11766
-
11767
- GGML_TENSOR_BINARY_OP_LOCALS
11768
-
11769
- const int ith = params->ith;
11770
- const int nth = params->nth;
11771
-
11772
- const int nk = ne00*ne01*ne02;
11773
-
11774
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11775
- GGML_ASSERT(nb10 == sizeof(float));
11776
-
11777
- if (params->type == GGML_TASK_INIT) {
11778
- memset(params->wdata, 0, params->wsize);
11779
-
11780
- // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11781
- {
11782
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11783
-
11784
- for (int64_t i02 = 0; i02 < ne02; i02++) {
11785
- for (int64_t i01 = 0; i01 < ne01; i01++) {
11786
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11787
- ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11788
- for (int64_t i00 = 0; i00 < ne00; i00++) {
11789
- dst_data[i00*ne02 + i02] = src[i00];
11790
- }
11791
- }
11792
- }
11793
- }
11794
-
11795
- // permute source data (src1) from (L x Cin) to (Cin x L)
11796
- {
11797
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11798
- ggml_fp16_t * dst_data = wdata;
11799
-
11800
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11801
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11802
- for (int64_t i10 = 0; i10 < ne10; i10++) {
11803
- dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
11804
- }
11805
- }
11806
- }
11807
-
11808
- // need to zero dst since we are accumulating into it
11809
- memset(dst->data, 0, ggml_nbytes(dst));
11810
-
11811
- return;
11812
- }
11813
-
11814
- if (params->type == GGML_TASK_FINALIZE) {
11815
- return;
11816
- }
11817
-
11818
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11819
-
11820
- // total rows in dst
11821
- const int nr = ne1;
11822
-
11823
- // rows per thread
11824
- const int dr = (nr + nth - 1)/nth;
11825
-
11826
- // row range for this thread
11827
- const int ir0 = dr*ith;
11828
- const int ir1 = MIN(ir0 + dr, nr);
11829
-
11830
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11831
- ggml_fp16_t * const wdata_src = wdata + nk;
11832
-
11833
- for (int i1 = ir0; i1 < ir1; i1++) {
11834
- float * dst_data = (float *)((char *) dst->data + i1*nb1);
11835
- ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
11836
- for (int i10 = 0; i10 < ne10; i10++) {
11837
- const int i1n = i10*ne11;
11838
- for (int i00 = 0; i00 < ne00; i00++) {
11839
- float v = 0;
11840
- ggml_vec_dot_f16(ne02, &v,
11841
- (ggml_fp16_t *) wdata_src + i1n,
11842
- (ggml_fp16_t *) wdata_kernel + i00*ne02);
11843
- dst_data[i10*s0 + i00] += v;
11844
- }
11845
- }
11846
- }
11847
- }
11848
-
11849
- static void ggml_compute_forward_conv_transpose_1d_f32(
11850
- const struct ggml_compute_params * params,
11851
- const struct ggml_tensor * src0,
11852
- const struct ggml_tensor * src1,
11853
- struct ggml_tensor * dst) {
11854
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
11855
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11856
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11857
-
11858
- int64_t t0 = ggml_perf_time_us();
11859
- UNUSED(t0);
11860
-
11861
- GGML_TENSOR_BINARY_OP_LOCALS
11862
-
11863
- const int ith = params->ith;
11864
- const int nth = params->nth;
11865
-
11866
- const int nk = ne00*ne01*ne02;
11867
-
11868
- GGML_ASSERT(nb00 == sizeof(float));
11869
- GGML_ASSERT(nb10 == sizeof(float));
11870
-
11871
- if (params->type == GGML_TASK_INIT) {
11872
- memset(params->wdata, 0, params->wsize);
11873
-
11874
- // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11875
- {
11876
- float * const wdata = (float *) params->wdata + 0;
11877
-
11878
- for (int64_t i02 = 0; i02 < ne02; i02++) {
11879
- for (int64_t i01 = 0; i01 < ne01; i01++) {
11880
- const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
11881
- float * dst_data = wdata + i01*ne00*ne02;
11882
- for (int64_t i00 = 0; i00 < ne00; i00++) {
11883
- dst_data[i00*ne02 + i02] = src[i00];
11884
- }
11885
- }
11886
- }
11887
- }
11888
-
11889
- // prepare source data (src1)
11890
- {
11891
- float * const wdata = (float *) params->wdata + nk;
11892
- float * dst_data = wdata;
11893
-
11894
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11895
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11896
- for (int64_t i10 = 0; i10 < ne10; i10++) {
11897
- dst_data[i10*ne11 + i11] = src[i10];
11898
- }
11899
- }
11900
- }
11901
-
11902
- // need to zero dst since we are accumulating into it
11903
- memset(dst->data, 0, ggml_nbytes(dst));
11904
-
11905
- return;
11906
- }
11907
-
11908
- if (params->type == GGML_TASK_FINALIZE) {
11909
- return;
11910
- }
11911
-
11912
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11913
-
11914
- // total rows in dst
11915
- const int nr = ne1;
11916
-
11917
- // rows per thread
11918
- const int dr = (nr + nth - 1)/nth;
11919
-
11920
- // row range for this thread
11921
- const int ir0 = dr*ith;
11922
- const int ir1 = MIN(ir0 + dr, nr);
11923
-
11924
- float * const wdata = (float *) params->wdata + 0;
11925
- float * const wdata_src = wdata + nk;
11485
+ float * const wdata = (float *) params->wdata + 0;
11486
+ float * const wdata_src = wdata + nk;
11926
11487
 
11927
11488
  for (int i1 = ir0; i1 < ir1; i1++) {
11928
11489
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
@@ -11961,12 +11522,10 @@ static void ggml_compute_forward_conv_transpose_1d(
11961
11522
  }
11962
11523
  }
11963
11524
 
11964
- // ggml_compute_forward_conv_2d
11965
-
11966
11525
  // src0: kernel [OC, IC, KH, KW]
11967
11526
  // src1: image [N, IC, IH, IW]
11968
11527
  // dst: result [N, OH, OW, IC*KH*KW]
11969
- static void ggml_compute_forward_conv_2d_stage_0_f32(
11528
+ static void ggml_compute_forward_im2col_f16(
11970
11529
  const struct ggml_compute_params * params,
11971
11530
  const struct ggml_tensor * src0,
11972
11531
  const struct ggml_tensor * src1,
@@ -11980,34 +11539,35 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
11980
11539
 
11981
11540
  GGML_TENSOR_BINARY_OP_LOCALS;
11982
11541
 
11983
- const int64_t N = ne13;
11984
- const int64_t IC = ne12;
11985
- const int64_t IH = ne11;
11542
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
11543
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
11544
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
11545
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
11546
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
11547
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
11548
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
11549
+
11550
+ const int ith = params->ith;
11551
+ const int nth = params->nth;
11552
+
11553
+ const int64_t N = is_2D ? ne13 : ne12;
11554
+ const int64_t IC = is_2D ? ne12 : ne11;
11555
+ const int64_t IH = is_2D ? ne11 : 1;
11986
11556
  const int64_t IW = ne10;
11987
11557
 
11988
- // const int64_t OC = ne03;
11989
- // const int64_t IC = ne02;
11990
- const int64_t KH = ne01;
11558
+ const int64_t KH = is_2D ? ne01 : 1;
11991
11559
  const int64_t KW = ne00;
11992
11560
 
11993
- const int64_t OH = ne2;
11561
+ const int64_t OH = is_2D ? ne2 : 1;
11994
11562
  const int64_t OW = ne1;
11995
11563
 
11996
- const int ith = params->ith;
11997
- const int nth = params->nth;
11998
-
11999
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12000
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12001
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12002
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12003
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12004
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
11564
+ int ofs0 = is_2D ? nb13 : nb12;
11565
+ int ofs1 = is_2D ? nb12 : nb11;
12005
11566
 
12006
11567
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12007
11568
  GGML_ASSERT(nb10 == sizeof(float));
12008
11569
 
12009
11570
  if (params->type == GGML_TASK_INIT) {
12010
- memset(dst->data, 0, ggml_nbytes(dst));
12011
11571
  return;
12012
11572
  }
12013
11573
 
@@ -12020,20 +11580,22 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12020
11580
  ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
12021
11581
 
12022
11582
  for (int64_t in = 0; in < N; in++) {
12023
- for (int64_t ioh = 0; ioh < OH; ioh++) {
11583
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12024
11584
  for (int64_t iow = 0; iow < OW; iow++) {
12025
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11585
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12026
11586
 
12027
11587
  // micro kernel
12028
11588
  ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12029
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
11589
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12030
11590
 
12031
- for (int64_t ikh = 0; ikh < KH; ikh++) {
11591
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12032
11592
  for (int64_t ikw = 0; ikw < KW; ikw++) {
12033
11593
  const int64_t iiw = iow*s0 + ikw*d0 - p0;
12034
11594
  const int64_t iih = ioh*s1 + ikh*d1 - p1;
12035
11595
 
12036
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
11596
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
11597
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
11598
+ } else {
12037
11599
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12038
11600
  }
12039
11601
  }
@@ -12045,223 +11607,7 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12045
11607
  }
12046
11608
  }
12047
11609
 
12048
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12049
- // src0: [OC, IC, KH, KW]
12050
- // src1: [N, OH, OW, IC * KH * KW]
12051
- // result: [N, OC, OH, OW]
12052
- static void ggml_compute_forward_conv_2d_stage_1_f16(
12053
- const struct ggml_compute_params * params,
12054
- const struct ggml_tensor * src0,
12055
- const struct ggml_tensor * src1,
12056
- struct ggml_tensor * dst) {
12057
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12058
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
12059
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12060
-
12061
- int64_t t0 = ggml_perf_time_us();
12062
- UNUSED(t0);
12063
-
12064
- if (params->type == GGML_TASK_INIT) {
12065
- return;
12066
- }
12067
-
12068
- if (params->type == GGML_TASK_FINALIZE) {
12069
- return;
12070
- }
12071
-
12072
- GGML_TENSOR_BINARY_OP_LOCALS;
12073
-
12074
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12075
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
12076
- GGML_ASSERT(nb0 == sizeof(float));
12077
-
12078
- const int N = ne13;
12079
- const int OH = ne12;
12080
- const int OW = ne11;
12081
-
12082
- const int OC = ne03;
12083
- const int IC = ne02;
12084
- const int KH = ne01;
12085
- const int KW = ne00;
12086
-
12087
- const int ith = params->ith;
12088
- const int nth = params->nth;
12089
-
12090
- int64_t m = OC;
12091
- int64_t n = OH * OW;
12092
- int64_t k = IC * KH * KW;
12093
-
12094
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12095
- for (int i = 0; i < N; i++) {
12096
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12097
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
12098
- float * C = (float *)dst->data + i * m * n; // [m, n]
12099
-
12100
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12101
- }
12102
- }
12103
-
12104
- static void ggml_compute_forward_conv_2d_f16_f32(
12105
- const struct ggml_compute_params * params,
12106
- const struct ggml_tensor * src0,
12107
- const struct ggml_tensor * src1,
12108
- struct ggml_tensor * dst) {
12109
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12110
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12111
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12112
-
12113
- int64_t t0 = ggml_perf_time_us();
12114
- UNUSED(t0);
12115
-
12116
- GGML_TENSOR_BINARY_OP_LOCALS
12117
-
12118
- // src1: image [N, IC, IH, IW]
12119
- // src0: kernel [OC, IC, KH, KW]
12120
- // dst: result [N, OC, OH, OW]
12121
- // ne12: IC
12122
- // ne0: OW
12123
- // ne1: OH
12124
- // nk0: KW
12125
- // nk1: KH
12126
- // ne13: N
12127
-
12128
- const int N = ne13;
12129
- const int IC = ne12;
12130
- const int IH = ne11;
12131
- const int IW = ne10;
12132
-
12133
- const int OC = ne03;
12134
- // const int IC = ne02;
12135
- const int KH = ne01;
12136
- const int KW = ne00;
12137
-
12138
- const int OH = ne1;
12139
- const int OW = ne0;
12140
-
12141
- const int ith = params->ith;
12142
- const int nth = params->nth;
12143
-
12144
- // const int nk0 = ne00;
12145
- // const int nk1 = ne01;
12146
-
12147
- // size of the convolution row - the kernel size unrolled across all channels
12148
- // const int ew0 = nk0*nk1*ne02;
12149
- // ew0: IC*KH*KW
12150
-
12151
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12152
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12153
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12154
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12155
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12156
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
12157
-
12158
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12159
- GGML_ASSERT(nb10 == sizeof(float));
12160
-
12161
- if (params->type == GGML_TASK_INIT) {
12162
- memset(params->wdata, 0, params->wsize);
12163
-
12164
- // prepare source data (src1)
12165
- // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
12166
-
12167
- {
12168
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12169
-
12170
- for (int in = 0; in < N; in++) {
12171
- for (int iic = 0; iic < IC; iic++) {
12172
- for (int ioh = 0; ioh < OH; ioh++) {
12173
- for (int iow = 0; iow < OW; iow++) {
12174
-
12175
- // micro kernel
12176
- ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12177
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
12178
-
12179
- for (int ikh = 0; ikh < KH; ikh++) {
12180
- for (int ikw = 0; ikw < KW; ikw++) {
12181
- const int iiw = iow*s0 + ikw*d0 - p0;
12182
- const int iih = ioh*s1 + ikh*d1 - p1;
12183
-
12184
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
12185
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12186
- }
12187
- }
12188
- }
12189
- }
12190
- }
12191
- }
12192
- }
12193
- }
12194
-
12195
- return;
12196
- }
12197
-
12198
- if (params->type == GGML_TASK_FINALIZE) {
12199
- return;
12200
- }
12201
-
12202
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12203
- // wdata: [N*OH*OW, IC*KH*KW]
12204
- // dst: result [N, OC, OH, OW]
12205
- // src0: kernel [OC, IC, KH, KW]
12206
-
12207
- int64_t m = OC;
12208
- int64_t n = OH * OW;
12209
- int64_t k = IC * KH * KW;
12210
-
12211
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12212
- for (int i = 0; i < N; i++) {
12213
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12214
- ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
12215
- float * C = (float *)dst->data + i * m * n; // [m * k]
12216
-
12217
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12218
- }
12219
- }
12220
-
12221
- static void ggml_compute_forward_conv_2d(
12222
- const struct ggml_compute_params * params,
12223
- const struct ggml_tensor * src0,
12224
- const struct ggml_tensor * src1,
12225
- struct ggml_tensor * dst) {
12226
- switch (src0->type) {
12227
- case GGML_TYPE_F16:
12228
- {
12229
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
12230
- } break;
12231
- case GGML_TYPE_F32:
12232
- {
12233
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
12234
- GGML_ASSERT(false);
12235
- } break;
12236
- default:
12237
- {
12238
- GGML_ASSERT(false);
12239
- } break;
12240
- }
12241
- }
12242
-
12243
- static void ggml_compute_forward_conv_2d_stage_0(
12244
- const struct ggml_compute_params * params,
12245
- const struct ggml_tensor * src0,
12246
- const struct ggml_tensor * src1,
12247
- struct ggml_tensor * dst) {
12248
- switch (src0->type) {
12249
- case GGML_TYPE_F16:
12250
- {
12251
- ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
12252
- } break;
12253
- case GGML_TYPE_F32:
12254
- {
12255
- GGML_ASSERT(false);
12256
- } break;
12257
- default:
12258
- {
12259
- GGML_ASSERT(false);
12260
- } break;
12261
- }
12262
- }
12263
-
12264
- static void ggml_compute_forward_conv_2d_stage_1(
11610
+ static void ggml_compute_forward_im2col(
12265
11611
  const struct ggml_compute_params * params,
12266
11612
  const struct ggml_tensor * src0,
12267
11613
  const struct ggml_tensor * src1,
@@ -12269,7 +11615,7 @@ static void ggml_compute_forward_conv_2d_stage_1(
12269
11615
  switch (src0->type) {
12270
11616
  case GGML_TYPE_F16:
12271
11617
  {
12272
- ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
11618
+ ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12273
11619
  } break;
12274
11620
  case GGML_TYPE_F32:
12275
11621
  {
@@ -12454,14 +11800,11 @@ static void ggml_compute_forward_pool_1d(
12454
11800
  ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
12455
11801
  }
12456
11802
 
12457
- // ggml_compute_forward_pool_2d_sk_p0
11803
+ // ggml_compute_forward_pool_2d
12458
11804
 
12459
- static void ggml_compute_forward_pool_2d_sk_p0(
11805
+ static void ggml_compute_forward_pool_2d(
12460
11806
  const struct ggml_compute_params * params,
12461
- const enum ggml_op_pool op,
12462
11807
  const struct ggml_tensor * src,
12463
- const int k0,
12464
- const int k1,
12465
11808
  struct ggml_tensor * dst) {
12466
11809
  assert(src->type == GGML_TYPE_F32);
12467
11810
  assert(params->ith == 0);
@@ -12470,6 +11813,14 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12470
11813
  return;
12471
11814
  }
12472
11815
 
11816
+ const int32_t * opts = (const int32_t *)dst->op_params;
11817
+ enum ggml_op_pool op = opts[0];
11818
+ const int k0 = opts[1];
11819
+ const int k1 = opts[2];
11820
+ const int s0 = opts[3];
11821
+ const int s1 = opts[4];
11822
+ const int p0 = opts[5];
11823
+ const int p1 = opts[6];
12473
11824
  const char * cdata = (const char*)src->data;
12474
11825
  const char * const data_end = cdata + ggml_nbytes(src);
12475
11826
 
@@ -12480,6 +11831,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12480
11831
  float * dplane = (float *)dst->data;
12481
11832
 
12482
11833
  const int ka = k0 * k1;
11834
+ const int offset0 = -p0;
11835
+ const int offset1 = -p1;
12483
11836
 
12484
11837
  while (cdata < data_end) {
12485
11838
  for (int oy = 0; oy < py; ++oy) {
@@ -12492,13 +11845,15 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12492
11845
  case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
12493
11846
  }
12494
11847
 
12495
- const int ix = ox * k0;
12496
- const int iy = oy * k1;
11848
+ const int ix = offset0 + ox * s0;
11849
+ const int iy = offset1 + oy * s1;
12497
11850
 
12498
11851
  for (int ky = 0; ky < k1; ++ky) {
11852
+ if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
12499
11853
  const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
12500
11854
  for (int kx = 0; kx < k0; ++kx) {
12501
11855
  int j = ix + kx;
11856
+ if (j < 0 || j >= src->ne[0]) continue;
12502
11857
  switch (op) {
12503
11858
  case GGML_OP_POOL_AVG: *out += srow[j]; break;
12504
11859
  case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
@@ -12515,31 +11870,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12515
11870
  }
12516
11871
 
12517
11872
  cdata += src->nb[2];
12518
- dplane += pa;
12519
- }
12520
- }
12521
-
12522
- // ggml_compute_forward_pool_2d
12523
-
12524
- static void ggml_compute_forward_pool_2d(
12525
- const struct ggml_compute_params * params,
12526
- const struct ggml_tensor * src0,
12527
- struct ggml_tensor * dst) {
12528
-
12529
- const int32_t * opts = (const int32_t *)dst->op_params;
12530
- enum ggml_op_pool op = opts[0];
12531
- const int k0 = opts[1];
12532
- const int k1 = opts[2];
12533
- const int s0 = opts[3];
12534
- const int s1 = opts[4];
12535
- const int p0 = opts[5];
12536
- const int p1 = opts[6];
12537
- GGML_ASSERT(p0 == 0);
12538
- GGML_ASSERT(p1 == 0); // padding not supported
12539
- GGML_ASSERT(k0 == s0);
12540
- GGML_ASSERT(k1 == s1); // only s = k supported
12541
-
12542
- ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
11873
+ dplane += pa;
11874
+ }
12543
11875
  }
12544
11876
 
12545
11877
  // ggml_compute_forward_upscale
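ggml_compute_forward_pool_2d now reads kernel size, stride and padding from op_params and skips out-of-range taps at the borders instead of asserting s == k and p == 0. A minimal scalar max-pool reference with the same indexing (ix = ox*s0 - p0, padded positions contribute nothing), single channel, plain C and not ggml's code:

#include <float.h>

// single-channel 2D max pooling with stride and padding (reference sketch)
// src: [IH][IW], dst: [OH][OW] where O = (I + 2*p - k)/s + 1
static void max_pool_2d_ref(const float * src, float * dst,
                            int IW, int IH, int k0, int k1, int s0, int s1, int p0, int p1) {
    const int OW = (IW + 2*p0 - k0)/s0 + 1;
    const int OH = (IH + 2*p1 - k1)/s1 + 1;

    for (int oy = 0; oy < OH; oy++) {
        for (int ox = 0; ox < OW; ox++) {
            const int ix = ox*s0 - p0;
            const int iy = oy*s1 - p1;
            float best = -FLT_MAX;
            for (int ky = 0; ky < k1; ky++) {
                if (iy + ky < 0 || iy + ky >= IH) continue; // padded rows are skipped
                for (int kx = 0; kx < k0; kx++) {
                    if (ix + kx < 0 || ix + kx >= IW) continue; // padded columns too
                    const float v = src[(iy + ky)*IW + (ix + kx)];
                    if (v > best) best = v;
                }
            }
            dst[oy*OW + ox] = best;
        }
    }
}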
@@ -13743,6 +13075,10 @@ static void ggml_compute_forward_unary(
13743
13075
  {
13744
13076
  ggml_compute_forward_silu(params, src0, dst);
13745
13077
  } break;
13078
+ case GGML_UNARY_OP_LEAKY:
13079
+ {
13080
+ ggml_compute_forward_leaky(params, src0, dst);
13081
+ } break;
13746
13082
  default:
13747
13083
  {
13748
13084
  GGML_ASSERT(false);
@@ -14496,33 +13832,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14496
13832
  {
14497
13833
  ggml_compute_forward_clamp(params, tensor->src[0], tensor);
14498
13834
  } break;
14499
- case GGML_OP_CONV_1D:
14500
- {
14501
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
14502
- } break;
14503
- case GGML_OP_CONV_1D_STAGE_0:
14504
- {
14505
- ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14506
- } break;
14507
- case GGML_OP_CONV_1D_STAGE_1:
14508
- {
14509
- ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
14510
- } break;
14511
13835
  case GGML_OP_CONV_TRANSPOSE_1D:
14512
13836
  {
14513
13837
  ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
14514
13838
  } break;
14515
- case GGML_OP_CONV_2D:
14516
- {
14517
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14518
- } break;
14519
- case GGML_OP_CONV_2D_STAGE_0:
13839
+ case GGML_OP_IM2COL:
14520
13840
  {
14521
- ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14522
- } break;
14523
- case GGML_OP_CONV_2D_STAGE_1:
14524
- {
14525
- ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
13841
+ ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
14526
13842
  } break;
14527
13843
  case GGML_OP_CONV_TRANSPOSE_2D:
14528
13844
  {
@@ -14651,62 +13967,109 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14651
13967
 
14652
13968
  ////////////////////////////////////////////////////////////////////////////////
14653
13969
 
14654
- static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
13970
+ static size_t ggml_hash_size(size_t min_sz) {
13971
+ // next primes after powers of two
13972
+ static const size_t primes[] = {
13973
+ 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
13974
+ 2053, 4099, 8209, 16411, 32771, 65537, 131101,
13975
+ 262147, 524309, 1048583, 2097169, 4194319, 8388617,
13976
+ 16777259, 33554467, 67108879, 134217757, 268435459,
13977
+ 536870923, 1073741827, 2147483659
13978
+ };
13979
+ static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
13980
+
13981
+ // find the smallest prime that is larger or equal to min_sz
13982
+ size_t l = 0;
13983
+ size_t r = n_primes;
13984
+ while (l < r) {
13985
+ size_t m = (l + r)/2;
13986
+ if (primes[m] < min_sz) {
13987
+ l = m + 1;
13988
+ } else {
13989
+ r = m;
13990
+ }
13991
+ }
13992
+ size_t sz = l < n_primes ? primes[l] : min_sz | 1;
13993
+ return sz;
13994
+ }
14655
13995
 
14656
- static size_t hash(void * p) {
14657
- return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
13996
+ static size_t ggml_hash(const void * p) {
13997
+ return (size_t)p;
14658
13998
  }
14659
13999
 
14660
- static size_t hash_find(void * hash_table[], void * p) {
14661
- size_t h = hash(p);
14000
+ size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14001
+ size_t h = ggml_hash(key) % hash_set.size;
14662
14002
 
14663
14003
  // linear probing
14664
14004
  size_t i = h;
14665
- while (hash_table[i] != NULL && hash_table[i] != p) {
14666
- i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
14005
+ while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
14006
+ i = (i + 1) % hash_set.size;
14667
14007
  if (i == h) {
14668
14008
  // visited all hash table entries -> not found
14669
- return GGML_GRAPH_HASHTABLE_SIZE;
14009
+ return GGML_HASHTABLE_FULL;
14670
14010
  }
14671
14011
  }
14672
14012
  return i;
14673
14013
  }
14674
14014
 
14675
- static bool hash_insert(void * hash_table[], void * p) {
14676
- size_t i = hash_find(hash_table, p);
14015
+ bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14016
+ size_t i = ggml_hash_find(hash_set, key);
14017
+ return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
14018
+ }
14019
+
14020
+ size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14021
+ size_t i = ggml_hash_find(hash_set, key);
14677
14022
 
14678
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14023
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14679
14024
 
14680
- if (hash_table[i] == p) {
14681
- return true;
14025
+ if (hash_set.keys[i] == key) {
14026
+ return GGML_HASHTABLE_ALREADY_EXISTS;
14682
14027
  }
14683
14028
 
14684
14029
  // insert
14685
- GGML_ASSERT(hash_table[i] == NULL);
14686
- hash_table[i] = p;
14687
- return false;
14030
+ GGML_ASSERT(hash_set.keys[i] == NULL);
14031
+ hash_set.keys[i] = key;
14032
+ return i;
14033
+ }
14034
+
14035
+ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14036
+ size_t i = ggml_hash_find(hash_set, key);
14037
+
14038
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14039
+
14040
+ hash_set.keys[i] = key;
14041
+ return i;
14042
+ }
14043
+
14044
+ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
14045
+ size = ggml_hash_size(size);
14046
+ struct ggml_hash_set result;
14047
+ result.size = size;
14048
+ result.keys = malloc(sizeof(struct ggml_tensor *) * size);
14049
+ memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
14050
+ return result;
14688
14051
  }
14689
14052
 
14690
- static bool hash_contains(void * hash_table[], void * p) {
14691
- size_t i = hash_find(hash_table, p);
14692
- return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
14053
+ static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
14054
+ free(hash_set.keys);
14693
14055
  }
14694
14056
 
14695
14057
  struct hash_map {
14696
- void * keys[GGML_GRAPH_HASHTABLE_SIZE];
14697
- void * vals[GGML_GRAPH_HASHTABLE_SIZE];
14058
+ struct ggml_hash_set set;
14059
+ struct ggml_tensor ** vals;
14698
14060
  };
14699
14061
 
14700
- static struct hash_map * new_hash_map(void) {
14062
+ static struct hash_map * ggml_new_hash_map(size_t size) {
14701
14063
  struct hash_map * result = malloc(sizeof(struct hash_map));
14702
- for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
14703
- result->keys[i] = NULL;
14704
- result->vals[i] = NULL;
14705
- }
14064
+ result->set = ggml_hash_set_new(size);
14065
+ result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
14066
+ memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
14706
14067
  return result;
14707
14068
  }
14708
14069
 
14709
- static void free_hash_map(struct hash_map * map) {
14070
+ static void ggml_hash_map_free(struct hash_map * map) {
14071
+ ggml_hash_set_free(map->set);
14072
+ free(map->vals);
14710
14073
  free(map);
14711
14074
  }
14712
14075
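The fixed-size visited table is replaced by ggml_hash_set: capacity is rounded up to a prime by ggml_hash_size, lookups probe linearly from hash(key) % size until they reach the key or an empty slot, and insert reports GGML_HASHTABLE_ALREADY_EXISTS for duplicates. A compact standalone model of the same open-addressing scheme with pointer keys (illustrative, not the library API):

#include <stdlib.h>

#define HSET_FULL ((size_t)-1)

struct hset {
    size_t        size;
    const void ** keys; // NULL marks an empty slot
};

static struct hset hset_new(size_t size) {
    struct hset s = { size, calloc(size, sizeof(const void *)) };
    return s;
}

// Linear probing: start at hash(key) % size and walk forward until the key or a hole.
static size_t hset_find(struct hset s, const void * key) {
    const size_t h = (size_t)key % s.size;
    size_t i = h;
    while (s.keys[i] != NULL && s.keys[i] != key) {
        i = (i + 1) % s.size;
        if (i == h) {
            return HSET_FULL; // wrapped around: every slot is occupied
        }
    }
    return i;
}

// Returns 1 if the key was newly inserted, 0 if it was already present.
static int hset_insert(struct hset s, const void * key) {
    const size_t i = hset_find(s, key);
    if (i == HSET_FULL) {
        abort(); // caller sized the table too small
    }
    if (s.keys[i] == key) {
        return 0;
    }
    s.keys[i] = key;
    return 1;
}

A prime table size keeps aligned pointer keys, whose low bits are mostly zero, from clustering into a handful of slots, which is presumably why ggml_hash_size picks the next prime after each power of two.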
 
@@ -14726,7 +14089,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14726
14089
  return node;
14727
14090
  }
14728
14091
 
14729
- if (!hash_contains(graph->visited_hash_table, node)) {
14092
+ if (!ggml_hash_contains(graph->visited_hash_table, node)) {
14730
14093
  return node;
14731
14094
  }
14732
14095
 
@@ -14741,17 +14104,17 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14741
14104
  return node;
14742
14105
  }
14743
14106
 
14744
- size_t i = hash_find(replacements->keys, node);
14745
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14746
- if (replacements->keys[i] == node) {
14747
- return (struct ggml_tensor *) replacements->vals[i];
14107
+ size_t i = ggml_hash_find(replacements->set, node);
14108
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
14109
+ if (replacements->set.keys[i] == node) {
14110
+ return replacements->vals[i];
14748
14111
  }
14749
14112
 
14750
14113
  struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14751
14114
 
14752
14115
  // insert clone into replacements
14753
- GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
14754
- replacements->keys[i] = node;
14116
+ GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
14117
+ replacements->set.keys[i] = node;
14755
14118
  replacements->vals[i] = clone;
14756
14119
 
14757
14120
  clone->op = node->op;
@@ -14788,26 +14151,26 @@ void ggml_build_backward_gradient_checkpointing(
14788
14151
  struct ggml_cgraph * gb_tmp,
14789
14152
  struct ggml_tensor * * checkpoints,
14790
14153
  int n_checkpoints) {
14791
- *gb_tmp = *gf;
14154
+ ggml_graph_cpy(gf, gb_tmp);
14792
14155
  ggml_build_backward_expand(ctx, gf, gb_tmp, true);
14793
14156
 
14794
14157
  if (n_checkpoints <= 0) {
14795
- *gb = *gb_tmp;
14158
+ ggml_graph_cpy(gb_tmp, gb);
14796
14159
  return;
14797
14160
  }
14798
14161
 
14799
- struct hash_map * replacements = new_hash_map();
14162
+ struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
14800
14163
 
14801
14164
  // insert checkpoints in replacements
14802
14165
  for (int i = 0; i < n_checkpoints; ++i) {
14803
- size_t k = hash_find(replacements->keys, checkpoints[i]);
14804
- GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14805
- GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
14806
- replacements->keys[k] = checkpoints[i];
14807
- replacements->vals[k] = checkpoints[i];
14166
+ size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
14167
+ GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
14168
+ GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
14169
+ replacements->set.keys[k] = checkpoints[i];
14170
+ replacements->vals[k] = checkpoints[i];
14808
14171
  }
14809
14172
 
14810
- *gb = *gf;
14173
+ ggml_graph_cpy(gf, gb);
14811
14174
  // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
14812
14175
  // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
14813
14176
  // by recomputing them from checkpoints
@@ -14824,21 +14187,21 @@ void ggml_build_backward_gradient_checkpointing(
14824
14187
  ggml_build_forward_expand(gb, node);
14825
14188
  }
14826
14189
 
14827
- free_hash_map(replacements);
14190
+ ggml_hash_map_free(replacements);
14828
14191
  }
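Seeding the replacement map with the checkpoints means everything between two checkpoints is recomputed during the backward pass instead of being kept alive from the forward pass. The trade-off can be seen on a toy chain of layers: remember only every k-th activation and replay the skipped layers on demand (a conceptual sketch, not the ggml rewriting logic).

// Toy "layer": one step of a forward chain (purely illustrative).
static float layer_fwd(int i, float x) {
    return 1.01f * x + 0.001f * (float)i;
}

// Forward pass that keeps only every k-th activation (the checkpoints).
// ckpt must have at least (n + k - 1) / k slots.
static void forward_checkpointed(float x0, int n, int k, float * ckpt) {
    float x = x0;
    for (int i = 0; i < n; ++i) {
        if (i % k == 0) {
            ckpt[i / k] = x; // remember the input of layer i
        }
        x = layer_fwd(i, x);
    }
}

// Recover the input of layer j by replaying from the nearest earlier checkpoint.
static float activation_at(int j, int k, const float * ckpt) {
    float x = ckpt[j / k];
    for (int i = (j / k) * k; i < j; ++i) {
        x = layer_fwd(i, x); // recompute the layers that were not stored
    }
    return x;
}

The backward pass then asks activation_at(j, k, ckpt) for the input of layer j, paying at most k-1 extra forward steps per query in exchange for storing roughly n/k activations instead of n.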
14829
14192
 
14830
14193
  // functions to change gradients considering the case that input a might be initial gradient with zero value
14831
14194
 
14832
- static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14833
- if (hash_contains(zero_table, a)) {
14195
+ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14196
+ if (ggml_hash_contains(zero_table, a)) {
14834
14197
  return b;
14835
14198
  } else {
14836
14199
  return ggml_add_impl(ctx, a, b, false);
14837
14200
  }
14838
14201
  }
14839
14202
 
14840
- static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) {
14841
- if (hash_contains(zero_table, a)) {
14203
+ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14204
+ if (ggml_hash_contains(zero_table, a)) {
14842
14205
  struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
14843
14206
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
14844
14207
  } else {
@@ -14846,23 +14209,23 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
14846
14209
  }
14847
14210
  }
14848
14211
 
14849
- static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14850
- if (hash_contains(zero_table, a)) {
14212
+ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14213
+ if (ggml_hash_contains(zero_table, a)) {
14851
14214
  return ggml_repeat(ctx, b, a);
14852
14215
  } else {
14853
14216
  return ggml_add1_impl(ctx, a, b, false);
14854
14217
  }
14855
14218
  }
14856
14219
 
14857
- static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14858
- if (hash_contains(zero_table, a)) {
14220
+ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14221
+ if (ggml_hash_contains(zero_table, a)) {
14859
14222
  return ggml_neg(ctx, b);
14860
14223
  } else {
14861
14224
  return ggml_sub_impl(ctx, a, b, false);
14862
14225
  }
14863
14226
  }
14864
14227
 
14865
- static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) {
14228
+ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
14866
14229
  struct ggml_tensor * src0 = tensor->src[0];
14867
14230
  struct ggml_tensor * src1 = tensor->src[1];
14868
14231
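The *_or_set helpers above exist because most gradients start out as the initial zero tensor: while `a` is still in zero_table, `a + b` can collapse to just `b` and `a - b` to `-b`, avoiding a pointless add against zeros. The scalar analogue of this first-write-replaces, later-writes-accumulate pattern (illustrative):

#include <stdbool.h>
#include <stddef.h>

// Accumulate a contribution into a gradient buffer. While *fresh is true the
// gradient is known to be all zeros, so the first contribution is a plain copy
// instead of a read-modify-write.
static void grad_accumulate(float * grad, const float * contrib, size_t n, bool * fresh) {
    if (*fresh) {
        for (size_t i = 0; i < n; ++i) grad[i] = contrib[i];  // "set"
        *fresh = false;
    } else {
        for (size_t i = 0; i < n; ++i) grad[i] += contrib[i]; // "add"
    }
}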
 
@@ -15457,31 +14820,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15457
14820
  {
15458
14821
  GGML_ASSERT(false); // TODO: not implemented
15459
14822
  } break;
15460
- case GGML_OP_CONV_1D:
15461
- {
15462
- GGML_ASSERT(false); // TODO: not implemented
15463
- } break;
15464
- case GGML_OP_CONV_1D_STAGE_0:
15465
- {
15466
- GGML_ASSERT(false); // TODO: not implemented
15467
- } break;
15468
- case GGML_OP_CONV_1D_STAGE_1:
15469
- {
15470
- GGML_ASSERT(false); // TODO: not implemented
15471
- } break;
15472
14823
  case GGML_OP_CONV_TRANSPOSE_1D:
15473
14824
  {
15474
14825
  GGML_ASSERT(false); // TODO: not implemented
15475
14826
  } break;
15476
- case GGML_OP_CONV_2D:
15477
- {
15478
- GGML_ASSERT(false); // TODO: not implemented
15479
- } break;
15480
- case GGML_OP_CONV_2D_STAGE_0:
15481
- {
15482
- GGML_ASSERT(false); // TODO: not implemented
15483
- } break;
15484
- case GGML_OP_CONV_2D_STAGE_1:
14827
+ case GGML_OP_IM2COL:
15485
14828
  {
15486
14829
  GGML_ASSERT(false); // TODO: not implemented
15487
14830
  } break;
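The dedicated CONV_1D/CONV_2D stage ops are gone from the forward and backward dispatchers alike; convolutions are now lowered through the single GGML_OP_IM2COL followed by a matrix multiplication. im2col itself only unrolls each kernel-sized patch of the input into one row of a matrix, after which the convolution is a plain matmul against the flattened weights. A single-channel sketch, assuming stride 1 and no padding (illustrative, not the ggml kernel):

#include <stddef.h>

// Unroll KHxKW patches of an HxW single-channel image into rows of `cols`.
// cols must hold (H-KH+1)*(W-KW+1) rows of KH*KW floats (valid padding, stride 1).
// Multiplying cols [OH*OW, KH*KW] by a weight matrix [KH*KW, OC] then yields
// the convolution output [OH*OW, OC] in one matmul.
static void im2col_1ch(const float * img, int H, int W,
                       int KH, int KW, float * cols) {
    const int OH = H - KH + 1;
    const int OW = W - KW + 1;
    for (int oy = 0; oy < OH; ++oy) {
        for (int ox = 0; ox < OW; ++ox) {
            float * row = cols + ((size_t)oy * OW + ox) * (KH * KW);
            for (int ky = 0; ky < KH; ++ky) {
                for (int kx = 0; kx < KW; ++kx) {
                    row[ky * KW + kx] = img[(size_t)(oy + ky) * W + (ox + kx)];
                }
            }
        }
    }
}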
@@ -15695,7 +15038,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15695
15038
  }
15696
15039
 
15697
15040
  // check if already visited
15698
- if (hash_insert(cgraph->visited_hash_table, node)) {
15041
+ if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
15699
15042
  return;
15700
15043
  }
15701
15044
 
@@ -15711,7 +15054,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15711
15054
 
15712
15055
  if (node->op == GGML_OP_NONE && node->grad == NULL) {
15713
15056
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
15714
- GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15057
+ GGML_ASSERT(cgraph->n_leafs < cgraph->size);
15715
15058
 
15716
15059
  if (strlen(node->name) == 0) {
15717
15060
  ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
@@ -15720,22 +15063,24 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15720
15063
  cgraph->leafs[cgraph->n_leafs] = node;
15721
15064
  cgraph->n_leafs++;
15722
15065
  } else {
15723
- GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15066
+ GGML_ASSERT(cgraph->n_nodes < cgraph->size);
15724
15067
 
15725
15068
  if (strlen(node->name) == 0) {
15726
15069
  ggml_format_name(node, "node_%d", cgraph->n_nodes);
15727
15070
  }
15728
15071
 
15729
15072
  cgraph->nodes[cgraph->n_nodes] = node;
15730
- cgraph->grads[cgraph->n_nodes] = node->grad;
15073
+ if (cgraph->grads) {
15074
+ cgraph->grads[cgraph->n_nodes] = node->grad;
15075
+ }
15731
15076
  cgraph->n_nodes++;
15732
15077
  }
15733
15078
  }
15734
15079
 
15735
15080
  static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
15736
15081
  if (!expand) {
15737
- cgraph->n_nodes = 0;
15738
- cgraph->n_leafs = 0;
15082
+ // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
15083
+ ggml_graph_clear(cgraph);
15739
15084
  }
15740
15085
 
15741
15086
  const int n0 = cgraph->n_nodes;
@@ -15756,25 +15101,6 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
15756
15101
  ggml_build_forward_impl(cgraph, tensor, true);
15757
15102
  }
15758
15103
 
15759
- struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
15760
- struct ggml_cgraph result = {
15761
- /*.n_nodes =*/ 0,
15762
- /*.n_leafs =*/ 0,
15763
- /*.nodes =*/ { NULL },
15764
- /*.grads =*/ { NULL },
15765
- /*.leafs =*/ { NULL },
15766
- /*.hash_table =*/ { NULL },
15767
- /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15768
- /*.perf_runs =*/ 0,
15769
- /*.perf_cycles =*/ 0,
15770
- /*.perf_time_us =*/ 0,
15771
- };
15772
-
15773
- ggml_build_forward_impl(&result, tensor, false);
15774
-
15775
- return result;
15776
- }
15777
-
15778
15104
  void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
15779
15105
  GGML_ASSERT(gf->n_nodes > 0);
15780
15106
 
@@ -15791,11 +15117,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15791
15117
  }
15792
15118
 
15793
15119
  // remember original gradients which start with zero values
15794
- void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE);
15795
- memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE);
15120
+ struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
15796
15121
  for (int i = 0; i < gf->n_nodes; i++) {
15797
15122
  if (gf->grads[i]) {
15798
- hash_insert(zero_table, gf->grads[i]);
15123
+ ggml_hash_insert(zero_table, gf->grads[i]);
15799
15124
  }
15800
15125
  }
15801
15126
 
@@ -15818,26 +15143,54 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15818
15143
  }
15819
15144
  }
15820
15145
 
15821
- free(zero_table);
15146
+ ggml_hash_set_free(zero_table);
15822
15147
  }
15823
15148
 
15824
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
15825
- struct ggml_cgraph result = *gf;
15826
- ggml_build_backward_expand(ctx, gf, &result, keep);
15827
- return result;
15149
+ static size_t ggml_graph_nbytes(size_t size, bool grads) {
15150
+ size_t nbytes = sizeof(struct ggml_cgraph);
15151
+ nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
15152
+ if (grads) {
15153
+ nbytes += size * sizeof(struct ggml_tensor *); // grads
15154
+ }
15155
+ nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
15156
+ return nbytes;
15828
15157
  }
15829
15158
 
15830
- struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15831
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15159
+ size_t ggml_graph_overhead_custom(size_t size, bool grads) {
15160
+ return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
15161
+ }
15162
+
15163
+ size_t ggml_graph_overhead(void) {
15164
+ return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
15165
+ }
15166
+
15167
+ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
15168
+ const size_t obj_size = ggml_graph_nbytes(size, grads);
15169
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15832
15170
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15833
15171
 
15172
+ struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
15173
+
15174
+ size_t hash_size = ggml_hash_size(size * 2);
15175
+ struct ggml_tensor ** nodes_ptr = data_start;
15176
+ struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
15177
+ struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
15178
+ struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
15179
+
15180
+ // check that we allocated the correct amount of memory
15181
+ assert(obj_size == (size_t) (
15182
+ (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
15183
+
15184
+ memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
15185
+
15834
15186
  *cgraph = (struct ggml_cgraph) {
15187
+ /*.size =*/ size,
15835
15188
  /*.n_nodes =*/ 0,
15836
15189
  /*.n_leafs =*/ 0,
15837
- /*.nodes =*/ { NULL },
15838
- /*.grads =*/ { NULL },
15839
- /*.leafs =*/ { NULL },
15840
- /*.hash_table =*/ { NULL },
15190
+ /*.nodes =*/ nodes_ptr,
15191
+ /*.grads =*/ grads_ptr,
15192
+ /*.leafs =*/ leafs_ptr,
15193
+ /*.hash_table =*/ { hash_size, hash_keys_ptr },
15841
15194
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15842
15195
  /*.perf_runs =*/ 0,
15843
15196
  /*.perf_cycles =*/ 0,
@@ -15847,14 +15200,85 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15847
15200
  return cgraph;
15848
15201
  }
15849
15202
 
15850
- struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15851
- struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15852
- ggml_build_forward_impl(cgraph, tensor, false);
15203
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15204
+ return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15205
+ }
15206
+
15207
+ struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15208
+ const size_t obj_size = sizeof(struct ggml_cgraph);
15209
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15210
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15211
+
15212
+ *cgraph = (struct ggml_cgraph) {
15213
+ /*.size =*/ 0,
15214
+ /*.n_nodes =*/ i1 - i0,
15215
+ /*.n_leafs =*/ 0,
15216
+ /*.nodes =*/ cgraph0->nodes + i0,
15217
+ /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
15218
+ /*.leafs =*/ NULL,
15219
+ /*.hash_table =*/ { 0, NULL },
15220
+ /*.order =*/ cgraph0->order,
15221
+ /*.perf_runs =*/ 0,
15222
+ /*.perf_cycles =*/ 0,
15223
+ /*.perf_time_us =*/ 0,
15224
+ };
15225
+
15853
15226
  return cgraph;
15854
15227
  }
15855
15228
 
15856
- size_t ggml_graph_overhead(void) {
15857
- return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15229
+ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
15230
+ GGML_ASSERT(dst->size >= src->n_leafs);
15231
+ GGML_ASSERT(dst->size >= src->n_nodes);
15232
+ GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
15233
+
15234
+ dst->n_leafs = src->n_leafs;
15235
+ dst->n_nodes = src->n_nodes;
15236
+ dst->order = src->order;
15237
+
15238
+ for (int i = 0; i < src->n_leafs; ++i) {
15239
+ dst->leafs[i] = src->leafs[i];
15240
+ }
15241
+
15242
+ for (int i = 0; i < src->n_nodes; ++i) {
15243
+ dst->nodes[i] = src->nodes[i];
15244
+ }
15245
+
15246
+ if (src->grads) {
15247
+ GGML_ASSERT(dst->grads != NULL);
15248
+ for (int i = 0; i < src->n_nodes; ++i) {
15249
+ dst->grads[i] = src->grads[i];
15250
+ }
15251
+ }
15252
+
15253
+ for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
15254
+ if (src->visited_hash_table.keys[i]) {
15255
+ ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
15256
+ }
15257
+ }
15258
+ }
15259
+
15260
+ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
15261
+ struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
15262
+ ggml_graph_cpy(cgraph, result);
15263
+ return result;
15264
+ }
15265
+
15266
+ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
15267
+ GGML_ASSERT(cgraph->grads != NULL);
15268
+
15269
+ for (int i = 0; i < cgraph->n_nodes; i++) {
15270
+ struct ggml_tensor * grad = cgraph->grads[i];
15271
+
15272
+ if (grad) {
15273
+ ggml_set_zero(grad);
15274
+ }
15275
+ }
15276
+ }
15277
+
15278
+ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
15279
+ cgraph->n_leafs = 0;
15280
+ cgraph->n_nodes = 0;
15281
+ memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
15858
15282
  }
15859
15283
 
15860
15284
  //
@@ -15966,45 +15390,266 @@ static void clear_numa_thread_affinity(void) {
15966
15390
  strerror(rv));
15967
15391
  }
15968
15392
 
15969
- CPU_FREE(cpus);
15970
- }
15971
- #else
15972
- // TODO: Windows etc.
15973
- // (the linux implementation may also work on BSD, someone should test)
15974
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15975
- static void clear_numa_thread_affinity(void) {}
15976
- #endif
15977
-
15978
- struct ggml_compute_state_shared {
15979
- const struct ggml_cgraph * cgraph;
15980
- const struct ggml_cplan * cplan;
15981
-
15982
- int64_t perf_node_start_cycles;
15983
- int64_t perf_node_start_time_us;
15984
-
15985
- const int n_threads;
15986
-
15987
- // synchronization primitives
15988
- atomic_int n_active; // num active threads
15989
- atomic_int node_n; // active graph node
15990
-
15991
- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15992
- void * abort_callback_data;
15993
- };
15994
-
15995
- struct ggml_compute_state {
15996
- ggml_thread_t thrd;
15997
- int ith;
15998
- struct ggml_compute_state_shared * shared;
15999
- };
16000
-
16001
- static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16002
- int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16003
- int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15393
+ CPU_FREE(cpus);
15394
+ }
15395
+ #else
15396
+ // TODO: Windows etc.
15397
+ // (the linux implementation may also work on BSD, someone should test)
15398
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15399
+ static void clear_numa_thread_affinity(void) {}
15400
+ #endif
15401
+
15402
+ struct ggml_compute_state_shared {
15403
+ const struct ggml_cgraph * cgraph;
15404
+ const struct ggml_cplan * cplan;
15405
+
15406
+ int64_t perf_node_start_cycles;
15407
+ int64_t perf_node_start_time_us;
15408
+
15409
+ const int n_threads;
15410
+
15411
+ // synchronization primitives
15412
+ atomic_int n_active; // num active threads
15413
+ atomic_int node_n; // active graph node
15414
+
15415
+ bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15416
+ void * abort_callback_data;
15417
+ };
15418
+
15419
+ struct ggml_compute_state {
15420
+ ggml_thread_t thrd;
15421
+ int ith;
15422
+ struct ggml_compute_state_shared * shared;
15423
+ };
15424
+
15425
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
15426
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
15427
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15428
+
15429
+ node->perf_runs++;
15430
+ node->perf_cycles += cycles_cur;
15431
+ node->perf_time_us += time_us_cur;
15432
+ }
15433
+
15434
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15435
+ int n_tasks = 0;
15436
+
15437
+ switch (node->op) {
15438
+ case GGML_OP_CPY:
15439
+ case GGML_OP_DUP:
15440
+ case GGML_OP_ADD:
15441
+ case GGML_OP_ADD1:
15442
+ case GGML_OP_ACC:
15443
+ {
15444
+ n_tasks = n_threads;
15445
+ } break;
15446
+ case GGML_OP_SUB:
15447
+ case GGML_OP_DIV:
15448
+ case GGML_OP_SQR:
15449
+ case GGML_OP_SQRT:
15450
+ case GGML_OP_LOG:
15451
+ case GGML_OP_SUM:
15452
+ case GGML_OP_SUM_ROWS:
15453
+ case GGML_OP_MEAN:
15454
+ case GGML_OP_ARGMAX:
15455
+ case GGML_OP_REPEAT:
15456
+ case GGML_OP_REPEAT_BACK:
15457
+ {
15458
+ n_tasks = 1;
15459
+ } break;
15460
+ case GGML_OP_UNARY:
15461
+ switch (ggml_get_unary_op(node)) {
15462
+ case GGML_UNARY_OP_ABS:
15463
+ case GGML_UNARY_OP_SGN:
15464
+ case GGML_UNARY_OP_NEG:
15465
+ case GGML_UNARY_OP_STEP:
15466
+ case GGML_UNARY_OP_TANH:
15467
+ case GGML_UNARY_OP_ELU:
15468
+ case GGML_UNARY_OP_RELU:
15469
+ case GGML_UNARY_OP_LEAKY:
15470
+ {
15471
+ n_tasks = 1;
15472
+ } break;
15473
+
15474
+ case GGML_UNARY_OP_GELU:
15475
+ case GGML_UNARY_OP_GELU_QUICK:
15476
+ case GGML_UNARY_OP_SILU:
15477
+ {
15478
+ n_tasks = n_threads;
15479
+ } break;
15480
+ }
15481
+ break;
15482
+ case GGML_OP_SILU_BACK:
15483
+ case GGML_OP_MUL:
15484
+ case GGML_OP_NORM:
15485
+ case GGML_OP_RMS_NORM:
15486
+ case GGML_OP_RMS_NORM_BACK:
15487
+ case GGML_OP_GROUP_NORM:
15488
+ case GGML_OP_CONCAT:
15489
+ {
15490
+ n_tasks = n_threads;
15491
+ } break;
15492
+ case GGML_OP_MUL_MAT:
15493
+ {
15494
+ n_tasks = n_threads;
15495
+
15496
+ // TODO: use different scheduling for different matrix sizes
15497
+ //const int nr0 = ggml_nrows(node->src[0]);
15498
+ //const int nr1 = ggml_nrows(node->src[1]);
15499
+
15500
+ //n_tasks = MIN(n_threads, MAX(1, nr0/128));
15501
+ //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
15502
+
15503
+ #if defined(GGML_USE_CUBLAS)
15504
+ if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
15505
+ n_tasks = 1; // TODO: this actually is doing nothing
15506
+ // the threads are still spinning
15507
+ }
15508
+ #elif defined(GGML_USE_CLBLAST)
15509
+ if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
15510
+ n_tasks = 1; // TODO: this actually is doing nothing
15511
+ // the threads are still spinning
15512
+ }
15513
+ #endif
15514
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
15515
+ if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
15516
+ n_tasks = 1; // TODO: this actually is doing nothing
15517
+ // the threads are still spinning
15518
+ }
15519
+ #endif
15520
+ } break;
15521
+ case GGML_OP_OUT_PROD:
15522
+ {
15523
+ n_tasks = n_threads;
15524
+ } break;
15525
+ case GGML_OP_SCALE:
15526
+ case GGML_OP_SET:
15527
+ case GGML_OP_CONT:
15528
+ case GGML_OP_RESHAPE:
15529
+ case GGML_OP_VIEW:
15530
+ case GGML_OP_PERMUTE:
15531
+ case GGML_OP_TRANSPOSE:
15532
+ case GGML_OP_GET_ROWS:
15533
+ case GGML_OP_GET_ROWS_BACK:
15534
+ case GGML_OP_DIAG:
15535
+ {
15536
+ n_tasks = 1;
15537
+ } break;
15538
+ case GGML_OP_DIAG_MASK_ZERO:
15539
+ case GGML_OP_DIAG_MASK_INF:
15540
+ case GGML_OP_SOFT_MAX:
15541
+ case GGML_OP_SOFT_MAX_BACK:
15542
+ case GGML_OP_ROPE:
15543
+ case GGML_OP_ROPE_BACK:
15544
+ case GGML_OP_ADD_REL_POS:
15545
+ {
15546
+ n_tasks = n_threads;
15547
+ } break;
15548
+ case GGML_OP_ALIBI:
15549
+ {
15550
+ n_tasks = 1; //TODO
15551
+ } break;
15552
+ case GGML_OP_CLAMP:
15553
+ {
15554
+ n_tasks = 1; //TODO
15555
+ } break;
15556
+ case GGML_OP_CONV_TRANSPOSE_1D:
15557
+ {
15558
+ n_tasks = n_threads;
15559
+ } break;
15560
+ case GGML_OP_IM2COL:
15561
+ {
15562
+ n_tasks = n_threads;
15563
+ } break;
15564
+ case GGML_OP_CONV_TRANSPOSE_2D:
15565
+ {
15566
+ n_tasks = n_threads;
15567
+ } break;
15568
+ case GGML_OP_POOL_1D:
15569
+ case GGML_OP_POOL_2D:
15570
+ {
15571
+ n_tasks = 1;
15572
+ } break;
15573
+ case GGML_OP_UPSCALE:
15574
+ {
15575
+ n_tasks = n_threads;
15576
+ } break;
15577
+ case GGML_OP_FLASH_ATTN:
15578
+ {
15579
+ n_tasks = n_threads;
15580
+ } break;
15581
+ case GGML_OP_FLASH_FF:
15582
+ {
15583
+ n_tasks = n_threads;
15584
+ } break;
15585
+ case GGML_OP_FLASH_ATTN_BACK:
15586
+ {
15587
+ n_tasks = n_threads;
15588
+ } break;
15589
+ case GGML_OP_WIN_PART:
15590
+ case GGML_OP_WIN_UNPART:
15591
+ case GGML_OP_GET_REL_POS:
15592
+ case GGML_OP_MAP_UNARY:
15593
+ case GGML_OP_MAP_BINARY:
15594
+ case GGML_OP_MAP_CUSTOM1_F32:
15595
+ case GGML_OP_MAP_CUSTOM2_F32:
15596
+ case GGML_OP_MAP_CUSTOM3_F32:
15597
+ {
15598
+ n_tasks = 1;
15599
+ } break;
15600
+ case GGML_OP_MAP_CUSTOM1:
15601
+ {
15602
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
15603
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15604
+ n_tasks = n_threads;
15605
+ } else {
15606
+ n_tasks = MIN(p->n_tasks, n_threads);
15607
+ }
15608
+ } break;
15609
+ case GGML_OP_MAP_CUSTOM2:
15610
+ {
15611
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
15612
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15613
+ n_tasks = n_threads;
15614
+ } else {
15615
+ n_tasks = MIN(p->n_tasks, n_threads);
15616
+ }
15617
+ } break;
15618
+ case GGML_OP_MAP_CUSTOM3:
15619
+ {
15620
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
15621
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15622
+ n_tasks = n_threads;
15623
+ } else {
15624
+ n_tasks = MIN(p->n_tasks, n_threads);
15625
+ }
15626
+ } break;
15627
+ case GGML_OP_CROSS_ENTROPY_LOSS:
15628
+ {
15629
+ n_tasks = n_threads;
15630
+ } break;
15631
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
15632
+ {
15633
+ n_tasks = n_threads;
15634
+ } break;
15635
+ case GGML_OP_NONE:
15636
+ {
15637
+ n_tasks = 1;
15638
+ } break;
15639
+ case GGML_OP_COUNT:
15640
+ {
15641
+ GGML_ASSERT(false);
15642
+ } break;
15643
+ default:
15644
+ {
15645
+ printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
15646
+ GGML_ASSERT(false);
15647
+ } break;
15648
+ }
15649
+
15650
+ assert(n_tasks > 0);
16004
15651
 
16005
- node->perf_runs++;
16006
- node->perf_cycles += cycles_cur;
16007
- node->perf_time_us += time_us_cur;
15652
+ return n_tasks;
16008
15653
  }
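ggml_get_n_tasks only decides how many threads may work on a node; the kernels then carve their own row range out of the thread index. The conventional split given (ith, nth) looks like the sketch below (a sketch of the convention; the per-kernel details vary):

// Split nr rows over nth workers; worker ith gets the half-open range [ir0, ir1).
static void rows_for_thread(int nr, int ith, int nth, int * ir0, int * ir1) {
    const int dr = (nr + nth - 1) / nth;        // rows per thread, rounded up
    *ir0 = dr * ith;
    *ir1 = (*ir0 + dr < nr) ? *ir0 + dr : nr;   // clamp the last chunk
}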
16009
15654
 
16010
15655
  static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -16013,7 +15658,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16013
15658
  const struct ggml_cgraph * cgraph = state->shared->cgraph;
16014
15659
  const struct ggml_cplan * cplan = state->shared->cplan;
16015
15660
 
16016
- const int * n_tasks_arr = cplan->n_tasks;
16017
15661
  const int n_threads = state->shared->n_threads;
16018
15662
 
16019
15663
  set_numa_thread_affinity(state->ith, n_threads);
@@ -16038,9 +15682,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16038
15682
 
16039
15683
  if (node_n != -1) {
16040
15684
  /* FINALIZE */
16041
- struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
15685
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16042
15686
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16043
- params.nth = n_tasks_arr[node_n];
15687
+ params.nth = ggml_get_n_tasks(node, n_threads);
16044
15688
  ggml_compute_forward(&params, node);
16045
15689
  }
16046
15690
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -16051,7 +15695,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16051
15695
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16052
15696
 
16053
15697
  struct ggml_tensor * node = cgraph->nodes[node_n];
16054
- const int n_tasks = n_tasks_arr[node_n];
15698
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16055
15699
 
16056
15700
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16057
15701
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -16109,7 +15753,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16109
15753
 
16110
15754
  /* COMPUTE */
16111
15755
  struct ggml_tensor * node = cgraph->nodes[node_n];
16112
- const int n_tasks = n_tasks_arr[node_n];
15756
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16113
15757
 
16114
15758
  struct ggml_compute_params params = {
16115
15759
  /*.type =*/ GGML_TASK_COMPUTE,
@@ -16143,121 +15787,46 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16143
15787
 
16144
15788
  struct ggml_tensor * node = cgraph->nodes[i];
16145
15789
 
15790
+ size_t cur = 0;
15791
+
16146
15792
  switch (node->op) {
16147
15793
  case GGML_OP_CPY:
16148
15794
  case GGML_OP_DUP:
16149
15795
  {
16150
15796
  n_tasks = n_threads;
16151
15797
 
16152
- size_t cur = 0;
16153
15798
  if (ggml_is_quantized(node->type)) {
16154
15799
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16155
15800
  }
16156
-
16157
- work_size = MAX(work_size, cur);
16158
15801
  } break;
16159
15802
  case GGML_OP_ADD:
16160
15803
  case GGML_OP_ADD1:
16161
15804
  {
16162
15805
  n_tasks = n_threads;
16163
15806
 
16164
- size_t cur = 0;
16165
-
16166
15807
  if (ggml_is_quantized(node->src[0]->type)) {
16167
15808
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16168
15809
  }
16169
-
16170
- work_size = MAX(work_size, cur);
16171
15810
  } break;
16172
15811
  case GGML_OP_ACC:
16173
15812
  {
16174
15813
  n_tasks = n_threads;
16175
15814
 
16176
- size_t cur = 0;
16177
-
16178
15815
  if (ggml_is_quantized(node->src[0]->type)) {
16179
15816
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16180
15817
  }
16181
-
16182
- work_size = MAX(work_size, cur);
16183
- } break;
16184
- case GGML_OP_SUB:
16185
- case GGML_OP_DIV:
16186
- case GGML_OP_SQR:
16187
- case GGML_OP_SQRT:
16188
- case GGML_OP_LOG:
16189
- case GGML_OP_SUM:
16190
- case GGML_OP_SUM_ROWS:
16191
- case GGML_OP_MEAN:
16192
- case GGML_OP_ARGMAX:
16193
- case GGML_OP_REPEAT:
16194
- case GGML_OP_REPEAT_BACK:
16195
- {
16196
- n_tasks = 1;
16197
- } break;
16198
-
16199
- case GGML_OP_UNARY:
16200
- {
16201
- switch (ggml_get_unary_op(node)) {
16202
- case GGML_UNARY_OP_ABS:
16203
- case GGML_UNARY_OP_SGN:
16204
- case GGML_UNARY_OP_NEG:
16205
- case GGML_UNARY_OP_STEP:
16206
- case GGML_UNARY_OP_TANH:
16207
- case GGML_UNARY_OP_ELU:
16208
- case GGML_UNARY_OP_RELU:
16209
- {
16210
- n_tasks = 1;
16211
- } break;
16212
-
16213
- case GGML_UNARY_OP_GELU:
16214
- case GGML_UNARY_OP_GELU_QUICK:
16215
- case GGML_UNARY_OP_SILU:
16216
- {
16217
- n_tasks = n_threads;
16218
- } break;
16219
- }
16220
15818
  } break;
16221
- case GGML_OP_SILU_BACK:
16222
- case GGML_OP_MUL:
16223
- case GGML_OP_NORM:
16224
- case GGML_OP_RMS_NORM:
16225
- case GGML_OP_RMS_NORM_BACK:
16226
- case GGML_OP_GROUP_NORM:
16227
- {
16228
- n_tasks = n_threads;
16229
- } break;
16230
- case GGML_OP_CONCAT:
16231
15819
  case GGML_OP_MUL_MAT:
16232
15820
  {
16233
- n_tasks = n_threads;
16234
-
16235
- // TODO: use different scheduling for different matrix sizes
16236
- //const int nr0 = ggml_nrows(node->src[0]);
16237
- //const int nr1 = ggml_nrows(node->src[1]);
16238
-
16239
- //n_tasks = MIN(n_threads, MAX(1, nr0/128));
16240
- //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
16241
-
16242
- size_t cur = 0;
16243
15821
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
16244
15822
 
16245
- #if defined(GGML_USE_CUBLAS)
16246
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
16247
- n_tasks = 1; // TODO: this actually is doing nothing
16248
- // the threads are still spinning
16249
- } else
16250
- #elif defined(GGML_USE_CLBLAST)
15823
+ #if defined(GGML_USE_CLBLAST)
16251
15824
  if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
16252
- n_tasks = 1; // TODO: this actually is doing nothing
16253
- // the threads are still spinning
16254
15825
  cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
16255
15826
  } else
16256
15827
  #endif
16257
15828
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16258
15829
  if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16259
- n_tasks = 1; // TODO: this actually is doing nothing
16260
- // the threads are still spinning
16261
15830
  if (node->src[0]->type != GGML_TYPE_F32) {
16262
15831
  // here we need memory just for single 2D matrix from src0
16263
15832
  cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -16266,108 +15835,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16266
15835
  #endif
16267
15836
  if (node->src[1]->type != vec_dot_type) {
16268
15837
  cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16269
- } else {
16270
- cur = 0;
16271
15838
  }
16272
-
16273
- work_size = MAX(work_size, cur);
16274
15839
  } break;
16275
15840
  case GGML_OP_OUT_PROD:
16276
15841
  {
16277
15842
  n_tasks = n_threads;
16278
15843
 
16279
- size_t cur = 0;
16280
-
16281
15844
  if (ggml_is_quantized(node->src[0]->type)) {
16282
15845
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16283
15846
  }
16284
-
16285
- work_size = MAX(work_size, cur);
16286
- } break;
16287
- case GGML_OP_SCALE:
16288
- {
16289
- n_tasks = 1;
16290
- } break;
16291
- case GGML_OP_SET:
16292
- case GGML_OP_CONT:
16293
- case GGML_OP_RESHAPE:
16294
- case GGML_OP_VIEW:
16295
- case GGML_OP_PERMUTE:
16296
- case GGML_OP_TRANSPOSE:
16297
- case GGML_OP_GET_ROWS:
16298
- case GGML_OP_GET_ROWS_BACK:
16299
- case GGML_OP_DIAG:
16300
- {
16301
- n_tasks = 1;
16302
- } break;
16303
- case GGML_OP_DIAG_MASK_ZERO:
16304
- case GGML_OP_DIAG_MASK_INF:
16305
- case GGML_OP_SOFT_MAX:
16306
- case GGML_OP_SOFT_MAX_BACK:
16307
- case GGML_OP_ROPE:
16308
- case GGML_OP_ROPE_BACK:
16309
- case GGML_OP_ADD_REL_POS:
16310
- {
16311
- n_tasks = n_threads;
16312
- } break;
16313
- case GGML_OP_ALIBI:
16314
- {
16315
- n_tasks = 1; //TODO
16316
- } break;
16317
- case GGML_OP_CLAMP:
16318
- {
16319
- n_tasks = 1; //TODO
16320
- } break;
16321
- case GGML_OP_CONV_1D:
16322
- {
16323
- n_tasks = n_threads;
16324
-
16325
- GGML_ASSERT(node->src[0]->ne[3] == 1);
16326
- GGML_ASSERT(node->src[1]->ne[2] == 1);
16327
- GGML_ASSERT(node->src[1]->ne[3] == 1);
16328
-
16329
- const int64_t ne00 = node->src[0]->ne[0];
16330
- const int64_t ne01 = node->src[0]->ne[1];
16331
- const int64_t ne02 = node->src[0]->ne[2];
16332
-
16333
- const int64_t ne10 = node->src[1]->ne[0];
16334
- const int64_t ne11 = node->src[1]->ne[1];
16335
-
16336
- const int64_t ne0 = node->ne[0];
16337
- const int64_t ne1 = node->ne[1];
16338
- const int64_t nk = ne00;
16339
- const int64_t ew0 = nk * ne01;
16340
-
16341
- UNUSED(ne02);
16342
- UNUSED(ne10);
16343
- UNUSED(ne11);
16344
-
16345
- size_t cur = 0;
16346
-
16347
- if (node->src[0]->type == GGML_TYPE_F16 &&
16348
- node->src[1]->type == GGML_TYPE_F32) {
16349
- cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
16350
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16351
- node->src[1]->type == GGML_TYPE_F32) {
16352
- cur = sizeof(float)*(ne0*ne1*ew0);
16353
- } else {
16354
- GGML_ASSERT(false);
16355
- }
16356
-
16357
- work_size = MAX(work_size, cur);
16358
- } break;
16359
- case GGML_OP_CONV_1D_STAGE_0:
16360
- {
16361
- n_tasks = n_threads;
16362
- } break;
16363
- case GGML_OP_CONV_1D_STAGE_1:
16364
- {
16365
- n_tasks = n_threads;
16366
15847
  } break;
16367
15848
  case GGML_OP_CONV_TRANSPOSE_1D:
16368
15849
  {
16369
- n_tasks = n_threads;
16370
-
16371
15850
  GGML_ASSERT(node->src[0]->ne[3] == 1);
16372
15851
  GGML_ASSERT(node->src[1]->ne[2] == 1);
16373
15852
  GGML_ASSERT(node->src[1]->ne[3] == 1);
@@ -16379,7 +15858,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16379
15858
  const int64_t ne10 = node->src[1]->ne[0]; // L
16380
15859
  const int64_t ne11 = node->src[1]->ne[1]; // Cin
16381
15860
 
16382
- size_t cur = 0;
16383
15861
  if (node->src[0]->type == GGML_TYPE_F16 &&
16384
15862
  node->src[1]->type == GGML_TYPE_F32) {
16385
15863
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
@@ -16391,59 +15869,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16391
15869
  } else {
16392
15870
  GGML_ASSERT(false);
16393
15871
  }
16394
-
16395
- work_size = MAX(work_size, cur);
16396
- } break;
16397
- case GGML_OP_CONV_2D:
16398
- {
16399
- n_tasks = n_threads;
16400
-
16401
- const int64_t ne00 = node->src[0]->ne[0]; // W
16402
- const int64_t ne01 = node->src[0]->ne[1]; // H
16403
- const int64_t ne02 = node->src[0]->ne[2]; // C
16404
- const int64_t ne03 = node->src[0]->ne[3]; // N
16405
-
16406
- const int64_t ne10 = node->src[1]->ne[0]; // W
16407
- const int64_t ne11 = node->src[1]->ne[1]; // H
16408
- const int64_t ne12 = node->src[1]->ne[2]; // C
16409
-
16410
- const int64_t ne0 = node->ne[0];
16411
- const int64_t ne1 = node->ne[1];
16412
- const int64_t ne2 = node->ne[2];
16413
- const int64_t ne3 = node->ne[3];
16414
- const int64_t nk = ne00*ne01;
16415
- const int64_t ew0 = nk * ne02;
16416
-
16417
- UNUSED(ne03);
16418
- UNUSED(ne2);
16419
-
16420
- size_t cur = 0;
16421
-
16422
- if (node->src[0]->type == GGML_TYPE_F16 &&
16423
- node->src[1]->type == GGML_TYPE_F32) {
16424
- // im2col: [N*OH*OW, IC*KH*KW]
16425
- cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
16426
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16427
- node->src[1]->type == GGML_TYPE_F32) {
16428
- cur = sizeof(float)* (ne10*ne11*ne12);
16429
- } else {
16430
- GGML_ASSERT(false);
16431
- }
16432
-
16433
- work_size = MAX(work_size, cur);
16434
- } break;
16435
- case GGML_OP_CONV_2D_STAGE_0:
16436
- {
16437
- n_tasks = n_threads;
16438
15872
  } break;
16439
- case GGML_OP_CONV_2D_STAGE_1:
15873
+ case GGML_OP_IM2COL:
16440
15874
  {
16441
15875
  n_tasks = n_threads;
16442
15876
  } break;
16443
15877
  case GGML_OP_CONV_TRANSPOSE_2D:
16444
15878
  {
16445
- n_tasks = n_threads;
16446
-
16447
15879
  const int64_t ne00 = node->src[0]->ne[0]; // W
16448
15880
  const int64_t ne01 = node->src[0]->ne[1]; // H
16449
15881
  const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
@@ -16453,141 +15885,66 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16453
15885
  const int64_t ne11 = node->src[1]->ne[1]; // H
16454
15886
  const int64_t ne12 = node->src[1]->ne[2]; // Channels In
16455
15887
 
16456
- size_t cur = 0;
16457
15888
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
16458
15889
  cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
16459
-
16460
- work_size = MAX(work_size, cur);
16461
- } break;
16462
- case GGML_OP_POOL_1D:
16463
- case GGML_OP_POOL_2D:
16464
- {
16465
- n_tasks = 1;
16466
- } break;
16467
- case GGML_OP_UPSCALE:
16468
- {
16469
- n_tasks = n_threads;
16470
15890
  } break;
16471
15891
  case GGML_OP_FLASH_ATTN:
16472
15892
  {
16473
15893
  n_tasks = n_threads;
16474
15894
 
16475
- size_t cur = 0;
16476
-
16477
15895
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16478
15896
 
16479
15897
  if (node->src[1]->type == GGML_TYPE_F32) {
16480
15898
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16481
15899
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16482
- }
16483
-
16484
- if (node->src[1]->type == GGML_TYPE_F16) {
15900
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16485
15901
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16486
15902
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16487
15903
  }
16488
-
16489
- work_size = MAX(work_size, cur);
16490
15904
  } break;
16491
15905
  case GGML_OP_FLASH_FF:
16492
15906
  {
16493
15907
  n_tasks = n_threads;
16494
15908
 
16495
- size_t cur = 0;
16496
-
16497
15909
  if (node->src[1]->type == GGML_TYPE_F32) {
16498
15910
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16499
15911
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16500
- }
16501
-
16502
- if (node->src[1]->type == GGML_TYPE_F16) {
15912
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16503
15913
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16504
15914
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16505
15915
  }
16506
-
16507
- work_size = MAX(work_size, cur);
16508
15916
  } break;
16509
15917
  case GGML_OP_FLASH_ATTN_BACK:
16510
15918
  {
16511
15919
  n_tasks = n_threads;
16512
15920
 
16513
- size_t cur = 0;
16514
-
16515
15921
  const int64_t D = node->src[0]->ne[0];
16516
15922
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16517
15923
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
16518
15924
  if (node->src[1]->type == GGML_TYPE_F32) {
16519
15925
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16520
15926
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16521
- }
16522
-
16523
- if (node->src[1]->type == GGML_TYPE_F16) {
15927
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16524
15928
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16525
15929
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16526
15930
  }
16527
-
16528
- work_size = MAX(work_size, cur);
16529
- } break;
16530
- case GGML_OP_WIN_PART:
16531
- case GGML_OP_WIN_UNPART:
16532
- case GGML_OP_GET_REL_POS:
16533
- case GGML_OP_MAP_UNARY:
16534
- case GGML_OP_MAP_BINARY:
16535
- case GGML_OP_MAP_CUSTOM1_F32:
16536
- case GGML_OP_MAP_CUSTOM2_F32:
16537
- case GGML_OP_MAP_CUSTOM3_F32:
16538
- {
16539
- n_tasks = 1;
16540
- } break;
16541
- case GGML_OP_MAP_CUSTOM1:
16542
- {
16543
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
16544
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16545
- n_tasks = n_threads;
16546
- } else {
16547
- n_tasks = MIN(p->n_tasks, n_threads);
16548
- }
16549
- } break;
16550
- case GGML_OP_MAP_CUSTOM2:
16551
- {
16552
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
16553
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16554
- n_tasks = n_threads;
16555
- } else {
16556
- n_tasks = MIN(p->n_tasks, n_threads);
16557
- }
16558
- } break;
16559
- case GGML_OP_MAP_CUSTOM3:
16560
- {
16561
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
16562
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16563
- n_tasks = n_threads;
16564
- } else {
16565
- n_tasks = MIN(p->n_tasks, n_threads);
16566
- }
16567
15931
  } break;
15932
+
16568
15933
  case GGML_OP_CROSS_ENTROPY_LOSS:
16569
15934
  {
16570
15935
  n_tasks = n_threads;
16571
15936
 
16572
- size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16573
-
16574
- work_size = MAX(work_size, cur);
16575
- } break;
16576
- case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16577
- {
16578
- n_tasks = n_threads;
16579
- } break;
16580
- case GGML_OP_NONE:
16581
- {
16582
- n_tasks = 1;
15937
+ cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16583
15938
  } break;
16584
15939
  case GGML_OP_COUNT:
16585
15940
  {
16586
15941
  GGML_ASSERT(false);
16587
15942
  } break;
15943
+ default:
15944
+ break;
16588
15945
  }
16589
15946
 
16590
- cplan.n_tasks[i] = n_tasks;
15947
+ work_size = MAX(work_size, cur);
16591
15948
  }
16592
15949
 
16593
15950
  if (work_size > 0) {
@@ -16609,12 +15966,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16609
15966
  if (cplan->work_size > 0) {
16610
15967
  GGML_ASSERT(cplan->work_data);
16611
15968
  }
16612
-
16613
- for (int i = 0; i < cgraph->n_nodes; ++i) {
16614
- if (cgraph->nodes[i]->op != GGML_OP_NONE) {
16615
- GGML_ASSERT(cplan->n_tasks[i] > 0);
16616
- }
16617
- }
16618
15969
  }
16619
15970
 
16620
15971
  const int n_threads = cplan->n_threads;
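With the per-node n_tasks array gone, a plan only carries the thread count and the scratch-buffer size, and the caller's job reduces to attaching work_size bytes before running the graph. A minimal usage sketch (assumes ggml.h is included and the graph was already built; error handling omitted):

#include <stdlib.h>

// Run a prepared cgraph with n_threads, allocating the scratch buffer the plan asks for.
static int run_graph(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

    void * work = NULL;
    if (plan.work_size > 0) {
        work = malloc(plan.work_size); // must stay alive for the whole compute call
        plan.work_data = work;         // the GGML_ASSERT(cplan->work_data) above checks this
    }

    const int status = ggml_graph_compute(graph, &plan);
    free(work);
    return status;
}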
@@ -16687,16 +16038,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16687
16038
  return compute_status;
16688
16039
  }
16689
16040
 
16690
- void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16691
- for (int i = 0; i < cgraph->n_nodes; i++) {
16692
- struct ggml_tensor * grad = cgraph->grads[i];
16693
-
16694
- if (grad) {
16695
- ggml_set_zero(grad);
16696
- }
16697
- }
16698
- }
16699
-
16700
16041
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16701
16042
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16702
16043
 
@@ -16823,12 +16164,12 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16823
16164
  const uint32_t magic = GGML_FILE_MAGIC;
16824
16165
  const uint32_t version = GGML_FILE_VERSION;
16825
16166
  const uint32_t n_leafs = cgraph->n_leafs;
16826
- const uint32_t nodes = cgraph->n_nodes;
16167
+ const uint32_t n_nodes = cgraph->n_nodes;
16827
16168
 
16828
16169
  fwrite(&magic, sizeof(uint32_t), 1, fout);
16829
16170
  fwrite(&version, sizeof(uint32_t), 1, fout);
16830
16171
  fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
16831
- fwrite(&nodes, sizeof(uint32_t), 1, fout);
16172
+ fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
16832
16173
  fwrite(&size_eval, sizeof(uint64_t), 1, fout);
16833
16174
  }
16834
16175
 
@@ -16916,7 +16257,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16916
16257
  if (idx == -1) {
16917
16258
  for (int k = 0; k < cgraph->n_nodes; ++k) {
16918
16259
  if (args[j] == cgraph->nodes[k]) {
16919
- idx = GGML_MAX_NODES + k;
16260
+ idx = cgraph->n_leafs + k;
16920
16261
  break;
16921
16262
  }
16922
16263
  }
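Exported tensor arguments are referenced by a single integer over the concatenated [leafs..., nodes...] space: leaves keep their own index and nodes are offset by n_leafs, replacing the old fixed GGML_MAX_NODES split point. Encoding and decoding such a reference is just an offset (illustrative helpers):

// Encode a reference into the concatenated [leafs..., nodes...] index space.
static int encode_ref(int n_leafs, int is_node, int idx) {
    return is_node ? n_leafs + idx : idx;
}

// Decode back into (is_node, idx); mirrors the arg_idx < n_leafs test on import.
static void decode_ref(int n_leafs, int ref, int * is_node, int * idx) {
    if (ref < n_leafs) { *is_node = 0; *idx = ref; }
    else               { *is_node = 1; *idx = ref - n_leafs; }
}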
@@ -16943,11 +16284,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16943
16284
  }
16944
16285
  }
16945
16286
 
16946
- struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16287
+ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16947
16288
  assert(*ctx_data == NULL);
16948
16289
  assert(*ctx_eval == NULL);
16949
16290
 
16950
- struct ggml_cgraph result = { 0 };
16291
+ struct ggml_cgraph * result = NULL;
16951
16292
 
16952
16293
  struct ggml_tensor * data = NULL;
16953
16294
 
@@ -17019,13 +16360,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17019
16360
  const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
17020
16361
  const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
17021
16362
  const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
17022
-
17023
- result.n_leafs = n_leafs;
17024
- result.n_nodes = n_nodes;
16363
+ const int graph_size = MAX(n_leafs, n_nodes);
17025
16364
 
17026
16365
  // create the data context
17027
16366
  {
17028
- const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
16367
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
17029
16368
 
17030
16369
  struct ggml_init_params params = {
17031
16370
  .mem_size = size_eval + overhead,
@@ -17041,6 +16380,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17041
16380
  }
17042
16381
  }
17043
16382
 
16383
+ result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
16384
+
16385
+ result->n_leafs = n_leafs;
16386
+ result->n_nodes = n_nodes;
16387
+
16388
+
17044
16389
  // leafs
17045
16390
  {
17046
16391
  uint32_t type;
@@ -17079,7 +16424,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17079
16424
  tensor->nb[j] = nb[j];
17080
16425
  }
17081
16426
 
17082
- result.leafs[i] = tensor;
16427
+ result->leafs[i] = tensor;
17083
16428
 
17084
16429
  ptr += ggml_nbytes(tensor);
17085
16430
 
@@ -17131,10 +16476,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17131
16476
  continue;
17132
16477
  }
17133
16478
 
17134
- if (arg_idx < GGML_MAX_NODES) {
17135
- args[j] = result.leafs[arg_idx];
16479
+ if (arg_idx < result->n_leafs) {
16480
+ args[j] = result->leafs[arg_idx];
17136
16481
  } else {
17137
- args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
16482
+ args[j] = result->nodes[arg_idx - result->n_leafs];
17138
16483
  }
17139
16484
  }
17140
16485
 
@@ -17186,7 +16531,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17186
16531
  tensor->src[j] = args[j];
17187
16532
  }
17188
16533
 
17189
- result.nodes[i] = tensor;
16534
+ result->nodes[i] = tensor;
17190
16535
 
17191
16536
  fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17192
16537
  }
@@ -18091,10 +17436,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18091
17436
  case GGML_OPT_ADAM:
18092
17437
  {
18093
17438
  result = (struct ggml_opt_params) {
18094
- .type = GGML_OPT_ADAM,
18095
- .n_threads = 1,
18096
- .past = 0,
18097
- .delta = 1e-5f,
17439
+ .type = GGML_OPT_ADAM,
17440
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17441
+ .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
17442
+ .past = 0,
17443
+ .delta = 1e-5f,
18098
17444
 
18099
17445
  .max_no_improvement = 100,
18100
17446
 
@@ -18121,10 +17467,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18121
17467
  case GGML_OPT_LBFGS:
18122
17468
  {
18123
17469
  result = (struct ggml_opt_params) {
18124
- .type = GGML_OPT_LBFGS,
18125
- .n_threads = 1,
18126
- .past = 0,
18127
- .delta = 1e-5f,
17470
+ .type = GGML_OPT_LBFGS,
17471
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17472
+ .n_threads = 1,
17473
+ .past = 0,
17474
+ .delta = 1e-5f,
18128
17475
 
18129
17476
  .max_no_improvement = 0,
18130
17477
 
@@ -18266,14 +17613,11 @@ enum ggml_opt_result ggml_opt_resume(
18266
17613
  struct ggml_tensor * f) {
18267
17614
 
18268
17615
  // build forward + backward compute graphs
18269
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18270
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18271
-
18272
- struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
18273
- struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
17616
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
17617
+ ggml_build_forward_expand(gf, f);
18274
17618
 
18275
- *gf = ggml_build_forward (f);
18276
- *gb = ggml_build_backward(ctx, gf, true);
17619
+ struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
17620
+ ggml_build_backward_expand(ctx, gf, gb, true);
18277
17621
 
18278
17622
  return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
18279
17623
  }
@@ -18729,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18729
18073
  {
18730
18074
  ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
18731
18075
 
18732
- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
18076
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
18733
18077
  struct gguf_kv * kv = &ctx->kv[i];
18734
18078
 
18735
18079
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
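The kv and tensor loops now use uint64_t counters to match the 64-bit counts read from the file header, so an entry count above UINT32_MAX no longer wraps the loop index. The same pattern applies to any 64-bit on-disk count, shown here on a made-up record format (hypothetical layout, not GGUF):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Read a uint64 record count and visit that many fixed-size records.
// A 32-bit loop variable here would silently wrap for very large counts.
static bool read_records(FILE * f) {
    uint64_t n = 0;
    if (fread(&n, sizeof(n), 1, f) != 1) return false;
    for (uint64_t i = 0; i < n; ++i) {
        float record[4];
        if (fread(record, sizeof(record), 1, f) != 1) return false;
        // ... process record ...
    }
    return true;
}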
@@ -18776,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18776
18120
  case GGUF_TYPE_STRING:
18777
18121
  {
18778
18122
  kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
18779
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
18123
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
18780
18124
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
18781
18125
  }
18782
18126
  } break;
@@ -18804,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18804
18148
  {
18805
18149
  ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
18806
18150
 
18807
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18151
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18808
18152
  struct gguf_tensor_info * info = &ctx->infos[i];
18809
18153
 
18810
18154
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18851,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18851
18195
  // compute the total size of the data section, taking into account the alignment
18852
18196
  {
18853
18197
  ctx->size = 0;
18854
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18198
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18855
18199
  struct gguf_tensor_info * info = &ctx->infos[i];
18856
18200
 
18857
18201
  const int64_t ne =
@@ -18920,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18920
18264
  ggml_set_no_alloc(ctx_data, true);
18921
18265
 
18922
18266
  // create the tensors
18923
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18267
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18924
18268
  const int64_t ne[GGML_MAX_DIMS] = {
18925
18269
  ctx->infos[i].ne[0],
18926
18270
  ctx->infos[i].ne[1],