llama_cpp 0.9.2 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -100,6 +100,49 @@ typedef void * thread_ret_t;
100
100
  #include <hbwmalloc.h>
101
101
  #endif
102
102
 
103
+ #if defined(__APPLE__)
104
+ #include <TargetConditionals.h>
105
+ #endif
106
+
107
+ #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
108
+ (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
109
+
110
+ #include <sys/wait.h>
111
+
112
// Print a backtrace of the calling process by attaching gdb to it.
//
// backtrace_symbols() does not show line numbers, so instead of <execinfo.h> a child
// process is forked that runs `gdb --batch`, attaches to the parent (this process),
// prints `bt -frame-info source-and-location`, detaches and quits.
// The parent blocks in waitpid() until that child has terminated.
//
// Best effort only: if fork() fails or gdb cannot be executed, nothing is printed.
void ggml_print_backtrace(void) {
    // "attach " + a decimal pid always fits into 32 bytes
    char attach[32];
    snprintf(attach, sizeof(attach), "attach %d", (int) getpid());

    const int pid = fork();
    if (pid < 0) {
        // fork failed: do not call waitpid(-1, ...), it would wait for an arbitrary child
        return;
    }

    if (pid == 0) {
        // child: replace the process image with gdb
        execlp("gdb", "gdb", "--batch",
            "-ex", "set style enabled on",
            "-ex", attach,
            "-ex", "bt -frame-info source-and-location",
            "-ex", "detach",
            "-ex", "quit",
            (char *) NULL); // the variadic sentinel must be a null *pointer*

        // execlp only returns on failure (e.g. gdb is not installed).
        // Terminate the child here - otherwise it would return to the caller
        // and continue running as a duplicate of the parent process.
        _exit(127);
    }

    // parent: wait until gdb is done before continuing
    waitpid(pid, NULL, 0);
}
140
+ #else
141
+ void ggml_print_backtrace(void) {
142
+ // platform not supported
143
+ }
144
+ #endif
145
+
103
146
  /*#define GGML_PERF*/
104
147
  #define GGML_DEBUG 0
105
148
  #define GGML_GELU_FP16
@@ -228,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
228
271
  // floating point type used to accumulate sums
229
272
  typedef double ggml_float;
230
273
 
274
+ #undef MIN
275
+ #undef MAX
276
+
277
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
278
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
279
+
231
280
  //
232
281
  // global data
233
282
  //
@@ -561,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
561
610
  // simd mappings
562
611
  //
563
612
 
613
+ #if defined(__ARM_NEON)
614
+ #if !defined(__aarch64__)
615
+
616
+ // 64-bit compatibility
617
+
618
// Fallback for targets where __aarch64__ is not defined (see the enclosing #if):
// returns the sum of the 4 float lanes of v, accumulated in lane order 0, 1, 2, 3.
inline static float vaddvq_f32(float32x4_t v) {
    float sum = vgetq_lane_f32(v, 0);

    sum += vgetq_lane_f32(v, 1);
    sum += vgetq_lane_f32(v, 2);
    sum += vgetq_lane_f32(v, 3);

    return sum;
}
621
+
622
+ #endif
623
+ #endif
624
+
564
625
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
565
626
  // we then implement the fundamental computation operations below using only these macros
566
627
  // adding support for new architectures requires to define the corresponding SIMD macros
@@ -1352,6 +1413,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1352
1413
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1353
1414
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1354
1415
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1416
// Element-wise leaky ReLU with a fixed negative slope of 0.1:
//   y[i] = x[i]        if x[i] > 0
//   y[i] = 0.1f*x[i]   otherwise
// Processes the first n elements; does nothing for n <= 0.
inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];

        if (v > 0.f) {
            y[i] = v;
        } else {
            y[i] = 0.1f*v;
        }
    }
}
1355
1417
 
1356
1418
  static const float GELU_COEF_A = 0.044715f;
1357
1419
  static const float GELU_QUICK_COEF = -1.702f;
@@ -1572,13 +1634,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1572
1634
  "ROPE_BACK",
1573
1635
  "ALIBI",
1574
1636
  "CLAMP",
1575
- "CONV_1D",
1576
- "CONV_1D_STAGE_0",
1577
- "CONV_1D_STAGE_1",
1578
1637
  "CONV_TRANSPOSE_1D",
1579
- "CONV_2D",
1580
- "CONV_2D_STAGE_0",
1581
- "CONV_2D_STAGE_1",
1638
+ "IM2COL",
1582
1639
  "CONV_TRANSPOSE_2D",
1583
1640
  "POOL_1D",
1584
1641
  "POOL_2D",
@@ -1609,7 +1666,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1609
1666
  "CROSS_ENTROPY_LOSS_BACK",
1610
1667
  };
1611
1668
 
1612
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1669
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1613
1670
 
1614
1671
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1615
1672
  "none",
@@ -1659,13 +1716,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1659
1716
  "rope_back(x)",
1660
1717
  "alibi(x)",
1661
1718
  "clamp(x)",
1662
- "conv_1d(x)",
1663
- "conv_1d_stage_0(x)",
1664
- "conv_1d_stage_1(x)",
1665
1719
  "conv_transpose_1d(x)",
1666
- "conv_2d(x)",
1667
- "conv_2d_stage_0(x)",
1668
- "conv_2d_stage_1(x)",
1720
+ "im2col(x)",
1669
1721
  "conv_transpose_2d(x)",
1670
1722
  "pool_1d(x)",
1671
1723
  "pool_2d(x)",
@@ -1696,7 +1748,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1696
1748
  "cross_entropy_loss_back(x,y)",
1697
1749
  };
1698
1750
 
1699
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1751
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1700
1752
 
1701
1753
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1702
1754
 
@@ -1724,13 +1776,7 @@ static void ggml_setup_op_has_task_pass(void) {
1724
1776
  p[GGML_OP_GET_ROWS_BACK ] = true;
1725
1777
  p[GGML_OP_DIAG_MASK_INF ] = true;
1726
1778
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
1727
- p[GGML_OP_CONV_1D ] = true;
1728
- p[GGML_OP_CONV_1D_STAGE_0 ] = true;
1729
- p[GGML_OP_CONV_1D_STAGE_1 ] = true;
1730
1779
  p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
1731
- p[GGML_OP_CONV_2D ] = true;
1732
- p[GGML_OP_CONV_2D_STAGE_0 ] = true;
1733
- p[GGML_OP_CONV_2D_STAGE_1 ] = true;
1734
1780
  p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
1735
1781
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
1736
1782
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
@@ -3769,6 +3815,14 @@ struct ggml_tensor * ggml_relu_inplace(
3769
3815
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3770
3816
  }
3771
3817
 
3818
+ // ggml_leaky
3819
+
3820
+ struct ggml_tensor * ggml_leaky(
3821
+ struct ggml_context * ctx,
3822
+ struct ggml_tensor * a) {
3823
+ return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3824
+ }
3825
+
3772
3826
  // ggml_gelu
3773
3827
 
3774
3828
  struct ggml_tensor * ggml_gelu(
@@ -5076,82 +5130,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
5076
5130
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
5077
5131
  }
5078
5132
 
5079
- // im2col: [N, IC, IL] => [N, OL, IC*K]
5080
- // a: [OC,IC, K]
5081
- // b: [N, IC, IL]
5082
- // result: [N, OL, IC*K]
5083
- static struct ggml_tensor * ggml_conv_1d_stage_0(
5084
- struct ggml_context * ctx,
5085
- struct ggml_tensor * a,
5086
- struct ggml_tensor * b,
5087
- int s0,
5088
- int p0,
5089
- int d0) {
5090
- GGML_ASSERT(a->ne[1] == b->ne[1]);
5091
- bool is_node = false;
5092
-
5093
- if (a->grad || b->grad) {
5094
- GGML_ASSERT(false); // TODO: implement backward
5095
- is_node = true;
5096
- }
5097
-
5098
- const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5099
-
5100
- const int64_t ne[4] = {
5101
- a->ne[1] * a->ne[0],
5102
- OL,
5103
- b->ne[2],
5104
- 1,
5105
- };
5106
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5107
-
5108
- int32_t params[] = { s0, p0, d0 };
5109
- ggml_set_op_params(result, params, sizeof(params));
5110
-
5111
- result->op = GGML_OP_CONV_1D_STAGE_0;
5112
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5113
- result->src[0] = a;
5114
- result->src[1] = b;
5115
-
5116
- return result;
5117
- }
5118
-
5119
- // ggml_conv_1d_stage_1
5120
-
5121
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
5122
- // a: [OC, IC, K]
5123
- // b: [N, OL, IC * K]
5124
- // result: [N, OC, OL]
5125
- static struct ggml_tensor * ggml_conv_1d_stage_1(
5126
- struct ggml_context * ctx,
5127
- struct ggml_tensor * a,
5128
- struct ggml_tensor * b) {
5129
-
5130
- bool is_node = false;
5131
-
5132
- if (a->grad || b->grad) {
5133
- GGML_ASSERT(false); // TODO: implement backward
5134
- is_node = true;
5135
- }
5136
-
5137
- const int64_t ne[4] = {
5138
- b->ne[1],
5139
- a->ne[2],
5140
- b->ne[2],
5141
- 1,
5142
- };
5143
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5144
-
5145
- result->op = GGML_OP_CONV_1D_STAGE_1;
5146
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5147
- result->src[0] = a;
5148
- result->src[1] = b;
5149
-
5150
- return result;
5151
- }
5152
-
5153
- // ggml_conv_1d
5154
-
5155
5133
  GGML_API struct ggml_tensor * ggml_conv_1d(
5156
5134
  struct ggml_context * ctx,
5157
5135
  struct ggml_tensor * a,
@@ -5159,43 +5137,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5159
5137
  int s0,
5160
5138
  int p0,
5161
5139
  int d0) {
5162
- struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
5163
- result = ggml_conv_1d_stage_1(ctx, a, result);
5164
- return result;
5165
- }
5166
-
5167
- // GGML_API struct ggml_tensor * ggml_conv_1d(
5168
- // struct ggml_context * ctx,
5169
- // struct ggml_tensor * a,
5170
- // struct ggml_tensor * b,
5171
- // int s0,
5172
- // int p0,
5173
- // int d0) {
5174
- // GGML_ASSERT(ggml_is_matrix(b));
5175
- // GGML_ASSERT(a->ne[1] == b->ne[1]);
5176
- // bool is_node = false;
5177
-
5178
- // if (a->grad || b->grad) {
5179
- // GGML_ASSERT(false); // TODO: implement backward
5180
- // is_node = true;
5181
- // }
5140
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5182
5141
 
5183
- // const int64_t ne[4] = {
5184
- // ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
5185
- // a->ne[2], 1, 1,
5186
- // };
5187
- // struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5142
+ struct ggml_tensor * result =
5143
+ ggml_mul_mat(ctx,
5144
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
5145
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
5188
5146
 
5189
- // int32_t params[] = { s0, p0, d0 };
5190
- // ggml_set_op_params(result, params, sizeof(params));
5147
+ result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
5191
5148
 
5192
- // result->op = GGML_OP_CONV_1D;
5193
- // result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5194
- // result->src[0] = a;
5195
- // result->src[1] = b;
5196
-
5197
- // return result;
5198
- // }
5149
+ return result;
5150
+ }
5199
5151
 
5200
5152
  // ggml_conv_1d_ph
5201
5153
 
@@ -5258,7 +5210,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
5258
5210
  // a: [OC,IC, KH, KW]
5259
5211
  // b: [N, IC, IH, IW]
5260
5212
  // result: [N, OH, OW, IC*KH*KW]
5261
- static struct ggml_tensor * ggml_conv_2d_stage_0(
5213
+ struct ggml_tensor * ggml_im2col(
5262
5214
  struct ggml_context * ctx,
5263
5215
  struct ggml_tensor * a,
5264
5216
  struct ggml_tensor * b,
@@ -5267,9 +5219,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5267
5219
  int p0,
5268
5220
  int p1,
5269
5221
  int d0,
5270
- int d1) {
5222
+ int d1,
5223
+ bool is_2D) {
5271
5224
 
5272
- GGML_ASSERT(a->ne[2] == b->ne[2]);
5225
+ if(is_2D) {
5226
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
5227
+ } else {
5228
+ GGML_ASSERT(a->ne[1] == b->ne[1]);
5229
+ }
5273
5230
  bool is_node = false;
5274
5231
 
5275
5232
  if (a->grad || b->grad) {
@@ -5277,81 +5234,51 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5277
5234
  is_node = true;
5278
5235
  }
5279
5236
 
5280
- const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
5281
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5237
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
5238
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5282
5239
 
5283
5240
  const int64_t ne[4] = {
5284
- a->ne[2] * a->ne[1] * a->ne[0],
5241
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
5285
5242
  OW,
5286
- OH,
5287
- b->ne[3],
5243
+ is_2D ? OH : b->ne[2],
5244
+ is_2D ? b->ne[3] : 1,
5288
5245
  };
5289
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5290
5246
 
5291
- int32_t params[] = { s0, s1, p0, p1, d0, d1 };
5247
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5248
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5292
5249
  ggml_set_op_params(result, params, sizeof(params));
5293
5250
 
5294
- result->op = GGML_OP_CONV_2D_STAGE_0;
5295
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5296
- result->src[0] = a;
5297
- result->src[1] = b;
5298
-
5299
- return result;
5300
-
5301
- }
5302
-
5303
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
5304
- // a: [OC, IC, KH, KW]
5305
- // b: [N, OH, OW, IC * KH * KW]
5306
- // result: [N, OC, OH, OW]
5307
- static struct ggml_tensor * ggml_conv_2d_stage_1(
5308
- struct ggml_context * ctx,
5309
- struct ggml_tensor * a,
5310
- struct ggml_tensor * b) {
5311
-
5312
- bool is_node = false;
5313
-
5314
- if (a->grad || b->grad) {
5315
- GGML_ASSERT(false); // TODO: implement backward
5316
- is_node = true;
5317
- }
5318
-
5319
- const int64_t ne[4] = {
5320
- b->ne[1],
5321
- b->ne[2],
5322
- a->ne[3],
5323
- b->ne[3],
5324
- };
5325
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5326
-
5327
- result->op = GGML_OP_CONV_2D_STAGE_1;
5251
+ result->op = GGML_OP_IM2COL;
5328
5252
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5329
5253
  result->src[0] = a;
5330
5254
  result->src[1] = b;
5331
5255
 
5332
5256
  return result;
5333
-
5334
5257
  }
5335
5258
 
5336
5259
  // a: [OC,IC, KH, KW]
5337
5260
  // b: [N, IC, IH, IW]
5338
5261
  // result: [N, OC, OH, OW]
5339
5262
  struct ggml_tensor * ggml_conv_2d(
5340
- struct ggml_context * ctx,
5341
- struct ggml_tensor * a,
5342
- struct ggml_tensor * b,
5343
- int s0,
5344
- int s1,
5345
- int p0,
5346
- int p1,
5347
- int d0,
5348
- int d1) {
5263
+ struct ggml_context * ctx,
5264
+ struct ggml_tensor * a,
5265
+ struct ggml_tensor * b,
5266
+ int s0,
5267
+ int s1,
5268
+ int p0,
5269
+ int p1,
5270
+ int d0,
5271
+ int d1) {
5272
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5349
5273
 
5350
- struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
5351
- result = ggml_conv_2d_stage_1(ctx, a, result);
5274
+ struct ggml_tensor * result =
5275
+ ggml_mul_mat(ctx,
5276
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
5277
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
5352
5278
 
5353
- return result;
5279
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
5354
5280
 
5281
+ return result;
5355
5282
  }
5356
5283
 
5357
5284
  // ggml_conv_2d_sk_p0
@@ -5411,7 +5338,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
5411
5338
 
5412
5339
  // ggml_pool_*
5413
5340
 
5414
// Output size along one dimension of a pooling op:
//
//   (ins + 2*p - ks) / s + 1     (truncated toward zero when converted to int64_t)
//
//   ins - input size
//   ks  - kernel size
//   s   - stride (must be non-zero)
//   p   - padding, counted twice (presumably once per side); may be fractional
//
// The arithmetic is carried out in double instead of float: a float has only a
// 24-bit mantissa, so converting ins to float silently rounds sizes above 2^24
// and can yield an output size that is off by several elements.
static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
    const double padded = (double) ins + 2.0 * (double) p;

    return (int64_t) ((padded - ks) / s + 1);
}
5417
5344
 
@@ -5458,8 +5385,8 @@ struct ggml_tensor * ggml_pool_2d(
5458
5385
  int k1,
5459
5386
  int s0,
5460
5387
  int s1,
5461
- int p0,
5462
- int p1) {
5388
+ float p0,
5389
+ float p1) {
5463
5390
 
5464
5391
  bool is_node = false;
5465
5392
 
@@ -8921,6 +8848,48 @@ static void ggml_compute_forward_silu(
8921
8848
  }
8922
8849
  }
8923
8850
 
8851
+ // ggml_compute_forward_leaky
8852
+
8853
+ static void ggml_compute_forward_leaky_f32(
8854
+ const struct ggml_compute_params * params,
8855
+ const struct ggml_tensor * src0,
8856
+ struct ggml_tensor * dst) {
8857
+ assert(params->ith == 0);
8858
+ assert(ggml_are_same_shape(src0, dst));
8859
+
8860
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8861
+ return;
8862
+ }
8863
+
8864
+ const int n = ggml_nrows(src0);
8865
+ const int nc = src0->ne[0];
8866
+
8867
+ assert(dst->nb[0] == sizeof(float));
8868
+ assert(src0->nb[0] == sizeof(float));
8869
+
8870
+ for (int i = 0; i < n; i++) {
8871
+ ggml_vec_leaky_f32(nc,
8872
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
8873
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
8874
+ }
8875
+ }
8876
+
8877
+ static void ggml_compute_forward_leaky(
8878
+ const struct ggml_compute_params * params,
8879
+ const struct ggml_tensor * src0,
8880
+ struct ggml_tensor * dst) {
8881
+ switch (src0->type) {
8882
+ case GGML_TYPE_F32:
8883
+ {
8884
+ ggml_compute_forward_leaky_f32(params, src0, dst);
8885
+ } break;
8886
+ default:
8887
+ {
8888
+ GGML_ASSERT(false);
8889
+ } break;
8890
+ }
8891
+ }
8892
+
8924
8893
  // ggml_compute_forward_silu_back
8925
8894
 
8926
8895
  static void ggml_compute_forward_silu_back_f32(
@@ -9404,6 +9373,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9404
9373
  // TODO: find the optimal values for these
9405
9374
  if (ggml_is_contiguous(src0) &&
9406
9375
  ggml_is_contiguous(src1) &&
9376
+ src0->type == GGML_TYPE_F32 &&
9377
+ src1->type == GGML_TYPE_F32 &&
9407
9378
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
9408
9379
 
9409
9380
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9442,7 +9413,7 @@ static void ggml_compute_forward_mul_mat(
9442
9413
 
9443
9414
  // we don't support permuted src0 or src1
9444
9415
  GGML_ASSERT(nb00 == ggml_type_size(type));
9445
- GGML_ASSERT(nb10 == sizeof(float));
9416
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
9446
9417
 
9447
9418
  // dst cannot be transposed or permuted
9448
9419
  GGML_ASSERT(nb0 == sizeof(float));
@@ -11340,9 +11311,9 @@ static void ggml_compute_forward_rope_back(
11340
11311
  }
11341
11312
  }
11342
11313
 
11343
- // ggml_compute_forward_conv_1d
11314
+ // ggml_compute_forward_conv_transpose_1d
11344
11315
 
11345
- static void ggml_compute_forward_conv_1d_f16_f32(
11316
+ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11346
11317
  const struct ggml_compute_params * params,
11347
11318
  const struct ggml_tensor * src0,
11348
11319
  const struct ggml_tensor * src1,
@@ -11359,14 +11330,7 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11359
11330
  const int ith = params->ith;
11360
11331
  const int nth = params->nth;
11361
11332
 
11362
- const int nk = ne00;
11363
-
11364
- // size of the convolution row - the kernel size unrolled across all input channels
11365
- const int ew0 = nk*ne01;
11366
-
11367
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11368
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11369
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11333
+ const int nk = ne00*ne01*ne02;
11370
11334
 
11371
11335
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11372
11336
  GGML_ASSERT(nb10 == sizeof(float));
@@ -11374,23 +11338,37 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11374
11338
  if (params->type == GGML_TASK_INIT) {
11375
11339
  memset(params->wdata, 0, params->wsize);
11376
11340
 
11377
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11341
+ // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11342
+ {
11343
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11344
+
11345
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
11346
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11347
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11348
+ ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11349
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11350
+ dst_data[i00*ne02 + i02] = src[i00];
11351
+ }
11352
+ }
11353
+ }
11354
+ }
11378
11355
 
11379
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11380
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11356
+ // permute source data (src1) from (L x Cin) to (Cin x L)
11357
+ {
11358
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11381
11359
  ggml_fp16_t * dst_data = wdata;
11382
11360
 
11383
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11384
- for (int64_t ik = 0; ik < nk; ik++) {
11385
- const int idx0 = i0*s0 + ik*d0 - p0;
11386
-
11387
- if(!(idx0 < 0 || idx0 >= ne10)) {
11388
- dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
11389
- }
11361
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
11362
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
11363
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
11364
+ dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
11390
11365
  }
11391
11366
  }
11392
11367
  }
11393
11368
 
11369
+ // need to zero dst since we are accumulating into it
11370
+ memset(dst->data, 0, ggml_nbytes(dst));
11371
+
11394
11372
  return;
11395
11373
  }
11396
11374
 
@@ -11398,8 +11376,10 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11398
11376
  return;
11399
11377
  }
11400
11378
 
11379
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11380
+
11401
11381
  // total rows in dst
11402
- const int nr = ne2;
11382
+ const int nr = ne1;
11403
11383
 
11404
11384
  // rows per thread
11405
11385
  const int dr = (nr + nth - 1)/nth;
@@ -11408,22 +11388,26 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11408
11388
  const int ir0 = dr*ith;
11409
11389
  const int ir1 = MIN(ir0 + dr, nr);
11410
11390
 
11411
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11412
-
11413
- for (int i2 = 0; i2 < ne2; i2++) {
11414
- for (int i1 = ir0; i1 < ir1; i1++) {
11415
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11391
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11392
+ ggml_fp16_t * const wdata_src = wdata + nk;
11416
11393
 
11417
- for (int i0 = 0; i0 < ne0; i0++) {
11418
- ggml_vec_dot_f16(ew0, dst_data + i0,
11419
- (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
11420
- (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
11394
+ for (int i1 = ir0; i1 < ir1; i1++) {
11395
+ float * dst_data = (float *)((char *) dst->data + i1*nb1);
11396
+ ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
11397
+ for (int i10 = 0; i10 < ne10; i10++) {
11398
+ const int i1n = i10*ne11;
11399
+ for (int i00 = 0; i00 < ne00; i00++) {
11400
+ float v = 0;
11401
+ ggml_vec_dot_f16(ne02, &v,
11402
+ (ggml_fp16_t *) wdata_src + i1n,
11403
+ (ggml_fp16_t *) wdata_kernel + i00*ne02);
11404
+ dst_data[i10*s0 + i00] += v;
11421
11405
  }
11422
11406
  }
11423
11407
  }
11424
11408
  }
11425
11409
 
11426
- static void ggml_compute_forward_conv_1d_f32(
11410
+ static void ggml_compute_forward_conv_transpose_1d_f32(
11427
11411
  const struct ggml_compute_params * params,
11428
11412
  const struct ggml_tensor * src0,
11429
11413
  const struct ggml_tensor * src1,
@@ -11440,13 +11424,7 @@ static void ggml_compute_forward_conv_1d_f32(
11440
11424
  const int ith = params->ith;
11441
11425
  const int nth = params->nth;
11442
11426
 
11443
- const int nk = ne00;
11444
-
11445
- const int ew0 = nk*ne01;
11446
-
11447
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11448
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11449
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11427
+ const int nk = ne00*ne01*ne02;
11450
11428
 
11451
11429
  GGML_ASSERT(nb00 == sizeof(float));
11452
11430
  GGML_ASSERT(nb10 == sizeof(float));
@@ -11454,23 +11432,37 @@ static void ggml_compute_forward_conv_1d_f32(
11454
11432
  if (params->type == GGML_TASK_INIT) {
11455
11433
  memset(params->wdata, 0, params->wsize);
11456
11434
 
11457
- float * const wdata = (float *) params->wdata + 0;
11435
+ // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11436
+ {
11437
+ float * const wdata = (float *) params->wdata + 0;
11438
+
11439
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
11440
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11441
+ const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
11442
+ float * dst_data = wdata + i01*ne00*ne02;
11443
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11444
+ dst_data[i00*ne02 + i02] = src[i00];
11445
+ }
11446
+ }
11447
+ }
11448
+ }
11458
11449
 
11459
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11460
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11450
+ // prepare source data (src1)
11451
+ {
11452
+ float * const wdata = (float *) params->wdata + nk;
11461
11453
  float * dst_data = wdata;
11462
11454
 
11463
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11464
- for (int64_t ik = 0; ik < nk; ik++) {
11465
- const int idx0 = i0*s0 + ik*d0 - p0;
11466
-
11467
- if(!(idx0 < 0 || idx0 >= ne10)) {
11468
- dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
11469
- }
11455
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
11456
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
11457
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
11458
+ dst_data[i10*ne11 + i11] = src[i10];
11470
11459
  }
11471
11460
  }
11472
11461
  }
11473
11462
 
11463
+ // need to zero dst since we are accumulating into it
11464
+ memset(dst->data, 0, ggml_nbytes(dst));
11465
+
11474
11466
  return;
11475
11467
  }
11476
11468
 
@@ -11478,8 +11470,10 @@ static void ggml_compute_forward_conv_1d_f32(
11478
11470
  return;
11479
11471
  }
11480
11472
 
11473
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11474
+
11481
11475
  // total rows in dst
11482
- const int nr = ne02;
11476
+ const int nr = ne1;
11483
11477
 
11484
11478
  // rows per thread
11485
11479
  const int dr = (nr + nth - 1)/nth;
@@ -11488,441 +11482,8 @@ static void ggml_compute_forward_conv_1d_f32(
11488
11482
  const int ir0 = dr*ith;
11489
11483
  const int ir1 = MIN(ir0 + dr, nr);
11490
11484
 
11491
- float * const wdata = (float *) params->wdata + 0;
11492
-
11493
- for (int i2 = 0; i2 < ne2; i2++) {
11494
- for (int i1 = ir0; i1 < ir1; i1++) {
11495
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11496
-
11497
- for (int i0 = 0; i0 < ne0; i0++) {
11498
- ggml_vec_dot_f32(ew0, dst_data + i0,
11499
- (float *) ((char *) src0->data + i1*nb02),
11500
- (float *) wdata + i2*nb2 + i0*ew0);
11501
- }
11502
- }
11503
- }
11504
- }
11505
-
11506
- // TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
11507
- static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
11508
- ggml_fp16_t * A,
11509
- ggml_fp16_t * B,
11510
- float * C,
11511
- const int ith, const int nth) {
11512
- // does not seem to make a difference
11513
- int64_t m0, m1, n0, n1;
11514
- // patches per thread
11515
- if (m > n) {
11516
- n0 = 0;
11517
- n1 = n;
11518
-
11519
- // total patches in dst
11520
- const int np = m;
11521
-
11522
- // patches per thread
11523
- const int dp = (np + nth - 1)/nth;
11524
-
11525
- // patch range for this thread
11526
- m0 = dp*ith;
11527
- m1 = MIN(m0 + dp, np);
11528
- } else {
11529
- m0 = 0;
11530
- m1 = m;
11531
-
11532
- // total patches in dst
11533
- const int np = n;
11534
-
11535
- // patches per thread
11536
- const int dp = (np + nth - 1)/nth;
11537
-
11538
- // patch range for this thread
11539
- n0 = dp*ith;
11540
- n1 = MIN(n0 + dp, np);
11541
- }
11542
-
11543
- // block-tiling attempt
11544
- int64_t blck_n = 16;
11545
- int64_t blck_m = 16;
11546
-
11547
- // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
11548
- // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
11549
- // if (blck_size > 0) {
11550
- // blck_0 = 4;
11551
- // blck_1 = blck_size / blck_0;
11552
- // if (blck_1 < 0) {
11553
- // blck_1 = 1;
11554
- // }
11555
- // // blck_0 = (int64_t)sqrt(blck_size);
11556
- // // blck_1 = blck_0;
11557
- // }
11558
- // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
11559
-
11560
- for (int j = n0; j < n1; j+=blck_n) {
11561
- for (int i = m0; i < m1; i+=blck_m) {
11562
- // printf("i j k => %d %d %d\n", i, j, K);
11563
- for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
11564
- for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
11565
- ggml_vec_dot_f16(k,
11566
- C + ii*n + jj,
11567
- A + ii * k,
11568
- B + jj * k);
11569
- }
11570
- }
11571
- }
11572
- }
11573
- }
11574
-
11575
- // src0: kernel [OC, IC, K]
11576
- // src1: signal [N, IC, IL]
11577
- // dst: result [N, OL, IC*K]
11578
- static void ggml_compute_forward_conv_1d_stage_0_f32(
11579
- const struct ggml_compute_params * params,
11580
- const struct ggml_tensor * src0,
11581
- const struct ggml_tensor * src1,
11582
- struct ggml_tensor * dst) {
11583
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11584
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11585
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
11586
-
11587
- int64_t t0 = ggml_perf_time_us();
11588
- UNUSED(t0);
11589
-
11590
- GGML_TENSOR_BINARY_OP_LOCALS;
11591
-
11592
- const int64_t N = ne12;
11593
- const int64_t IC = ne11;
11594
- const int64_t IL = ne10;
11595
-
11596
- const int64_t K = ne00;
11597
-
11598
- const int64_t OL = ne1;
11599
-
11600
- const int ith = params->ith;
11601
- const int nth = params->nth;
11602
-
11603
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11604
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11605
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11606
-
11607
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11608
- GGML_ASSERT(nb10 == sizeof(float));
11609
-
11610
- if (params->type == GGML_TASK_INIT) {
11611
- memset(dst->data, 0, ggml_nbytes(dst));
11612
- return;
11613
- }
11614
-
11615
- if (params->type == GGML_TASK_FINALIZE) {
11616
- return;
11617
- }
11618
-
11619
- // im2col: [N, IC, IL] => [N, OL, IC*K]
11620
- {
11621
- ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
11622
-
11623
- for (int64_t in = 0; in < N; in++) {
11624
- for (int64_t iol = 0; iol < OL; iol++) {
11625
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11626
-
11627
- // micro kernel
11628
- ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
11629
- const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
11630
-
11631
- for (int64_t ik = 0; ik < K; ik++) {
11632
- const int64_t iil = iol*s0 + ik*d0 - p0;
11633
-
11634
- if (!(iil < 0 || iil >= IL)) {
11635
- dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
11636
- }
11637
- }
11638
- }
11639
- }
11640
- }
11641
- }
11642
- }
11643
-
11644
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11645
- // src0: [OC, IC, K]
11646
- // src1: [N, OL, IC * K]
11647
- // result: [N, OC, OL]
11648
- static void ggml_compute_forward_conv_1d_stage_1_f16(
11649
- const struct ggml_compute_params * params,
11650
- const struct ggml_tensor * src0,
11651
- const struct ggml_tensor * src1,
11652
- struct ggml_tensor * dst) {
11653
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11654
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
11655
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11656
-
11657
- int64_t t0 = ggml_perf_time_us();
11658
- UNUSED(t0);
11659
-
11660
- if (params->type == GGML_TASK_INIT) {
11661
- return;
11662
- }
11663
-
11664
- if (params->type == GGML_TASK_FINALIZE) {
11665
- return;
11666
- }
11667
-
11668
- GGML_TENSOR_BINARY_OP_LOCALS;
11669
-
11670
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11671
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
11672
- GGML_ASSERT(nb0 == sizeof(float));
11673
-
11674
- const int N = ne12;
11675
- const int OL = ne11;
11676
-
11677
- const int OC = ne02;
11678
- const int IC = ne01;
11679
- const int K = ne00;
11680
-
11681
- const int ith = params->ith;
11682
- const int nth = params->nth;
11683
-
11684
- int64_t m = OC;
11685
- int64_t n = OL;
11686
- int64_t k = IC * K;
11687
-
11688
- // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11689
- for (int i = 0; i < N; i++) {
11690
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
11691
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
11692
- float * C = (float *)dst->data + i * m * n; // [m, n]
11693
-
11694
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
11695
- }
11696
- }
11697
-
11698
- static void ggml_compute_forward_conv_1d(
11699
- const struct ggml_compute_params * params,
11700
- const struct ggml_tensor * src0,
11701
- const struct ggml_tensor * src1,
11702
- struct ggml_tensor * dst) {
11703
- switch(src0->type) {
11704
- case GGML_TYPE_F16:
11705
- {
11706
- ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
11707
- } break;
11708
- case GGML_TYPE_F32:
11709
- {
11710
- ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
11711
- } break;
11712
- default:
11713
- {
11714
- GGML_ASSERT(false);
11715
- } break;
11716
- }
11717
- }
11718
-
11719
- static void ggml_compute_forward_conv_1d_stage_0(
11720
- const struct ggml_compute_params * params,
11721
- const struct ggml_tensor * src0,
11722
- const struct ggml_tensor * src1,
11723
- struct ggml_tensor * dst) {
11724
- switch(src0->type) {
11725
- case GGML_TYPE_F16:
11726
- {
11727
- ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
11728
- } break;
11729
- default:
11730
- {
11731
- GGML_ASSERT(false);
11732
- } break;
11733
- }
11734
- }
11735
-
11736
- static void ggml_compute_forward_conv_1d_stage_1(
11737
- const struct ggml_compute_params * params,
11738
- const struct ggml_tensor * src0,
11739
- const struct ggml_tensor * src1,
11740
- struct ggml_tensor * dst) {
11741
- switch(src0->type) {
11742
- case GGML_TYPE_F16:
11743
- {
11744
- ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
11745
- } break;
11746
- default:
11747
- {
11748
- GGML_ASSERT(false);
11749
- } break;
11750
- }
11751
- }
11752
-
11753
- // ggml_compute_forward_conv_transpose_1d
11754
-
11755
- static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11756
- const struct ggml_compute_params * params,
11757
- const struct ggml_tensor * src0,
11758
- const struct ggml_tensor * src1,
11759
- struct ggml_tensor * dst) {
11760
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11761
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11762
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11763
-
11764
- int64_t t0 = ggml_perf_time_us();
11765
- UNUSED(t0);
11766
-
11767
- GGML_TENSOR_BINARY_OP_LOCALS
11768
-
11769
- const int ith = params->ith;
11770
- const int nth = params->nth;
11771
-
11772
- const int nk = ne00*ne01*ne02;
11773
-
11774
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11775
- GGML_ASSERT(nb10 == sizeof(float));
11776
-
11777
- if (params->type == GGML_TASK_INIT) {
11778
- memset(params->wdata, 0, params->wsize);
11779
-
11780
- // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11781
- {
11782
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11783
-
11784
- for (int64_t i02 = 0; i02 < ne02; i02++) {
11785
- for (int64_t i01 = 0; i01 < ne01; i01++) {
11786
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11787
- ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11788
- for (int64_t i00 = 0; i00 < ne00; i00++) {
11789
- dst_data[i00*ne02 + i02] = src[i00];
11790
- }
11791
- }
11792
- }
11793
- }
11794
-
11795
- // permute source data (src1) from (L x Cin) to (Cin x L)
11796
- {
11797
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11798
- ggml_fp16_t * dst_data = wdata;
11799
-
11800
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11801
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11802
- for (int64_t i10 = 0; i10 < ne10; i10++) {
11803
- dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
11804
- }
11805
- }
11806
- }
11807
-
11808
- // need to zero dst since we are accumulating into it
11809
- memset(dst->data, 0, ggml_nbytes(dst));
11810
-
11811
- return;
11812
- }
11813
-
11814
- if (params->type == GGML_TASK_FINALIZE) {
11815
- return;
11816
- }
11817
-
11818
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11819
-
11820
- // total rows in dst
11821
- const int nr = ne1;
11822
-
11823
- // rows per thread
11824
- const int dr = (nr + nth - 1)/nth;
11825
-
11826
- // row range for this thread
11827
- const int ir0 = dr*ith;
11828
- const int ir1 = MIN(ir0 + dr, nr);
11829
-
11830
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11831
- ggml_fp16_t * const wdata_src = wdata + nk;
11832
-
11833
- for (int i1 = ir0; i1 < ir1; i1++) {
11834
- float * dst_data = (float *)((char *) dst->data + i1*nb1);
11835
- ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
11836
- for (int i10 = 0; i10 < ne10; i10++) {
11837
- const int i1n = i10*ne11;
11838
- for (int i00 = 0; i00 < ne00; i00++) {
11839
- float v = 0;
11840
- ggml_vec_dot_f16(ne02, &v,
11841
- (ggml_fp16_t *) wdata_src + i1n,
11842
- (ggml_fp16_t *) wdata_kernel + i00*ne02);
11843
- dst_data[i10*s0 + i00] += v;
11844
- }
11845
- }
11846
- }
11847
- }
11848
-
11849
- static void ggml_compute_forward_conv_transpose_1d_f32(
11850
- const struct ggml_compute_params * params,
11851
- const struct ggml_tensor * src0,
11852
- const struct ggml_tensor * src1,
11853
- struct ggml_tensor * dst) {
11854
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
11855
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11856
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11857
-
11858
- int64_t t0 = ggml_perf_time_us();
11859
- UNUSED(t0);
11860
-
11861
- GGML_TENSOR_BINARY_OP_LOCALS
11862
-
11863
- const int ith = params->ith;
11864
- const int nth = params->nth;
11865
-
11866
- const int nk = ne00*ne01*ne02;
11867
-
11868
- GGML_ASSERT(nb00 == sizeof(float));
11869
- GGML_ASSERT(nb10 == sizeof(float));
11870
-
11871
- if (params->type == GGML_TASK_INIT) {
11872
- memset(params->wdata, 0, params->wsize);
11873
-
11874
- // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11875
- {
11876
- float * const wdata = (float *) params->wdata + 0;
11877
-
11878
- for (int64_t i02 = 0; i02 < ne02; i02++) {
11879
- for (int64_t i01 = 0; i01 < ne01; i01++) {
11880
- const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
11881
- float * dst_data = wdata + i01*ne00*ne02;
11882
- for (int64_t i00 = 0; i00 < ne00; i00++) {
11883
- dst_data[i00*ne02 + i02] = src[i00];
11884
- }
11885
- }
11886
- }
11887
- }
11888
-
11889
- // prepare source data (src1)
11890
- {
11891
- float * const wdata = (float *) params->wdata + nk;
11892
- float * dst_data = wdata;
11893
-
11894
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11895
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11896
- for (int64_t i10 = 0; i10 < ne10; i10++) {
11897
- dst_data[i10*ne11 + i11] = src[i10];
11898
- }
11899
- }
11900
- }
11901
-
11902
- // need to zero dst since we are accumulating into it
11903
- memset(dst->data, 0, ggml_nbytes(dst));
11904
-
11905
- return;
11906
- }
11907
-
11908
- if (params->type == GGML_TASK_FINALIZE) {
11909
- return;
11910
- }
11911
-
11912
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11913
-
11914
- // total rows in dst
11915
- const int nr = ne1;
11916
-
11917
- // rows per thread
11918
- const int dr = (nr + nth - 1)/nth;
11919
-
11920
- // row range for this thread
11921
- const int ir0 = dr*ith;
11922
- const int ir1 = MIN(ir0 + dr, nr);
11923
-
11924
- float * const wdata = (float *) params->wdata + 0;
11925
- float * const wdata_src = wdata + nk;
11485
+ float * const wdata = (float *) params->wdata + 0;
11486
+ float * const wdata_src = wdata + nk;
11926
11487
 
11927
11488
  for (int i1 = ir0; i1 < ir1; i1++) {
11928
11489
  float * dst_data = (float *)((char *) dst->data + i1*nb1);
@@ -11961,12 +11522,10 @@ static void ggml_compute_forward_conv_transpose_1d(
11961
11522
  }
11962
11523
  }
11963
11524
 
11964
- // ggml_compute_forward_conv_2d
11965
-
11966
11525
  // src0: kernel [OC, IC, KH, KW]
11967
11526
  // src1: image [N, IC, IH, IW]
11968
11527
  // dst: result [N, OH, OW, IC*KH*KW]
11969
- static void ggml_compute_forward_conv_2d_stage_0_f32(
11528
+ static void ggml_compute_forward_im2col_f16(
11970
11529
  const struct ggml_compute_params * params,
11971
11530
  const struct ggml_tensor * src0,
11972
11531
  const struct ggml_tensor * src1,
@@ -11980,34 +11539,35 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
11980
11539
 
11981
11540
  GGML_TENSOR_BINARY_OP_LOCALS;
11982
11541
 
11983
- const int64_t N = ne13;
11984
- const int64_t IC = ne12;
11985
- const int64_t IH = ne11;
11542
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
11543
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
11544
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
11545
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
11546
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
11547
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
11548
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
11549
+
11550
+ const int ith = params->ith;
11551
+ const int nth = params->nth;
11552
+
11553
+ const int64_t N = is_2D ? ne13 : ne12;
11554
+ const int64_t IC = is_2D ? ne12 : ne11;
11555
+ const int64_t IH = is_2D ? ne11 : 1;
11986
11556
  const int64_t IW = ne10;
11987
11557
 
11988
- // const int64_t OC = ne03;
11989
- // const int64_t IC = ne02;
11990
- const int64_t KH = ne01;
11558
+ const int64_t KH = is_2D ? ne01 : 1;
11991
11559
  const int64_t KW = ne00;
11992
11560
 
11993
- const int64_t OH = ne2;
11561
+ const int64_t OH = is_2D ? ne2 : 1;
11994
11562
  const int64_t OW = ne1;
11995
11563
 
11996
- const int ith = params->ith;
11997
- const int nth = params->nth;
11998
-
11999
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12000
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12001
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12002
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12003
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12004
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
11564
+ int ofs0 = is_2D ? nb13 : nb12;
11565
+ int ofs1 = is_2D ? nb12 : nb11;
12005
11566
 
12006
11567
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12007
11568
  GGML_ASSERT(nb10 == sizeof(float));
12008
11569
 
12009
11570
  if (params->type == GGML_TASK_INIT) {
12010
- memset(dst->data, 0, ggml_nbytes(dst));
12011
11571
  return;
12012
11572
  }
12013
11573
 
@@ -12020,20 +11580,22 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12020
11580
  ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
12021
11581
 
12022
11582
  for (int64_t in = 0; in < N; in++) {
12023
- for (int64_t ioh = 0; ioh < OH; ioh++) {
11583
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12024
11584
  for (int64_t iow = 0; iow < OW; iow++) {
12025
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11585
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12026
11586
 
12027
11587
  // micro kernel
12028
11588
  ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12029
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
11589
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12030
11590
 
12031
- for (int64_t ikh = 0; ikh < KH; ikh++) {
11591
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12032
11592
  for (int64_t ikw = 0; ikw < KW; ikw++) {
12033
11593
  const int64_t iiw = iow*s0 + ikw*d0 - p0;
12034
11594
  const int64_t iih = ioh*s1 + ikh*d1 - p1;
12035
11595
 
12036
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
11596
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
11597
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
11598
+ } else {
12037
11599
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12038
11600
  }
12039
11601
  }
@@ -12045,223 +11607,7 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12045
11607
  }
12046
11608
  }
12047
11609
 
12048
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12049
- // src0: [OC, IC, KH, KW]
12050
- // src1: [N, OH, OW, IC * KH * KW]
12051
- // result: [N, OC, OH, OW]
12052
- static void ggml_compute_forward_conv_2d_stage_1_f16(
12053
- const struct ggml_compute_params * params,
12054
- const struct ggml_tensor * src0,
12055
- const struct ggml_tensor * src1,
12056
- struct ggml_tensor * dst) {
12057
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12058
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
12059
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12060
-
12061
- int64_t t0 = ggml_perf_time_us();
12062
- UNUSED(t0);
12063
-
12064
- if (params->type == GGML_TASK_INIT) {
12065
- return;
12066
- }
12067
-
12068
- if (params->type == GGML_TASK_FINALIZE) {
12069
- return;
12070
- }
12071
-
12072
- GGML_TENSOR_BINARY_OP_LOCALS;
12073
-
12074
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12075
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
12076
- GGML_ASSERT(nb0 == sizeof(float));
12077
-
12078
- const int N = ne13;
12079
- const int OH = ne12;
12080
- const int OW = ne11;
12081
-
12082
- const int OC = ne03;
12083
- const int IC = ne02;
12084
- const int KH = ne01;
12085
- const int KW = ne00;
12086
-
12087
- const int ith = params->ith;
12088
- const int nth = params->nth;
12089
-
12090
- int64_t m = OC;
12091
- int64_t n = OH * OW;
12092
- int64_t k = IC * KH * KW;
12093
-
12094
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12095
- for (int i = 0; i < N; i++) {
12096
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12097
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
12098
- float * C = (float *)dst->data + i * m * n; // [m, n]
12099
-
12100
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12101
- }
12102
- }
12103
-
12104
- static void ggml_compute_forward_conv_2d_f16_f32(
12105
- const struct ggml_compute_params * params,
12106
- const struct ggml_tensor * src0,
12107
- const struct ggml_tensor * src1,
12108
- struct ggml_tensor * dst) {
12109
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12110
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12111
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12112
-
12113
- int64_t t0 = ggml_perf_time_us();
12114
- UNUSED(t0);
12115
-
12116
- GGML_TENSOR_BINARY_OP_LOCALS
12117
-
12118
- // src1: image [N, IC, IH, IW]
12119
- // src0: kernel [OC, IC, KH, KW]
12120
- // dst: result [N, OC, OH, OW]
12121
- // ne12: IC
12122
- // ne0: OW
12123
- // ne1: OH
12124
- // nk0: KW
12125
- // nk1: KH
12126
- // ne13: N
12127
-
12128
- const int N = ne13;
12129
- const int IC = ne12;
12130
- const int IH = ne11;
12131
- const int IW = ne10;
12132
-
12133
- const int OC = ne03;
12134
- // const int IC = ne02;
12135
- const int KH = ne01;
12136
- const int KW = ne00;
12137
-
12138
- const int OH = ne1;
12139
- const int OW = ne0;
12140
-
12141
- const int ith = params->ith;
12142
- const int nth = params->nth;
12143
-
12144
- // const int nk0 = ne00;
12145
- // const int nk1 = ne01;
12146
-
12147
- // size of the convolution row - the kernel size unrolled across all channels
12148
- // const int ew0 = nk0*nk1*ne02;
12149
- // ew0: IC*KH*KW
12150
-
12151
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12152
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12153
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12154
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12155
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12156
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
12157
-
12158
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12159
- GGML_ASSERT(nb10 == sizeof(float));
12160
-
12161
- if (params->type == GGML_TASK_INIT) {
12162
- memset(params->wdata, 0, params->wsize);
12163
-
12164
- // prepare source data (src1)
12165
- // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
12166
-
12167
- {
12168
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12169
-
12170
- for (int in = 0; in < N; in++) {
12171
- for (int iic = 0; iic < IC; iic++) {
12172
- for (int ioh = 0; ioh < OH; ioh++) {
12173
- for (int iow = 0; iow < OW; iow++) {
12174
-
12175
- // micro kernel
12176
- ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12177
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
12178
-
12179
- for (int ikh = 0; ikh < KH; ikh++) {
12180
- for (int ikw = 0; ikw < KW; ikw++) {
12181
- const int iiw = iow*s0 + ikw*d0 - p0;
12182
- const int iih = ioh*s1 + ikh*d1 - p1;
12183
-
12184
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
12185
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12186
- }
12187
- }
12188
- }
12189
- }
12190
- }
12191
- }
12192
- }
12193
- }
12194
-
12195
- return;
12196
- }
12197
-
12198
- if (params->type == GGML_TASK_FINALIZE) {
12199
- return;
12200
- }
12201
-
12202
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12203
- // wdata: [N*OH*OW, IC*KH*KW]
12204
- // dst: result [N, OC, OH, OW]
12205
- // src0: kernel [OC, IC, KH, KW]
12206
-
12207
- int64_t m = OC;
12208
- int64_t n = OH * OW;
12209
- int64_t k = IC * KH * KW;
12210
-
12211
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12212
- for (int i = 0; i < N; i++) {
12213
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12214
- ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
12215
- float * C = (float *)dst->data + i * m * n; // [m * k]
12216
-
12217
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12218
- }
12219
- }
12220
-
12221
- static void ggml_compute_forward_conv_2d(
12222
- const struct ggml_compute_params * params,
12223
- const struct ggml_tensor * src0,
12224
- const struct ggml_tensor * src1,
12225
- struct ggml_tensor * dst) {
12226
- switch (src0->type) {
12227
- case GGML_TYPE_F16:
12228
- {
12229
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
12230
- } break;
12231
- case GGML_TYPE_F32:
12232
- {
12233
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
12234
- GGML_ASSERT(false);
12235
- } break;
12236
- default:
12237
- {
12238
- GGML_ASSERT(false);
12239
- } break;
12240
- }
12241
- }
12242
-
12243
- static void ggml_compute_forward_conv_2d_stage_0(
12244
- const struct ggml_compute_params * params,
12245
- const struct ggml_tensor * src0,
12246
- const struct ggml_tensor * src1,
12247
- struct ggml_tensor * dst) {
12248
- switch (src0->type) {
12249
- case GGML_TYPE_F16:
12250
- {
12251
- ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
12252
- } break;
12253
- case GGML_TYPE_F32:
12254
- {
12255
- GGML_ASSERT(false);
12256
- } break;
12257
- default:
12258
- {
12259
- GGML_ASSERT(false);
12260
- } break;
12261
- }
12262
- }
12263
-
12264
- static void ggml_compute_forward_conv_2d_stage_1(
11610
+ static void ggml_compute_forward_im2col(
12265
11611
  const struct ggml_compute_params * params,
12266
11612
  const struct ggml_tensor * src0,
12267
11613
  const struct ggml_tensor * src1,
@@ -12269,7 +11615,7 @@ static void ggml_compute_forward_conv_2d_stage_1(
12269
11615
  switch (src0->type) {
12270
11616
  case GGML_TYPE_F16:
12271
11617
  {
12272
- ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
11618
+ ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12273
11619
  } break;
12274
11620
  case GGML_TYPE_F32:
12275
11621
  {
@@ -12454,14 +11800,11 @@ static void ggml_compute_forward_pool_1d(
12454
11800
  ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
12455
11801
  }
12456
11802
 
12457
- // ggml_compute_forward_pool_2d_sk_p0
11803
+ // ggml_compute_forward_pool_2d
12458
11804
 
12459
- static void ggml_compute_forward_pool_2d_sk_p0(
11805
+ static void ggml_compute_forward_pool_2d(
12460
11806
  const struct ggml_compute_params * params,
12461
- const enum ggml_op_pool op,
12462
11807
  const struct ggml_tensor * src,
12463
- const int k0,
12464
- const int k1,
12465
11808
  struct ggml_tensor * dst) {
12466
11809
  assert(src->type == GGML_TYPE_F32);
12467
11810
  assert(params->ith == 0);
@@ -12470,6 +11813,14 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12470
11813
  return;
12471
11814
  }
12472
11815
 
11816
+ const int32_t * opts = (const int32_t *)dst->op_params;
11817
+ enum ggml_op_pool op = opts[0];
11818
+ const int k0 = opts[1];
11819
+ const int k1 = opts[2];
11820
+ const int s0 = opts[3];
11821
+ const int s1 = opts[4];
11822
+ const int p0 = opts[5];
11823
+ const int p1 = opts[6];
12473
11824
  const char * cdata = (const char*)src->data;
12474
11825
  const char * const data_end = cdata + ggml_nbytes(src);
12475
11826
 
@@ -12480,6 +11831,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12480
11831
  float * dplane = (float *)dst->data;
12481
11832
 
12482
11833
  const int ka = k0 * k1;
11834
+ const int offset0 = -p0;
11835
+ const int offset1 = -p1;
12483
11836
 
12484
11837
  while (cdata < data_end) {
12485
11838
  for (int oy = 0; oy < py; ++oy) {
@@ -12492,13 +11845,15 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12492
11845
  case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
12493
11846
  }
12494
11847
 
12495
- const int ix = ox * k0;
12496
- const int iy = oy * k1;
11848
+ const int ix = offset0 + ox * s0;
11849
+ const int iy = offset1 + oy * s1;
12497
11850
 
12498
11851
  for (int ky = 0; ky < k1; ++ky) {
11852
+ if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
12499
11853
  const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
12500
11854
  for (int kx = 0; kx < k0; ++kx) {
12501
11855
  int j = ix + kx;
11856
+ if (j < 0 || j >= src->ne[0]) continue;
12502
11857
  switch (op) {
12503
11858
  case GGML_OP_POOL_AVG: *out += srow[j]; break;
12504
11859
  case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
@@ -12515,31 +11870,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12515
11870
  }
12516
11871
 
12517
11872
  cdata += src->nb[2];
12518
- dplane += pa;
12519
- }
12520
- }
12521
-
12522
- // ggml_compute_forward_pool_2d
12523
-
12524
- static void ggml_compute_forward_pool_2d(
12525
- const struct ggml_compute_params * params,
12526
- const struct ggml_tensor * src0,
12527
- struct ggml_tensor * dst) {
12528
-
12529
- const int32_t * opts = (const int32_t *)dst->op_params;
12530
- enum ggml_op_pool op = opts[0];
12531
- const int k0 = opts[1];
12532
- const int k1 = opts[2];
12533
- const int s0 = opts[3];
12534
- const int s1 = opts[4];
12535
- const int p0 = opts[5];
12536
- const int p1 = opts[6];
12537
- GGML_ASSERT(p0 == 0);
12538
- GGML_ASSERT(p1 == 0); // padding not supported
12539
- GGML_ASSERT(k0 == s0);
12540
- GGML_ASSERT(k1 == s1); // only s = k supported
12541
-
12542
- ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
11873
+ dplane += pa;
11874
+ }
12543
11875
  }
12544
11876
 
12545
11877
  // ggml_compute_forward_upscale
@@ -13743,6 +13075,10 @@ static void ggml_compute_forward_unary(
13743
13075
  {
13744
13076
  ggml_compute_forward_silu(params, src0, dst);
13745
13077
  } break;
13078
+ case GGML_UNARY_OP_LEAKY:
13079
+ {
13080
+ ggml_compute_forward_leaky(params, src0, dst);
13081
+ } break;
13746
13082
  default:
13747
13083
  {
13748
13084
  GGML_ASSERT(false);
@@ -14496,33 +13832,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14496
13832
  {
14497
13833
  ggml_compute_forward_clamp(params, tensor->src[0], tensor);
14498
13834
  } break;
14499
- case GGML_OP_CONV_1D:
14500
- {
14501
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
14502
- } break;
14503
- case GGML_OP_CONV_1D_STAGE_0:
14504
- {
14505
- ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14506
- } break;
14507
- case GGML_OP_CONV_1D_STAGE_1:
14508
- {
14509
- ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
14510
- } break;
14511
13835
  case GGML_OP_CONV_TRANSPOSE_1D:
14512
13836
  {
14513
13837
  ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
14514
13838
  } break;
14515
- case GGML_OP_CONV_2D:
14516
- {
14517
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14518
- } break;
14519
- case GGML_OP_CONV_2D_STAGE_0:
13839
+ case GGML_OP_IM2COL:
14520
13840
  {
14521
- ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14522
- } break;
14523
- case GGML_OP_CONV_2D_STAGE_1:
14524
- {
14525
- ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
13841
+ ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
14526
13842
  } break;
14527
13843
  case GGML_OP_CONV_TRANSPOSE_2D:
14528
13844
  {
@@ -14651,62 +13967,109 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14651
13967
 
14652
13968
  ////////////////////////////////////////////////////////////////////////////////
14653
13969
 
14654
- static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
13970
+ static size_t ggml_hash_size(size_t min_sz) {
13971
+ // next primes after powers of two
13972
+ static const size_t primes[] = {
13973
+ 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
13974
+ 2053, 4099, 8209, 16411, 32771, 65537, 131101,
13975
+ 262147, 524309, 1048583, 2097169, 4194319, 8388617,
13976
+ 16777259, 33554467, 67108879, 134217757, 268435459,
13977
+ 536870923, 1073741827, 2147483659
13978
+ };
13979
+ static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
13980
+
13981
+ // find the smallest prime that is larger or equal to min_sz
13982
+ size_t l = 0;
13983
+ size_t r = n_primes;
13984
+ while (l < r) {
13985
+ size_t m = (l + r)/2;
13986
+ if (primes[m] < min_sz) {
13987
+ l = m + 1;
13988
+ } else {
13989
+ r = m;
13990
+ }
13991
+ }
13992
+ size_t sz = l < n_primes ? primes[l] : min_sz | 1;
13993
+ return sz;
13994
+ }
14655
13995
 
14656
- static size_t hash(void * p) {
14657
- return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
13996
+ static size_t ggml_hash(const void * p) {
13997
+ return (size_t)p;
14658
13998
  }
14659
13999
 
14660
- static size_t hash_find(void * hash_table[], void * p) {
14661
- size_t h = hash(p);
14000
+ size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14001
+ size_t h = ggml_hash(key) % hash_set.size;
14662
14002
 
14663
14003
  // linear probing
14664
14004
  size_t i = h;
14665
- while (hash_table[i] != NULL && hash_table[i] != p) {
14666
- i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
14005
+ while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
14006
+ i = (i + 1) % hash_set.size;
14667
14007
  if (i == h) {
14668
14008
  // visited all hash table entries -> not found
14669
- return GGML_GRAPH_HASHTABLE_SIZE;
14009
+ return GGML_HASHTABLE_FULL;
14670
14010
  }
14671
14011
  }
14672
14012
  return i;
14673
14013
  }
14674
14014
 
14675
- static bool hash_insert(void * hash_table[], void * p) {
14676
- size_t i = hash_find(hash_table, p);
14015
+ bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14016
+ size_t i = ggml_hash_find(hash_set, key);
14017
+ return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
14018
+ }
14019
+
14020
+ size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14021
+ size_t i = ggml_hash_find(hash_set, key);
14677
14022
 
14678
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14023
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14679
14024
 
14680
- if (hash_table[i] == p) {
14681
- return true;
14025
+ if (hash_set.keys[i] == key) {
14026
+ return GGML_HASHTABLE_ALREADY_EXISTS;
14682
14027
  }
14683
14028
 
14684
14029
  // insert
14685
- GGML_ASSERT(hash_table[i] == NULL);
14686
- hash_table[i] = p;
14687
- return false;
14030
+ GGML_ASSERT(hash_set.keys[i] == NULL);
14031
+ hash_set.keys[i] = key;
14032
+ return i;
14033
+ }
14034
+
14035
+ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14036
+ size_t i = ggml_hash_find(hash_set, key);
14037
+
14038
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14039
+
14040
+ hash_set.keys[i] = key;
14041
+ return i;
14042
+ }
14043
+
14044
+ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
14045
+ size = ggml_hash_size(size);
14046
+ struct ggml_hash_set result;
14047
+ result.size = size;
14048
+ result.keys = malloc(sizeof(struct ggml_tensor *) * size);
14049
+ memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
14050
+ return result;
14688
14051
  }
14689
14052
 
14690
- static bool hash_contains(void * hash_table[], void * p) {
14691
- size_t i = hash_find(hash_table, p);
14692
- return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
14053
+ static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
14054
+ free(hash_set.keys);
14693
14055
  }
14694
14056
 
14695
14057
  struct hash_map {
14696
- void * keys[GGML_GRAPH_HASHTABLE_SIZE];
14697
- void * vals[GGML_GRAPH_HASHTABLE_SIZE];
14058
+ struct ggml_hash_set set;
14059
+ struct ggml_tensor ** vals;
14698
14060
  };
14699
14061
 
14700
- static struct hash_map * new_hash_map(void) {
14062
+ static struct hash_map * ggml_new_hash_map(size_t size) {
14701
14063
  struct hash_map * result = malloc(sizeof(struct hash_map));
14702
- for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
14703
- result->keys[i] = NULL;
14704
- result->vals[i] = NULL;
14705
- }
14064
+ result->set = ggml_hash_set_new(size);
14065
+ result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
14066
+ memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
14706
14067
  return result;
14707
14068
  }
14708
14069
 
14709
- static void free_hash_map(struct hash_map * map) {
14070
+ static void ggml_hash_map_free(struct hash_map * map) {
14071
+ ggml_hash_set_free(map->set);
14072
+ free(map->vals);
14710
14073
  free(map);
14711
14074
  }
14712
14075
 
@@ -14726,7 +14089,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14726
14089
  return node;
14727
14090
  }
14728
14091
 
14729
- if (!hash_contains(graph->visited_hash_table, node)) {
14092
+ if (!ggml_hash_contains(graph->visited_hash_table, node)) {
14730
14093
  return node;
14731
14094
  }
14732
14095
 
@@ -14741,17 +14104,17 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14741
14104
  return node;
14742
14105
  }
14743
14106
 
14744
- size_t i = hash_find(replacements->keys, node);
14745
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14746
- if (replacements->keys[i] == node) {
14747
- return (struct ggml_tensor *) replacements->vals[i];
14107
+ size_t i = ggml_hash_find(replacements->set, node);
14108
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
14109
+ if (replacements->set.keys[i] == node) {
14110
+ return replacements->vals[i];
14748
14111
  }
14749
14112
 
14750
14113
  struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14751
14114
 
14752
14115
  // insert clone into replacements
14753
- GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
14754
- replacements->keys[i] = node;
14116
+ GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
14117
+ replacements->set.keys[i] = node;
14755
14118
  replacements->vals[i] = clone;
14756
14119
 
14757
14120
  clone->op = node->op;
@@ -14788,26 +14151,26 @@ void ggml_build_backward_gradient_checkpointing(
14788
14151
  struct ggml_cgraph * gb_tmp,
14789
14152
  struct ggml_tensor * * checkpoints,
14790
14153
  int n_checkpoints) {
14791
- *gb_tmp = *gf;
14154
+ ggml_graph_cpy(gf, gb_tmp);
14792
14155
  ggml_build_backward_expand(ctx, gf, gb_tmp, true);
14793
14156
 
14794
14157
  if (n_checkpoints <= 0) {
14795
- *gb = *gb_tmp;
14158
+ ggml_graph_cpy(gb_tmp, gb);
14796
14159
  return;
14797
14160
  }
14798
14161
 
14799
- struct hash_map * replacements = new_hash_map();
14162
+ struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
14800
14163
 
14801
14164
  // insert checkpoints in replacements
14802
14165
  for (int i = 0; i < n_checkpoints; ++i) {
14803
- size_t k = hash_find(replacements->keys, checkpoints[i]);
14804
- GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14805
- GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
14806
- replacements->keys[k] = checkpoints[i];
14807
- replacements->vals[k] = checkpoints[i];
14166
+ size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
14167
+ GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
14168
+ GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
14169
+ replacements->set.keys[k] = checkpoints[i];
14170
+ replacements->vals[k] = checkpoints[i];
14808
14171
  }
14809
14172
 
14810
- *gb = *gf;
14173
+ ggml_graph_cpy(gf, gb);
14811
14174
  // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
14812
14175
  // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
14813
14176
  // by recomputing them from checkpoints
@@ -14824,21 +14187,21 @@ void ggml_build_backward_gradient_checkpointing(
14824
14187
  ggml_build_forward_expand(gb, node);
14825
14188
  }
14826
14189
 
14827
- free_hash_map(replacements);
14190
+ ggml_hash_map_free(replacements);
14828
14191
  }
14829
14192
 
14830
14193
  // functions to change gradients considering the case that input a might be initial gradient with zero value
14831
14194
 
14832
- static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14833
- if (hash_contains(zero_table, a)) {
14195
+ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14196
+ if (ggml_hash_contains(zero_table, a)) {
14834
14197
  return b;
14835
14198
  } else {
14836
14199
  return ggml_add_impl(ctx, a, b, false);
14837
14200
  }
14838
14201
  }
14839
14202
 
14840
- static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) {
14841
- if (hash_contains(zero_table, a)) {
14203
+ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14204
+ if (ggml_hash_contains(zero_table, a)) {
14842
14205
  struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
14843
14206
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
14844
14207
  } else {
@@ -14846,23 +14209,23 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
14846
14209
  }
14847
14210
  }
14848
14211
 
14849
- static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14850
- if (hash_contains(zero_table, a)) {
14212
+ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14213
+ if (ggml_hash_contains(zero_table, a)) {
14851
14214
  return ggml_repeat(ctx, b, a);
14852
14215
  } else {
14853
14216
  return ggml_add1_impl(ctx, a, b, false);
14854
14217
  }
14855
14218
  }
14856
14219
 
14857
- static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14858
- if (hash_contains(zero_table, a)) {
14220
+ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14221
+ if (ggml_hash_contains(zero_table, a)) {
14859
14222
  return ggml_neg(ctx, b);
14860
14223
  } else {
14861
14224
  return ggml_sub_impl(ctx, a, b, false);
14862
14225
  }
14863
14226
  }
14864
14227
 
14865
- static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) {
14228
+ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
14866
14229
  struct ggml_tensor * src0 = tensor->src[0];
14867
14230
  struct ggml_tensor * src1 = tensor->src[1];
14868
14231
 
@@ -15457,31 +14820,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15457
14820
  {
15458
14821
  GGML_ASSERT(false); // TODO: not implemented
15459
14822
  } break;
15460
- case GGML_OP_CONV_1D:
15461
- {
15462
- GGML_ASSERT(false); // TODO: not implemented
15463
- } break;
15464
- case GGML_OP_CONV_1D_STAGE_0:
15465
- {
15466
- GGML_ASSERT(false); // TODO: not implemented
15467
- } break;
15468
- case GGML_OP_CONV_1D_STAGE_1:
15469
- {
15470
- GGML_ASSERT(false); // TODO: not implemented
15471
- } break;
15472
14823
  case GGML_OP_CONV_TRANSPOSE_1D:
15473
14824
  {
15474
14825
  GGML_ASSERT(false); // TODO: not implemented
15475
14826
  } break;
15476
- case GGML_OP_CONV_2D:
15477
- {
15478
- GGML_ASSERT(false); // TODO: not implemented
15479
- } break;
15480
- case GGML_OP_CONV_2D_STAGE_0:
15481
- {
15482
- GGML_ASSERT(false); // TODO: not implemented
15483
- } break;
15484
- case GGML_OP_CONV_2D_STAGE_1:
14827
+ case GGML_OP_IM2COL:
15485
14828
  {
15486
14829
  GGML_ASSERT(false); // TODO: not implemented
15487
14830
  } break;
@@ -15695,7 +15038,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15695
15038
  }
15696
15039
 
15697
15040
  // check if already visited
15698
- if (hash_insert(cgraph->visited_hash_table, node)) {
15041
+ if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
15699
15042
  return;
15700
15043
  }
15701
15044
 
@@ -15711,7 +15054,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15711
15054
 
15712
15055
  if (node->op == GGML_OP_NONE && node->grad == NULL) {
15713
15056
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
15714
- GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15057
+ GGML_ASSERT(cgraph->n_leafs < cgraph->size);
15715
15058
 
15716
15059
  if (strlen(node->name) == 0) {
15717
15060
  ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
@@ -15720,22 +15063,24 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15720
15063
  cgraph->leafs[cgraph->n_leafs] = node;
15721
15064
  cgraph->n_leafs++;
15722
15065
  } else {
15723
- GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15066
+ GGML_ASSERT(cgraph->n_nodes < cgraph->size);
15724
15067
 
15725
15068
  if (strlen(node->name) == 0) {
15726
15069
  ggml_format_name(node, "node_%d", cgraph->n_nodes);
15727
15070
  }
15728
15071
 
15729
15072
  cgraph->nodes[cgraph->n_nodes] = node;
15730
- cgraph->grads[cgraph->n_nodes] = node->grad;
15073
+ if (cgraph->grads) {
15074
+ cgraph->grads[cgraph->n_nodes] = node->grad;
15075
+ }
15731
15076
  cgraph->n_nodes++;
15732
15077
  }
15733
15078
  }
15734
15079
 
15735
15080
  static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
15736
15081
  if (!expand) {
15737
- cgraph->n_nodes = 0;
15738
- cgraph->n_leafs = 0;
15082
+ // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
15083
+ ggml_graph_clear(cgraph);
15739
15084
  }
15740
15085
 
15741
15086
  const int n0 = cgraph->n_nodes;
@@ -15756,25 +15101,6 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
15756
15101
  ggml_build_forward_impl(cgraph, tensor, true);
15757
15102
  }
15758
15103
 
15759
- struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
15760
- struct ggml_cgraph result = {
15761
- /*.n_nodes =*/ 0,
15762
- /*.n_leafs =*/ 0,
15763
- /*.nodes =*/ { NULL },
15764
- /*.grads =*/ { NULL },
15765
- /*.leafs =*/ { NULL },
15766
- /*.hash_table =*/ { NULL },
15767
- /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15768
- /*.perf_runs =*/ 0,
15769
- /*.perf_cycles =*/ 0,
15770
- /*.perf_time_us =*/ 0,
15771
- };
15772
-
15773
- ggml_build_forward_impl(&result, tensor, false);
15774
-
15775
- return result;
15776
- }
15777
-
15778
15104
  void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
15779
15105
  GGML_ASSERT(gf->n_nodes > 0);
15780
15106
 
@@ -15791,11 +15117,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15791
15117
  }
15792
15118
 
15793
15119
  // remember original gradients which start with zero values
15794
- void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE);
15795
- memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE);
15120
+ struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
15796
15121
  for (int i = 0; i < gf->n_nodes; i++) {
15797
15122
  if (gf->grads[i]) {
15798
- hash_insert(zero_table, gf->grads[i]);
15123
+ ggml_hash_insert(zero_table, gf->grads[i]);
15799
15124
  }
15800
15125
  }
15801
15126
 
@@ -15818,26 +15143,54 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15818
15143
  }
15819
15144
  }
15820
15145
 
15821
- free(zero_table);
15146
+ ggml_hash_set_free(zero_table);
15822
15147
  }
15823
15148
 
15824
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
15825
- struct ggml_cgraph result = *gf;
15826
- ggml_build_backward_expand(ctx, gf, &result, keep);
15827
- return result;
15149
+ static size_t ggml_graph_nbytes(size_t size, bool grads) {
15150
+ size_t nbytes = sizeof(struct ggml_cgraph);
15151
+ nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
15152
+ if (grads) {
15153
+ nbytes += size * sizeof(struct ggml_tensor *); // grads
15154
+ }
15155
+ nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
15156
+ return nbytes;
15828
15157
  }
15829
15158
 
15830
- struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15831
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15159
+ size_t ggml_graph_overhead_custom(size_t size, bool grads) {
15160
+ return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
15161
+ }
15162
+
15163
+ size_t ggml_graph_overhead(void) {
15164
+ return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
15165
+ }
15166
+
15167
+ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
15168
+ const size_t obj_size = ggml_graph_nbytes(size, grads);
15169
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15832
15170
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15833
15171
 
15172
+ struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
15173
+
15174
+ size_t hash_size = ggml_hash_size(size * 2);
15175
+ struct ggml_tensor ** nodes_ptr = data_start;
15176
+ struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
15177
+ struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
15178
+ struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
15179
+
15180
+ // check that we allocated the correct amount of memory
15181
+ assert(obj_size == (size_t) (
15182
+ (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
15183
+
15184
+ memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
15185
+
15834
15186
  *cgraph = (struct ggml_cgraph) {
15187
+ /*.size =*/ size,
15835
15188
  /*.n_nodes =*/ 0,
15836
15189
  /*.n_leafs =*/ 0,
15837
- /*.nodes =*/ { NULL },
15838
- /*.grads =*/ { NULL },
15839
- /*.leafs =*/ { NULL },
15840
- /*.hash_table =*/ { NULL },
15190
+ /*.nodes =*/ nodes_ptr,
15191
+ /*.grads =*/ grads_ptr,
15192
+ /*.leafs =*/ leafs_ptr,
15193
+ /*.hash_table =*/ { hash_size, hash_keys_ptr },
15841
15194
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15842
15195
  /*.perf_runs =*/ 0,
15843
15196
  /*.perf_cycles =*/ 0,
@@ -15847,14 +15200,85 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15847
15200
  return cgraph;
15848
15201
  }
15849
15202
 
15850
- struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15851
- struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15852
- ggml_build_forward_impl(cgraph, tensor, false);
15203
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15204
+ return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15205
+ }
15206
+
15207
+ struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15208
+ const size_t obj_size = sizeof(struct ggml_cgraph);
15209
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15210
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15211
+
15212
+ *cgraph = (struct ggml_cgraph) {
15213
+ /*.size =*/ 0,
15214
+ /*.n_nodes =*/ i1 - i0,
15215
+ /*.n_leafs =*/ 0,
15216
+ /*.nodes =*/ cgraph0->nodes + i0,
15217
+ /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
15218
+ /*.leafs =*/ NULL,
15219
+ /*.hash_table =*/ { 0, NULL },
15220
+ /*.order =*/ cgraph0->order,
15221
+ /*.perf_runs =*/ 0,
15222
+ /*.perf_cycles =*/ 0,
15223
+ /*.perf_time_us =*/ 0,
15224
+ };
15225
+
15853
15226
  return cgraph;
15854
15227
  }
15855
15228
 
15856
- size_t ggml_graph_overhead(void) {
15857
- return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15229
+ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
15230
+ GGML_ASSERT(dst->size >= src->n_leafs);
15231
+ GGML_ASSERT(dst->size >= src->n_nodes);
15232
+ GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
15233
+
15234
+ dst->n_leafs = src->n_leafs;
15235
+ dst->n_nodes = src->n_nodes;
15236
+ dst->order = src->order;
15237
+
15238
+ for (int i = 0; i < src->n_leafs; ++i) {
15239
+ dst->leafs[i] = src->leafs[i];
15240
+ }
15241
+
15242
+ for (int i = 0; i < src->n_nodes; ++i) {
15243
+ dst->nodes[i] = src->nodes[i];
15244
+ }
15245
+
15246
+ if (src->grads) {
15247
+ GGML_ASSERT(dst->grads != NULL);
15248
+ for (int i = 0; i < src->n_nodes; ++i) {
15249
+ dst->grads[i] = src->grads[i];
15250
+ }
15251
+ }
15252
+
15253
+ for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
15254
+ if (src->visited_hash_table.keys[i]) {
15255
+ ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
15256
+ }
15257
+ }
15258
+ }
15259
+
15260
+ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
15261
+ struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
15262
+ ggml_graph_cpy(cgraph, result);
15263
+ return result;
15264
+ }
15265
+
15266
+ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
15267
+ GGML_ASSERT(cgraph->grads != NULL);
15268
+
15269
+ for (int i = 0; i < cgraph->n_nodes; i++) {
15270
+ struct ggml_tensor * grad = cgraph->grads[i];
15271
+
15272
+ if (grad) {
15273
+ ggml_set_zero(grad);
15274
+ }
15275
+ }
15276
+ }
15277
+
15278
+ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
15279
+ cgraph->n_leafs = 0;
15280
+ cgraph->n_nodes = 0;
15281
+ memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
15858
15282
  }
15859
15283
 
15860
15284
  //
@@ -15966,45 +15390,266 @@ static void clear_numa_thread_affinity(void) {
15966
15390
  strerror(rv));
15967
15391
  }
15968
15392
 
15969
- CPU_FREE(cpus);
15970
- }
15971
- #else
15972
- // TODO: Windows etc.
15973
- // (the linux implementation may also work on BSD, someone should test)
15974
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15975
- static void clear_numa_thread_affinity(void) {}
15976
- #endif
15977
-
15978
- struct ggml_compute_state_shared {
15979
- const struct ggml_cgraph * cgraph;
15980
- const struct ggml_cplan * cplan;
15981
-
15982
- int64_t perf_node_start_cycles;
15983
- int64_t perf_node_start_time_us;
15984
-
15985
- const int n_threads;
15986
-
15987
- // synchronization primitives
15988
- atomic_int n_active; // num active threads
15989
- atomic_int node_n; // active graph node
15990
-
15991
- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15992
- void * abort_callback_data;
15993
- };
15994
-
15995
- struct ggml_compute_state {
15996
- ggml_thread_t thrd;
15997
- int ith;
15998
- struct ggml_compute_state_shared * shared;
15999
- };
16000
-
16001
- static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16002
- int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16003
- int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15393
+ CPU_FREE(cpus);
15394
+ }
15395
+ #else
15396
+ // TODO: Windows etc.
15397
+ // (the linux implementation may also work on BSD, someone should test)
15398
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15399
+ static void clear_numa_thread_affinity(void) {}
15400
+ #endif
15401
+
15402
+ struct ggml_compute_state_shared {
15403
+ const struct ggml_cgraph * cgraph;
15404
+ const struct ggml_cplan * cplan;
15405
+
15406
+ int64_t perf_node_start_cycles;
15407
+ int64_t perf_node_start_time_us;
15408
+
15409
+ const int n_threads;
15410
+
15411
+ // synchronization primitives
15412
+ atomic_int n_active; // num active threads
15413
+ atomic_int node_n; // active graph node
15414
+
15415
+ bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15416
+ void * abort_callback_data;
15417
+ };
15418
+
15419
+ struct ggml_compute_state {
15420
+ ggml_thread_t thrd;
15421
+ int ith;
15422
+ struct ggml_compute_state_shared * shared;
15423
+ };
15424
+
15425
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
15426
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
15427
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15428
+
15429
+ node->perf_runs++;
15430
+ node->perf_cycles += cycles_cur;
15431
+ node->perf_time_us += time_us_cur;
15432
+ }
15433
+
15434
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15435
+ int n_tasks = 0;
15436
+
15437
+ switch (node->op) {
15438
+ case GGML_OP_CPY:
15439
+ case GGML_OP_DUP:
15440
+ case GGML_OP_ADD:
15441
+ case GGML_OP_ADD1:
15442
+ case GGML_OP_ACC:
15443
+ {
15444
+ n_tasks = n_threads;
15445
+ } break;
15446
+ case GGML_OP_SUB:
15447
+ case GGML_OP_DIV:
15448
+ case GGML_OP_SQR:
15449
+ case GGML_OP_SQRT:
15450
+ case GGML_OP_LOG:
15451
+ case GGML_OP_SUM:
15452
+ case GGML_OP_SUM_ROWS:
15453
+ case GGML_OP_MEAN:
15454
+ case GGML_OP_ARGMAX:
15455
+ case GGML_OP_REPEAT:
15456
+ case GGML_OP_REPEAT_BACK:
15457
+ {
15458
+ n_tasks = 1;
15459
+ } break;
15460
+ case GGML_OP_UNARY:
15461
+ switch (ggml_get_unary_op(node)) {
15462
+ case GGML_UNARY_OP_ABS:
15463
+ case GGML_UNARY_OP_SGN:
15464
+ case GGML_UNARY_OP_NEG:
15465
+ case GGML_UNARY_OP_STEP:
15466
+ case GGML_UNARY_OP_TANH:
15467
+ case GGML_UNARY_OP_ELU:
15468
+ case GGML_UNARY_OP_RELU:
15469
+ case GGML_UNARY_OP_LEAKY:
15470
+ {
15471
+ n_tasks = 1;
15472
+ } break;
15473
+
15474
+ case GGML_UNARY_OP_GELU:
15475
+ case GGML_UNARY_OP_GELU_QUICK:
15476
+ case GGML_UNARY_OP_SILU:
15477
+ {
15478
+ n_tasks = n_threads;
15479
+ } break;
15480
+ }
15481
+ break;
15482
+ case GGML_OP_SILU_BACK:
15483
+ case GGML_OP_MUL:
15484
+ case GGML_OP_NORM:
15485
+ case GGML_OP_RMS_NORM:
15486
+ case GGML_OP_RMS_NORM_BACK:
15487
+ case GGML_OP_GROUP_NORM:
15488
+ case GGML_OP_CONCAT:
15489
+ {
15490
+ n_tasks = n_threads;
15491
+ } break;
15492
+ case GGML_OP_MUL_MAT:
15493
+ {
15494
+ n_tasks = n_threads;
15495
+
15496
+ // TODO: use different scheduling for different matrix sizes
15497
+ //const int nr0 = ggml_nrows(node->src[0]);
15498
+ //const int nr1 = ggml_nrows(node->src[1]);
15499
+
15500
+ //n_tasks = MIN(n_threads, MAX(1, nr0/128));
15501
+ //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
15502
+
15503
+ #if defined(GGML_USE_CUBLAS)
15504
+ if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
15505
+ n_tasks = 1; // TODO: this actually is doing nothing
15506
+ // the threads are still spinning
15507
+ }
15508
+ #elif defined(GGML_USE_CLBLAST)
15509
+ if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
15510
+ n_tasks = 1; // TODO: this actually is doing nothing
15511
+ // the threads are still spinning
15512
+ }
15513
+ #endif
15514
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
15515
+ if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
15516
+ n_tasks = 1; // TODO: this actually is doing nothing
15517
+ // the threads are still spinning
15518
+ }
15519
+ #endif
15520
+ } break;
15521
+ case GGML_OP_OUT_PROD:
15522
+ {
15523
+ n_tasks = n_threads;
15524
+ } break;
15525
+ case GGML_OP_SCALE:
15526
+ case GGML_OP_SET:
15527
+ case GGML_OP_CONT:
15528
+ case GGML_OP_RESHAPE:
15529
+ case GGML_OP_VIEW:
15530
+ case GGML_OP_PERMUTE:
15531
+ case GGML_OP_TRANSPOSE:
15532
+ case GGML_OP_GET_ROWS:
15533
+ case GGML_OP_GET_ROWS_BACK:
15534
+ case GGML_OP_DIAG:
15535
+ {
15536
+ n_tasks = 1;
15537
+ } break;
15538
+ case GGML_OP_DIAG_MASK_ZERO:
15539
+ case GGML_OP_DIAG_MASK_INF:
15540
+ case GGML_OP_SOFT_MAX:
15541
+ case GGML_OP_SOFT_MAX_BACK:
15542
+ case GGML_OP_ROPE:
15543
+ case GGML_OP_ROPE_BACK:
15544
+ case GGML_OP_ADD_REL_POS:
15545
+ {
15546
+ n_tasks = n_threads;
15547
+ } break;
15548
+ case GGML_OP_ALIBI:
15549
+ {
15550
+ n_tasks = 1; //TODO
15551
+ } break;
15552
+ case GGML_OP_CLAMP:
15553
+ {
15554
+ n_tasks = 1; //TODO
15555
+ } break;
15556
+ case GGML_OP_CONV_TRANSPOSE_1D:
15557
+ {
15558
+ n_tasks = n_threads;
15559
+ } break;
15560
+ case GGML_OP_IM2COL:
15561
+ {
15562
+ n_tasks = n_threads;
15563
+ } break;
15564
+ case GGML_OP_CONV_TRANSPOSE_2D:
15565
+ {
15566
+ n_tasks = n_threads;
15567
+ } break;
15568
+ case GGML_OP_POOL_1D:
15569
+ case GGML_OP_POOL_2D:
15570
+ {
15571
+ n_tasks = 1;
15572
+ } break;
15573
+ case GGML_OP_UPSCALE:
15574
+ {
15575
+ n_tasks = n_threads;
15576
+ } break;
15577
+ case GGML_OP_FLASH_ATTN:
15578
+ {
15579
+ n_tasks = n_threads;
15580
+ } break;
15581
+ case GGML_OP_FLASH_FF:
15582
+ {
15583
+ n_tasks = n_threads;
15584
+ } break;
15585
+ case GGML_OP_FLASH_ATTN_BACK:
15586
+ {
15587
+ n_tasks = n_threads;
15588
+ } break;
15589
+ case GGML_OP_WIN_PART:
15590
+ case GGML_OP_WIN_UNPART:
15591
+ case GGML_OP_GET_REL_POS:
15592
+ case GGML_OP_MAP_UNARY:
15593
+ case GGML_OP_MAP_BINARY:
15594
+ case GGML_OP_MAP_CUSTOM1_F32:
15595
+ case GGML_OP_MAP_CUSTOM2_F32:
15596
+ case GGML_OP_MAP_CUSTOM3_F32:
15597
+ {
15598
+ n_tasks = 1;
15599
+ } break;
15600
+ case GGML_OP_MAP_CUSTOM1:
15601
+ {
15602
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
15603
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15604
+ n_tasks = n_threads;
15605
+ } else {
15606
+ n_tasks = MIN(p->n_tasks, n_threads);
15607
+ }
15608
+ } break;
15609
+ case GGML_OP_MAP_CUSTOM2:
15610
+ {
15611
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
15612
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15613
+ n_tasks = n_threads;
15614
+ } else {
15615
+ n_tasks = MIN(p->n_tasks, n_threads);
15616
+ }
15617
+ } break;
15618
+ case GGML_OP_MAP_CUSTOM3:
15619
+ {
15620
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
15621
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15622
+ n_tasks = n_threads;
15623
+ } else {
15624
+ n_tasks = MIN(p->n_tasks, n_threads);
15625
+ }
15626
+ } break;
15627
+ case GGML_OP_CROSS_ENTROPY_LOSS:
15628
+ {
15629
+ n_tasks = n_threads;
15630
+ } break;
15631
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
15632
+ {
15633
+ n_tasks = n_threads;
15634
+ } break;
15635
+ case GGML_OP_NONE:
15636
+ {
15637
+ n_tasks = 1;
15638
+ } break;
15639
+ case GGML_OP_COUNT:
15640
+ {
15641
+ GGML_ASSERT(false);
15642
+ } break;
15643
+ default:
15644
+ {
15645
+ printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
15646
+ GGML_ASSERT(false);
15647
+ } break;
15648
+ }
15649
+
15650
+ assert(n_tasks > 0);
16004
15651
 
16005
- node->perf_runs++;
16006
- node->perf_cycles += cycles_cur;
16007
- node->perf_time_us += time_us_cur;
15652
+ return n_tasks;
16008
15653
  }
16009
15654
 
16010
15655
  static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -16013,7 +15658,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16013
15658
  const struct ggml_cgraph * cgraph = state->shared->cgraph;
16014
15659
  const struct ggml_cplan * cplan = state->shared->cplan;
16015
15660
 
16016
- const int * n_tasks_arr = cplan->n_tasks;
16017
15661
  const int n_threads = state->shared->n_threads;
16018
15662
 
16019
15663
  set_numa_thread_affinity(state->ith, n_threads);
@@ -16038,9 +15682,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16038
15682
 
16039
15683
  if (node_n != -1) {
16040
15684
  /* FINALIZE */
16041
- struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
15685
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16042
15686
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16043
- params.nth = n_tasks_arr[node_n];
15687
+ params.nth = ggml_get_n_tasks(node, n_threads);
16044
15688
  ggml_compute_forward(&params, node);
16045
15689
  }
16046
15690
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -16051,7 +15695,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16051
15695
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16052
15696
 
16053
15697
  struct ggml_tensor * node = cgraph->nodes[node_n];
16054
- const int n_tasks = n_tasks_arr[node_n];
15698
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16055
15699
 
16056
15700
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16057
15701
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -16109,7 +15753,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16109
15753
 
16110
15754
  /* COMPUTE */
16111
15755
  struct ggml_tensor * node = cgraph->nodes[node_n];
16112
- const int n_tasks = n_tasks_arr[node_n];
15756
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16113
15757
 
16114
15758
  struct ggml_compute_params params = {
16115
15759
  /*.type =*/ GGML_TASK_COMPUTE,
@@ -16143,121 +15787,46 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16143
15787
 
16144
15788
  struct ggml_tensor * node = cgraph->nodes[i];
16145
15789
 
15790
+ size_t cur = 0;
15791
+
16146
15792
  switch (node->op) {
16147
15793
  case GGML_OP_CPY:
16148
15794
  case GGML_OP_DUP:
16149
15795
  {
16150
15796
  n_tasks = n_threads;
16151
15797
 
16152
- size_t cur = 0;
16153
15798
  if (ggml_is_quantized(node->type)) {
16154
15799
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16155
15800
  }
16156
-
16157
- work_size = MAX(work_size, cur);
16158
15801
  } break;
16159
15802
  case GGML_OP_ADD:
16160
15803
  case GGML_OP_ADD1:
16161
15804
  {
16162
15805
  n_tasks = n_threads;
16163
15806
 
16164
- size_t cur = 0;
16165
-
16166
15807
  if (ggml_is_quantized(node->src[0]->type)) {
16167
15808
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16168
15809
  }
16169
-
16170
- work_size = MAX(work_size, cur);
16171
15810
  } break;
16172
15811
  case GGML_OP_ACC:
16173
15812
  {
16174
15813
  n_tasks = n_threads;
16175
15814
 
16176
- size_t cur = 0;
16177
-
16178
15815
  if (ggml_is_quantized(node->src[0]->type)) {
16179
15816
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16180
15817
  }
16181
-
16182
- work_size = MAX(work_size, cur);
16183
- } break;
16184
- case GGML_OP_SUB:
16185
- case GGML_OP_DIV:
16186
- case GGML_OP_SQR:
16187
- case GGML_OP_SQRT:
16188
- case GGML_OP_LOG:
16189
- case GGML_OP_SUM:
16190
- case GGML_OP_SUM_ROWS:
16191
- case GGML_OP_MEAN:
16192
- case GGML_OP_ARGMAX:
16193
- case GGML_OP_REPEAT:
16194
- case GGML_OP_REPEAT_BACK:
16195
- {
16196
- n_tasks = 1;
16197
- } break;
16198
-
16199
- case GGML_OP_UNARY:
16200
- {
16201
- switch (ggml_get_unary_op(node)) {
16202
- case GGML_UNARY_OP_ABS:
16203
- case GGML_UNARY_OP_SGN:
16204
- case GGML_UNARY_OP_NEG:
16205
- case GGML_UNARY_OP_STEP:
16206
- case GGML_UNARY_OP_TANH:
16207
- case GGML_UNARY_OP_ELU:
16208
- case GGML_UNARY_OP_RELU:
16209
- {
16210
- n_tasks = 1;
16211
- } break;
16212
-
16213
- case GGML_UNARY_OP_GELU:
16214
- case GGML_UNARY_OP_GELU_QUICK:
16215
- case GGML_UNARY_OP_SILU:
16216
- {
16217
- n_tasks = n_threads;
16218
- } break;
16219
- }
16220
15818
  } break;
16221
- case GGML_OP_SILU_BACK:
16222
- case GGML_OP_MUL:
16223
- case GGML_OP_NORM:
16224
- case GGML_OP_RMS_NORM:
16225
- case GGML_OP_RMS_NORM_BACK:
16226
- case GGML_OP_GROUP_NORM:
16227
- {
16228
- n_tasks = n_threads;
16229
- } break;
16230
- case GGML_OP_CONCAT:
16231
15819
  case GGML_OP_MUL_MAT:
16232
15820
  {
16233
- n_tasks = n_threads;
16234
-
16235
- // TODO: use different scheduling for different matrix sizes
16236
- //const int nr0 = ggml_nrows(node->src[0]);
16237
- //const int nr1 = ggml_nrows(node->src[1]);
16238
-
16239
- //n_tasks = MIN(n_threads, MAX(1, nr0/128));
16240
- //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
16241
-
16242
- size_t cur = 0;
16243
15821
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
16244
15822
 
16245
- #if defined(GGML_USE_CUBLAS)
16246
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
16247
- n_tasks = 1; // TODO: this actually is doing nothing
16248
- // the threads are still spinning
16249
- } else
16250
- #elif defined(GGML_USE_CLBLAST)
15823
+ #if defined(GGML_USE_CLBLAST)
16251
15824
  if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
16252
- n_tasks = 1; // TODO: this actually is doing nothing
16253
- // the threads are still spinning
16254
15825
  cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
16255
15826
  } else
16256
15827
  #endif
16257
15828
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16258
15829
  if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16259
- n_tasks = 1; // TODO: this actually is doing nothing
16260
- // the threads are still spinning
16261
15830
  if (node->src[0]->type != GGML_TYPE_F32) {
16262
15831
  // here we need memory just for single 2D matrix from src0
16263
15832
  cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -16266,108 +15835,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16266
15835
  #endif
16267
15836
  if (node->src[1]->type != vec_dot_type) {
16268
15837
  cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16269
- } else {
16270
- cur = 0;
16271
15838
  }
16272
-
16273
- work_size = MAX(work_size, cur);
16274
15839
  } break;
16275
15840
  case GGML_OP_OUT_PROD:
16276
15841
  {
16277
15842
  n_tasks = n_threads;
16278
15843
 
16279
- size_t cur = 0;
16280
-
16281
15844
  if (ggml_is_quantized(node->src[0]->type)) {
16282
15845
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16283
15846
  }
16284
-
16285
- work_size = MAX(work_size, cur);
16286
- } break;
16287
- case GGML_OP_SCALE:
16288
- {
16289
- n_tasks = 1;
16290
- } break;
16291
- case GGML_OP_SET:
16292
- case GGML_OP_CONT:
16293
- case GGML_OP_RESHAPE:
16294
- case GGML_OP_VIEW:
16295
- case GGML_OP_PERMUTE:
16296
- case GGML_OP_TRANSPOSE:
16297
- case GGML_OP_GET_ROWS:
16298
- case GGML_OP_GET_ROWS_BACK:
16299
- case GGML_OP_DIAG:
16300
- {
16301
- n_tasks = 1;
16302
- } break;
16303
- case GGML_OP_DIAG_MASK_ZERO:
16304
- case GGML_OP_DIAG_MASK_INF:
16305
- case GGML_OP_SOFT_MAX:
16306
- case GGML_OP_SOFT_MAX_BACK:
16307
- case GGML_OP_ROPE:
16308
- case GGML_OP_ROPE_BACK:
16309
- case GGML_OP_ADD_REL_POS:
16310
- {
16311
- n_tasks = n_threads;
16312
- } break;
16313
- case GGML_OP_ALIBI:
16314
- {
16315
- n_tasks = 1; //TODO
16316
- } break;
16317
- case GGML_OP_CLAMP:
16318
- {
16319
- n_tasks = 1; //TODO
16320
- } break;
16321
- case GGML_OP_CONV_1D:
16322
- {
16323
- n_tasks = n_threads;
16324
-
16325
- GGML_ASSERT(node->src[0]->ne[3] == 1);
16326
- GGML_ASSERT(node->src[1]->ne[2] == 1);
16327
- GGML_ASSERT(node->src[1]->ne[3] == 1);
16328
-
16329
- const int64_t ne00 = node->src[0]->ne[0];
16330
- const int64_t ne01 = node->src[0]->ne[1];
16331
- const int64_t ne02 = node->src[0]->ne[2];
16332
-
16333
- const int64_t ne10 = node->src[1]->ne[0];
16334
- const int64_t ne11 = node->src[1]->ne[1];
16335
-
16336
- const int64_t ne0 = node->ne[0];
16337
- const int64_t ne1 = node->ne[1];
16338
- const int64_t nk = ne00;
16339
- const int64_t ew0 = nk * ne01;
16340
-
16341
- UNUSED(ne02);
16342
- UNUSED(ne10);
16343
- UNUSED(ne11);
16344
-
16345
- size_t cur = 0;
16346
-
16347
- if (node->src[0]->type == GGML_TYPE_F16 &&
16348
- node->src[1]->type == GGML_TYPE_F32) {
16349
- cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
16350
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16351
- node->src[1]->type == GGML_TYPE_F32) {
16352
- cur = sizeof(float)*(ne0*ne1*ew0);
16353
- } else {
16354
- GGML_ASSERT(false);
16355
- }
16356
-
16357
- work_size = MAX(work_size, cur);
16358
- } break;
16359
- case GGML_OP_CONV_1D_STAGE_0:
16360
- {
16361
- n_tasks = n_threads;
16362
- } break;
16363
- case GGML_OP_CONV_1D_STAGE_1:
16364
- {
16365
- n_tasks = n_threads;
16366
15847
  } break;
16367
15848
  case GGML_OP_CONV_TRANSPOSE_1D:
16368
15849
  {
16369
- n_tasks = n_threads;
16370
-
16371
15850
  GGML_ASSERT(node->src[0]->ne[3] == 1);
16372
15851
  GGML_ASSERT(node->src[1]->ne[2] == 1);
16373
15852
  GGML_ASSERT(node->src[1]->ne[3] == 1);
@@ -16379,7 +15858,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16379
15858
  const int64_t ne10 = node->src[1]->ne[0]; // L
16380
15859
  const int64_t ne11 = node->src[1]->ne[1]; // Cin
16381
15860
 
16382
- size_t cur = 0;
16383
15861
  if (node->src[0]->type == GGML_TYPE_F16 &&
16384
15862
  node->src[1]->type == GGML_TYPE_F32) {
16385
15863
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
@@ -16391,59 +15869,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16391
15869
  } else {
16392
15870
  GGML_ASSERT(false);
16393
15871
  }
16394
-
16395
- work_size = MAX(work_size, cur);
16396
- } break;
16397
- case GGML_OP_CONV_2D:
16398
- {
16399
- n_tasks = n_threads;
16400
-
16401
- const int64_t ne00 = node->src[0]->ne[0]; // W
16402
- const int64_t ne01 = node->src[0]->ne[1]; // H
16403
- const int64_t ne02 = node->src[0]->ne[2]; // C
16404
- const int64_t ne03 = node->src[0]->ne[3]; // N
16405
-
16406
- const int64_t ne10 = node->src[1]->ne[0]; // W
16407
- const int64_t ne11 = node->src[1]->ne[1]; // H
16408
- const int64_t ne12 = node->src[1]->ne[2]; // C
16409
-
16410
- const int64_t ne0 = node->ne[0];
16411
- const int64_t ne1 = node->ne[1];
16412
- const int64_t ne2 = node->ne[2];
16413
- const int64_t ne3 = node->ne[3];
16414
- const int64_t nk = ne00*ne01;
16415
- const int64_t ew0 = nk * ne02;
16416
-
16417
- UNUSED(ne03);
16418
- UNUSED(ne2);
16419
-
16420
- size_t cur = 0;
16421
-
16422
- if (node->src[0]->type == GGML_TYPE_F16 &&
16423
- node->src[1]->type == GGML_TYPE_F32) {
16424
- // im2col: [N*OH*OW, IC*KH*KW]
16425
- cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
16426
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16427
- node->src[1]->type == GGML_TYPE_F32) {
16428
- cur = sizeof(float)* (ne10*ne11*ne12);
16429
- } else {
16430
- GGML_ASSERT(false);
16431
- }
16432
-
16433
- work_size = MAX(work_size, cur);
16434
- } break;
16435
- case GGML_OP_CONV_2D_STAGE_0:
16436
- {
16437
- n_tasks = n_threads;
16438
15872
  } break;
16439
- case GGML_OP_CONV_2D_STAGE_1:
15873
+ case GGML_OP_IM2COL:
16440
15874
  {
16441
15875
  n_tasks = n_threads;
16442
15876
  } break;
16443
15877
  case GGML_OP_CONV_TRANSPOSE_2D:
16444
15878
  {
16445
- n_tasks = n_threads;
16446
-
16447
15879
  const int64_t ne00 = node->src[0]->ne[0]; // W
16448
15880
  const int64_t ne01 = node->src[0]->ne[1]; // H
16449
15881
  const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
@@ -16453,141 +15885,66 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16453
15885
  const int64_t ne11 = node->src[1]->ne[1]; // H
16454
15886
  const int64_t ne12 = node->src[1]->ne[2]; // Channels In
16455
15887
 
16456
- size_t cur = 0;
16457
15888
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
16458
15889
  cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
16459
-
16460
- work_size = MAX(work_size, cur);
16461
- } break;
16462
- case GGML_OP_POOL_1D:
16463
- case GGML_OP_POOL_2D:
16464
- {
16465
- n_tasks = 1;
16466
- } break;
16467
- case GGML_OP_UPSCALE:
16468
- {
16469
- n_tasks = n_threads;
16470
15890
  } break;
16471
15891
  case GGML_OP_FLASH_ATTN:
16472
15892
  {
16473
15893
  n_tasks = n_threads;
16474
15894
 
16475
- size_t cur = 0;
16476
-
16477
15895
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16478
15896
 
16479
15897
  if (node->src[1]->type == GGML_TYPE_F32) {
16480
15898
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16481
15899
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16482
- }
16483
-
16484
- if (node->src[1]->type == GGML_TYPE_F16) {
15900
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16485
15901
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16486
15902
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16487
15903
  }
16488
-
16489
- work_size = MAX(work_size, cur);
16490
15904
  } break;
16491
15905
  case GGML_OP_FLASH_FF:
16492
15906
  {
16493
15907
  n_tasks = n_threads;
16494
15908
 
16495
- size_t cur = 0;
16496
-
16497
15909
  if (node->src[1]->type == GGML_TYPE_F32) {
16498
15910
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16499
15911
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16500
- }
16501
-
16502
- if (node->src[1]->type == GGML_TYPE_F16) {
15912
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16503
15913
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16504
15914
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16505
15915
  }
16506
-
16507
- work_size = MAX(work_size, cur);
16508
15916
  } break;
16509
15917
  case GGML_OP_FLASH_ATTN_BACK:
16510
15918
  {
16511
15919
  n_tasks = n_threads;
16512
15920
 
16513
- size_t cur = 0;
16514
-
16515
15921
  const int64_t D = node->src[0]->ne[0];
16516
15922
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16517
15923
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
16518
15924
  if (node->src[1]->type == GGML_TYPE_F32) {
16519
15925
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16520
15926
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16521
- }
16522
-
16523
- if (node->src[1]->type == GGML_TYPE_F16) {
15927
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16524
15928
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16525
15929
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16526
15930
  }
16527
-
16528
- work_size = MAX(work_size, cur);
16529
- } break;
16530
- case GGML_OP_WIN_PART:
16531
- case GGML_OP_WIN_UNPART:
16532
- case GGML_OP_GET_REL_POS:
16533
- case GGML_OP_MAP_UNARY:
16534
- case GGML_OP_MAP_BINARY:
16535
- case GGML_OP_MAP_CUSTOM1_F32:
16536
- case GGML_OP_MAP_CUSTOM2_F32:
16537
- case GGML_OP_MAP_CUSTOM3_F32:
16538
- {
16539
- n_tasks = 1;
16540
- } break;
16541
- case GGML_OP_MAP_CUSTOM1:
16542
- {
16543
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
16544
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16545
- n_tasks = n_threads;
16546
- } else {
16547
- n_tasks = MIN(p->n_tasks, n_threads);
16548
- }
16549
- } break;
16550
- case GGML_OP_MAP_CUSTOM2:
16551
- {
16552
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
16553
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16554
- n_tasks = n_threads;
16555
- } else {
16556
- n_tasks = MIN(p->n_tasks, n_threads);
16557
- }
16558
- } break;
16559
- case GGML_OP_MAP_CUSTOM3:
16560
- {
16561
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
16562
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16563
- n_tasks = n_threads;
16564
- } else {
16565
- n_tasks = MIN(p->n_tasks, n_threads);
16566
- }
16567
15931
  } break;
15932
+
16568
15933
  case GGML_OP_CROSS_ENTROPY_LOSS:
16569
15934
  {
16570
15935
  n_tasks = n_threads;
16571
15936
 
16572
- size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16573
-
16574
- work_size = MAX(work_size, cur);
16575
- } break;
16576
- case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16577
- {
16578
- n_tasks = n_threads;
16579
- } break;
16580
- case GGML_OP_NONE:
16581
- {
16582
- n_tasks = 1;
15937
+ cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16583
15938
  } break;
16584
15939
  case GGML_OP_COUNT:
16585
15940
  {
16586
15941
  GGML_ASSERT(false);
16587
15942
  } break;
15943
+ default:
15944
+ break;
16588
15945
  }
16589
15946
 
16590
- cplan.n_tasks[i] = n_tasks;
15947
+ work_size = MAX(work_size, cur);
16591
15948
  }
16592
15949
 
16593
15950
  if (work_size > 0) {
@@ -16609,12 +15966,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16609
15966
  if (cplan->work_size > 0) {
16610
15967
  GGML_ASSERT(cplan->work_data);
16611
15968
  }
16612
-
16613
- for (int i = 0; i < cgraph->n_nodes; ++i) {
16614
- if (cgraph->nodes[i]->op != GGML_OP_NONE) {
16615
- GGML_ASSERT(cplan->n_tasks[i] > 0);
16616
- }
16617
- }
16618
15969
  }
16619
15970
 
16620
15971
  const int n_threads = cplan->n_threads;
@@ -16687,16 +16038,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16687
16038
  return compute_status;
16688
16039
  }
16689
16040
 
16690
- void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16691
- for (int i = 0; i < cgraph->n_nodes; i++) {
16692
- struct ggml_tensor * grad = cgraph->grads[i];
16693
-
16694
- if (grad) {
16695
- ggml_set_zero(grad);
16696
- }
16697
- }
16698
- }
16699
-
16700
16041
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16701
16042
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16702
16043
 
@@ -16823,12 +16164,12 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16823
16164
  const uint32_t magic = GGML_FILE_MAGIC;
16824
16165
  const uint32_t version = GGML_FILE_VERSION;
16825
16166
  const uint32_t n_leafs = cgraph->n_leafs;
16826
- const uint32_t nodes = cgraph->n_nodes;
16167
+ const uint32_t n_nodes = cgraph->n_nodes;
16827
16168
 
16828
16169
  fwrite(&magic, sizeof(uint32_t), 1, fout);
16829
16170
  fwrite(&version, sizeof(uint32_t), 1, fout);
16830
16171
  fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
16831
- fwrite(&nodes, sizeof(uint32_t), 1, fout);
16172
+ fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
16832
16173
  fwrite(&size_eval, sizeof(uint64_t), 1, fout);
16833
16174
  }
16834
16175
 
@@ -16916,7 +16257,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16916
16257
  if (idx == -1) {
16917
16258
  for (int k = 0; k < cgraph->n_nodes; ++k) {
16918
16259
  if (args[j] == cgraph->nodes[k]) {
16919
- idx = GGML_MAX_NODES + k;
16260
+ idx = cgraph->n_leafs + k;
16920
16261
  break;
16921
16262
  }
16922
16263
  }
@@ -16943,11 +16284,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16943
16284
  }
16944
16285
  }
16945
16286
 
16946
- struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16287
+ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16947
16288
  assert(*ctx_data == NULL);
16948
16289
  assert(*ctx_eval == NULL);
16949
16290
 
16950
- struct ggml_cgraph result = { 0 };
16291
+ struct ggml_cgraph * result = NULL;
16951
16292
 
16952
16293
  struct ggml_tensor * data = NULL;
16953
16294
 
@@ -17019,13 +16360,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17019
16360
  const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
17020
16361
  const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
17021
16362
  const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
17022
-
17023
- result.n_leafs = n_leafs;
17024
- result.n_nodes = n_nodes;
16363
+ const int graph_size = MAX(n_leafs, n_nodes);
17025
16364
 
17026
16365
  // create the data context
17027
16366
  {
17028
- const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
16367
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
17029
16368
 
17030
16369
  struct ggml_init_params params = {
17031
16370
  .mem_size = size_eval + overhead,
@@ -17041,6 +16380,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17041
16380
  }
17042
16381
  }
17043
16382
 
16383
+ result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
16384
+
16385
+ result->n_leafs = n_leafs;
16386
+ result->n_nodes = n_nodes;
16387
+
16388
+
17044
16389
  // leafs
17045
16390
  {
17046
16391
  uint32_t type;
@@ -17079,7 +16424,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17079
16424
  tensor->nb[j] = nb[j];
17080
16425
  }
17081
16426
 
17082
- result.leafs[i] = tensor;
16427
+ result->leafs[i] = tensor;
17083
16428
 
17084
16429
  ptr += ggml_nbytes(tensor);
17085
16430
 
@@ -17131,10 +16476,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17131
16476
  continue;
17132
16477
  }
17133
16478
 
17134
- if (arg_idx < GGML_MAX_NODES) {
17135
- args[j] = result.leafs[arg_idx];
16479
+ if (arg_idx < result->n_leafs) {
16480
+ args[j] = result->leafs[arg_idx];
17136
16481
  } else {
17137
- args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
16482
+ args[j] = result->nodes[arg_idx - result->n_leafs];
17138
16483
  }
17139
16484
  }
17140
16485
 
@@ -17186,7 +16531,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17186
16531
  tensor->src[j] = args[j];
17187
16532
  }
17188
16533
 
17189
- result.nodes[i] = tensor;
16534
+ result->nodes[i] = tensor;
17190
16535
 
17191
16536
  fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17192
16537
  }
@@ -18091,10 +17436,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18091
17436
  case GGML_OPT_ADAM:
18092
17437
  {
18093
17438
  result = (struct ggml_opt_params) {
18094
- .type = GGML_OPT_ADAM,
18095
- .n_threads = 1,
18096
- .past = 0,
18097
- .delta = 1e-5f,
17439
+ .type = GGML_OPT_ADAM,
17440
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17441
+ .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
17442
+ .past = 0,
17443
+ .delta = 1e-5f,
18098
17444
 
18099
17445
  .max_no_improvement = 100,
18100
17446
 
@@ -18121,10 +17467,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18121
17467
  case GGML_OPT_LBFGS:
18122
17468
  {
18123
17469
  result = (struct ggml_opt_params) {
18124
- .type = GGML_OPT_LBFGS,
18125
- .n_threads = 1,
18126
- .past = 0,
18127
- .delta = 1e-5f,
17470
+ .type = GGML_OPT_LBFGS,
17471
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17472
+ .n_threads = 1,
17473
+ .past = 0,
17474
+ .delta = 1e-5f,
18128
17475
 
18129
17476
  .max_no_improvement = 0,
18130
17477
 
@@ -18266,14 +17613,11 @@ enum ggml_opt_result ggml_opt_resume(
18266
17613
  struct ggml_tensor * f) {
18267
17614
 
18268
17615
  // build forward + backward compute graphs
18269
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18270
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18271
-
18272
- struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
18273
- struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
17616
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
17617
+ ggml_build_forward_expand(gf, f);
18274
17618
 
18275
- *gf = ggml_build_forward (f);
18276
- *gb = ggml_build_backward(ctx, gf, true);
17619
+ struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
17620
+ ggml_build_backward_expand(ctx, gf, gb, true);
18277
17621
 
18278
17622
  return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
18279
17623
  }
@@ -18729,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18729
18073
  {
18730
18074
  ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
18731
18075
 
18732
- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
18076
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
18733
18077
  struct gguf_kv * kv = &ctx->kv[i];
18734
18078
 
18735
18079
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18776,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18776
18120
  case GGUF_TYPE_STRING:
18777
18121
  {
18778
18122
  kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
18779
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
18123
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
18780
18124
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
18781
18125
  }
18782
18126
  } break;
@@ -18804,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18804
18148
  {
18805
18149
  ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
18806
18150
 
18807
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18151
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18808
18152
  struct gguf_tensor_info * info = &ctx->infos[i];
18809
18153
 
18810
18154
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18851,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18851
18195
  // compute the total size of the data section, taking into account the alignment
18852
18196
  {
18853
18197
  ctx->size = 0;
18854
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18198
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18855
18199
  struct gguf_tensor_info * info = &ctx->infos[i];
18856
18200
 
18857
18201
  const int64_t ne =
@@ -18920,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18920
18264
  ggml_set_no_alloc(ctx_data, true);
18921
18265
 
18922
18266
  // create the tensors
18923
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18267
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18924
18268
  const int64_t ne[GGML_MAX_DIMS] = {
18925
18269
  ctx->infos[i].ne[0],
18926
18270
  ctx->infos[i].ne[1],