llama_cpp 0.9.2 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -100,6 +100,49 @@ typedef void * thread_ret_t;
100
100
  #include <hbwmalloc.h>
101
101
  #endif
102
102
 
103
+ #if defined(__APPLE__)
104
+ #include <TargetConditionals.h>
105
+ #endif
106
+
107
+ #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
108
+ (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
109
+
110
+ #include <sys/wait.h>
111
+
112
+ void ggml_print_backtrace(void) {
113
+ /*
114
+ #include <execinfo.h>
115
+ #include <dlfcn.h>
116
+
117
+ void * trace[100];
118
+
119
+ int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
120
+
121
+ backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
122
+ */
123
+
124
+ // backtrace_symbols does not show line numbers, use gdb instead
125
+ char attach[32];
126
+ snprintf(attach, sizeof(attach), "attach %d", getpid());
127
+ int pid = fork();
128
+ if (pid == 0) {
129
+ execlp("gdb", "gdb", "--batch",
130
+ "-ex", "set style enabled on",
131
+ "-ex", attach,
132
+ "-ex", "bt -frame-info source-and-location",
133
+ "-ex", "detach",
134
+ "-ex", "quit",
135
+ NULL);
136
+ } else {
137
+ waitpid(pid, NULL, 0);
138
+ }
139
+ }
140
+ #else
141
+ void ggml_print_backtrace(void) {
142
+ // platform not supported
143
+ }
144
+ #endif
145
+
103
146
  /*#define GGML_PERF*/
104
147
  #define GGML_DEBUG 0
105
148
  #define GGML_GELU_FP16
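
The hunk above adds ggml_print_backtrace(): on Linux/BSD/macOS (but not tvOS/watchOS) it formats an "attach <pid>" command for the current process, forks, and has the child exec gdb --batch to print a source-annotated backtrace while the parent waits; on every other platform it compiles to a no-op. The commented-out execinfo.h variant is kept only as a reference. As an illustration (not part of this diff), a minimal sketch of wiring such a helper into a crash handler, assuming the program links against this ggml build:

    #include <signal.h>
    #include <stdio.h>

    // Provided by ggml.c in this diff; prototype repeated here for the sketch.
    void ggml_print_backtrace(void);

    // Hypothetical crash handler: dump a backtrace via gdb, then re-raise the
    // signal with its default disposition so the process still terminates.
    static void crash_handler(int sig) {
        fprintf(stderr, "fatal signal %d\n", sig);
        ggml_print_backtrace();
        signal(sig, SIG_DFL);
        raise(sig);
    }

    int main(void) {
        signal(SIGSEGV, crash_handler);
        signal(SIGABRT, crash_handler);
        // ... run the ggml workload here ...
        return 0;
    }
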
@@ -228,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
228
271
  // floating point type used to accumulate sums
229
272
  typedef double ggml_float;
230
273
 
274
+ #undef MIN
275
+ #undef MAX
276
+
277
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
278
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
279
+
231
280
  //
232
281
  // global data
233
282
  //
@@ -561,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
561
610
  // simd mappings
562
611
  //
563
612
 
613
+ #if defined(__ARM_NEON)
614
+ #if !defined(__aarch64__)
615
+
616
+ // 64-bit compatibility
617
+
618
+ inline static float vaddvq_f32(float32x4_t v) {
619
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
620
+ }
621
+
622
+ #endif
623
+ #endif
624
+
564
625
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
565
626
  // we then implement the fundamental computation operations below using only these macros
566
627
  // adding support for new architectures requires to define the corresponding SIMD macros
@@ -1352,6 +1413,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1352
1413
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1353
1414
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1354
1415
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1416
+ inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
1355
1417
 
1356
1418
  static const float GELU_COEF_A = 0.044715f;
1357
1419
  static const float GELU_QUICK_COEF = -1.702f;
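
The new ggml_vec_leaky_f32 above is a leaky ReLU with a hard-coded negative slope of 0.1, i.e. f(x) = x for x > 0 and 0.1*x otherwise; it backs the GGML_UNARY_OP_LEAKY / ggml_leaky() entry points added further down in this diff. A small standalone sketch of the same element-wise rule (plain C, independent of ggml, for illustration):

    #include <stdio.h>

    // Leaky ReLU with the fixed 0.1 slope used by ggml_vec_leaky_f32 in this diff.
    static void leaky_f32(int n, float *y, const float *x) {
        for (int i = 0; i < n; ++i) {
            y[i] = (x[i] > 0.0f) ? x[i] : 0.1f * x[i];
        }
    }

    int main(void) {
        const float x[4] = { -2.0f, -0.5f, 0.0f, 3.0f };
        float y[4];
        leaky_f32(4, y, x);
        for (int i = 0; i < 4; ++i) {
            printf("%g -> %g\n", x[i], y[i]);   // -2 -> -0.2, -0.5 -> -0.05, 0 -> 0, 3 -> 3
        }
        return 0;
    }
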
@@ -1572,13 +1634,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1572
1634
  "ROPE_BACK",
1573
1635
  "ALIBI",
1574
1636
  "CLAMP",
1575
- "CONV_1D",
1576
- "CONV_1D_STAGE_0",
1577
- "CONV_1D_STAGE_1",
1578
1637
  "CONV_TRANSPOSE_1D",
1579
- "CONV_2D",
1580
- "CONV_2D_STAGE_0",
1581
- "CONV_2D_STAGE_1",
1638
+ "IM2COL",
1582
1639
  "CONV_TRANSPOSE_2D",
1583
1640
  "POOL_1D",
1584
1641
  "POOL_2D",
@@ -1609,7 +1666,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1609
1666
  "CROSS_ENTROPY_LOSS_BACK",
1610
1667
  };
1611
1668
 
1612
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1669
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1613
1670
 
1614
1671
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1615
1672
  "none",
@@ -1659,13 +1716,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1659
1716
  "rope_back(x)",
1660
1717
  "alibi(x)",
1661
1718
  "clamp(x)",
1662
- "conv_1d(x)",
1663
- "conv_1d_stage_0(x)",
1664
- "conv_1d_stage_1(x)",
1665
1719
  "conv_transpose_1d(x)",
1666
- "conv_2d(x)",
1667
- "conv_2d_stage_0(x)",
1668
- "conv_2d_stage_1(x)",
1720
+ "im2col(x)",
1669
1721
  "conv_transpose_2d(x)",
1670
1722
  "pool_1d(x)",
1671
1723
  "pool_2d(x)",
@@ -1696,7 +1748,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1696
1748
  "cross_entropy_loss_back(x,y)",
1697
1749
  };
1698
1750
 
1699
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1751
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1700
1752
 
1701
1753
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1702
1754
 
@@ -1724,13 +1776,7 @@ static void ggml_setup_op_has_task_pass(void) {
1724
1776
  p[GGML_OP_GET_ROWS_BACK ] = true;
1725
1777
  p[GGML_OP_DIAG_MASK_INF ] = true;
1726
1778
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
1727
- p[GGML_OP_CONV_1D ] = true;
1728
- p[GGML_OP_CONV_1D_STAGE_0 ] = true;
1729
- p[GGML_OP_CONV_1D_STAGE_1 ] = true;
1730
1779
  p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
1731
- p[GGML_OP_CONV_2D ] = true;
1732
- p[GGML_OP_CONV_2D_STAGE_0 ] = true;
1733
- p[GGML_OP_CONV_2D_STAGE_1 ] = true;
1734
1780
  p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
1735
1781
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
1736
1782
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
@@ -3769,6 +3815,14 @@ struct ggml_tensor * ggml_relu_inplace(
3769
3815
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3770
3816
  }
3771
3817
 
3818
+ // ggml_leaky
3819
+
3820
+ struct ggml_tensor * ggml_leaky(
3821
+ struct ggml_context * ctx,
3822
+ struct ggml_tensor * a) {
3823
+ return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3824
+ }
3825
+
3772
3826
  // ggml_gelu
3773
3827
 
3774
3828
  struct ggml_tensor * ggml_gelu(
@@ -5076,82 +5130,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
5076
5130
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
5077
5131
  }
5078
5132
 
5079
- // im2col: [N, IC, IL] => [N, OL, IC*K]
5080
- // a: [OC,IC, K]
5081
- // b: [N, IC, IL]
5082
- // result: [N, OL, IC*K]
5083
- static struct ggml_tensor * ggml_conv_1d_stage_0(
5084
- struct ggml_context * ctx,
5085
- struct ggml_tensor * a,
5086
- struct ggml_tensor * b,
5087
- int s0,
5088
- int p0,
5089
- int d0) {
5090
- GGML_ASSERT(a->ne[1] == b->ne[1]);
5091
- bool is_node = false;
5092
-
5093
- if (a->grad || b->grad) {
5094
- GGML_ASSERT(false); // TODO: implement backward
5095
- is_node = true;
5096
- }
5097
-
5098
- const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5099
-
5100
- const int64_t ne[4] = {
5101
- a->ne[1] * a->ne[0],
5102
- OL,
5103
- b->ne[2],
5104
- 1,
5105
- };
5106
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5107
-
5108
- int32_t params[] = { s0, p0, d0 };
5109
- ggml_set_op_params(result, params, sizeof(params));
5110
-
5111
- result->op = GGML_OP_CONV_1D_STAGE_0;
5112
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5113
- result->src[0] = a;
5114
- result->src[1] = b;
5115
-
5116
- return result;
5117
- }
5118
-
5119
- // ggml_conv_1d_stage_1
5120
-
5121
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
5122
- // a: [OC, IC, K]
5123
- // b: [N, OL, IC * K]
5124
- // result: [N, OC, OL]
5125
- static struct ggml_tensor * ggml_conv_1d_stage_1(
5126
- struct ggml_context * ctx,
5127
- struct ggml_tensor * a,
5128
- struct ggml_tensor * b) {
5129
-
5130
- bool is_node = false;
5131
-
5132
- if (a->grad || b->grad) {
5133
- GGML_ASSERT(false); // TODO: implement backward
5134
- is_node = true;
5135
- }
5136
-
5137
- const int64_t ne[4] = {
5138
- b->ne[1],
5139
- a->ne[2],
5140
- b->ne[2],
5141
- 1,
5142
- };
5143
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5144
-
5145
- result->op = GGML_OP_CONV_1D_STAGE_1;
5146
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5147
- result->src[0] = a;
5148
- result->src[1] = b;
5149
-
5150
- return result;
5151
- }
5152
-
5153
- // ggml_conv_1d
5154
-
5155
5133
  GGML_API struct ggml_tensor * ggml_conv_1d(
5156
5134
  struct ggml_context * ctx,
5157
5135
  struct ggml_tensor * a,
@@ -5159,43 +5137,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5159
5137
  int s0,
5160
5138
  int p0,
5161
5139
  int d0) {
5162
- struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
5163
- result = ggml_conv_1d_stage_1(ctx, a, result);
5164
- return result;
5165
- }
5166
-
5167
- // GGML_API struct ggml_tensor * ggml_conv_1d(
5168
- // struct ggml_context * ctx,
5169
- // struct ggml_tensor * a,
5170
- // struct ggml_tensor * b,
5171
- // int s0,
5172
- // int p0,
5173
- // int d0) {
5174
- // GGML_ASSERT(ggml_is_matrix(b));
5175
- // GGML_ASSERT(a->ne[1] == b->ne[1]);
5176
- // bool is_node = false;
5177
-
5178
- // if (a->grad || b->grad) {
5179
- // GGML_ASSERT(false); // TODO: implement backward
5180
- // is_node = true;
5181
- // }
5140
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5182
5141
 
5183
- // const int64_t ne[4] = {
5184
- // ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
5185
- // a->ne[2], 1, 1,
5186
- // };
5187
- // struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5142
+ struct ggml_tensor * result =
5143
+ ggml_mul_mat(ctx,
5144
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
5145
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
5188
5146
 
5189
- // int32_t params[] = { s0, p0, d0 };
5190
- // ggml_set_op_params(result, params, sizeof(params));
5147
+ result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
5191
5148
 
5192
- // result->op = GGML_OP_CONV_1D;
5193
- // result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5194
- // result->src[0] = a;
5195
- // result->src[1] = b;
5196
-
5197
- // return result;
5198
- // }
5149
+ return result;
5150
+ }
5199
5151
 
5200
5152
  // ggml_conv_1d_ph
5201
5153
 
@@ -5258,7 +5210,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
5258
5210
  // a: [OC,IC, KH, KW]
5259
5211
  // b: [N, IC, IH, IW]
5260
5212
  // result: [N, OH, OW, IC*KH*KW]
5261
- static struct ggml_tensor * ggml_conv_2d_stage_0(
5213
+ struct ggml_tensor * ggml_im2col(
5262
5214
  struct ggml_context * ctx,
5263
5215
  struct ggml_tensor * a,
5264
5216
  struct ggml_tensor * b,
@@ -5267,9 +5219,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5267
5219
  int p0,
5268
5220
  int p1,
5269
5221
  int d0,
5270
- int d1) {
5222
+ int d1,
5223
+ bool is_2D) {
5271
5224
 
5272
- GGML_ASSERT(a->ne[2] == b->ne[2]);
5225
+ if(is_2D) {
5226
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
5227
+ } else {
5228
+ GGML_ASSERT(a->ne[1] == b->ne[1]);
5229
+ }
5273
5230
  bool is_node = false;
5274
5231
 
5275
5232
  if (a->grad || b->grad) {
@@ -5277,81 +5234,51 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5277
5234
  is_node = true;
5278
5235
  }
5279
5236
 
5280
- const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
5281
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5237
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
5238
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5282
5239
 
5283
5240
  const int64_t ne[4] = {
5284
- a->ne[2] * a->ne[1] * a->ne[0],
5241
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
5285
5242
  OW,
5286
- OH,
5287
- b->ne[3],
5243
+ is_2D ? OH : b->ne[2],
5244
+ is_2D ? b->ne[3] : 1,
5288
5245
  };
5289
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5290
5246
 
5291
- int32_t params[] = { s0, s1, p0, p1, d0, d1 };
5247
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5248
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5292
5249
  ggml_set_op_params(result, params, sizeof(params));
5293
5250
 
5294
- result->op = GGML_OP_CONV_2D_STAGE_0;
5295
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5296
- result->src[0] = a;
5297
- result->src[1] = b;
5298
-
5299
- return result;
5300
-
5301
- }
5302
-
5303
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
5304
- // a: [OC, IC, KH, KW]
5305
- // b: [N, OH, OW, IC * KH * KW]
5306
- // result: [N, OC, OH, OW]
5307
- static struct ggml_tensor * ggml_conv_2d_stage_1(
5308
- struct ggml_context * ctx,
5309
- struct ggml_tensor * a,
5310
- struct ggml_tensor * b) {
5311
-
5312
- bool is_node = false;
5313
-
5314
- if (a->grad || b->grad) {
5315
- GGML_ASSERT(false); // TODO: implement backward
5316
- is_node = true;
5317
- }
5318
-
5319
- const int64_t ne[4] = {
5320
- b->ne[1],
5321
- b->ne[2],
5322
- a->ne[3],
5323
- b->ne[3],
5324
- };
5325
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5326
-
5327
- result->op = GGML_OP_CONV_2D_STAGE_1;
5251
+ result->op = GGML_OP_IM2COL;
5328
5252
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5329
5253
  result->src[0] = a;
5330
5254
  result->src[1] = b;
5331
5255
 
5332
5256
  return result;
5333
-
5334
5257
  }
5335
5258
 
5336
5259
  // a: [OC,IC, KH, KW]
5337
5260
  // b: [N, IC, IH, IW]
5338
5261
  // result: [N, OC, OH, OW]
5339
5262
  struct ggml_tensor * ggml_conv_2d(
5340
- struct ggml_context * ctx,
5341
- struct ggml_tensor * a,
5342
- struct ggml_tensor * b,
5343
- int s0,
5344
- int s1,
5345
- int p0,
5346
- int p1,
5347
- int d0,
5348
- int d1) {
5263
+ struct ggml_context * ctx,
5264
+ struct ggml_tensor * a,
5265
+ struct ggml_tensor * b,
5266
+ int s0,
5267
+ int s1,
5268
+ int p0,
5269
+ int p1,
5270
+ int d0,
5271
+ int d1) {
5272
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5349
5273
 
5350
- struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
5351
- result = ggml_conv_2d_stage_1(ctx, a, result);
5274
+ struct ggml_tensor * result =
5275
+ ggml_mul_mat(ctx,
5276
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
5277
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
5352
5278
 
5353
- return result;
5279
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
5354
5280
 
5281
+ return result;
5355
5282
  }
5356
5283
 
5357
5284
  // ggml_conv_2d_sk_p0
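
The hunks above replace the dedicated CONV_1D/CONV_2D ops and their stage_0/stage_1 splits with a single GGML_OP_IM2COL: ggml_conv_1d and ggml_conv_2d are now thin graph builders that call ggml_im2col to unroll every receptive field into a row (stored as F16), run an ordinary ggml_mul_mat against the kernel reshaped to [OC, IC*KH*KW], and reshape the product back to [N, OC, OH, OW] (or [N, OC, OL] in 1-D). A standalone sketch of the same idea on plain float arrays, with stride 1, no padding and no dilation to keep it short (the ggml version handles all three plus batching):

    #include <stdio.h>

    // im2col for one image: input [IC, IH, IW] -> columns [OH*OW, IC*KH*KW]
    // (stride 1, no padding, no dilation, single image, to keep the sketch short).
    static void im2col(const float *in, float *col,
                       int IC, int IH, int IW, int KH, int KW) {
        const int OH = IH - KH + 1, OW = IW - KW + 1;
        for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow)
        for (int ic = 0; ic < IC; ++ic)
        for (int kh = 0; kh < KH; ++kh)
        for (int kw = 0; kw < KW; ++kw)
            col[(oh*OW + ow)*(IC*KH*KW) + ic*KH*KW + kh*KW + kw] =
                in[ic*IH*IW + (oh + kh)*IW + (ow + kw)];
    }

    // convolution as GEMM: out[oc][p] = sum_k kernel[oc][k] * col[p][k]
    static void conv_gemm(const float *kernel, const float *col, float *out,
                          int OC, int P, int K) {
        for (int oc = 0; oc < OC; ++oc)
        for (int p = 0; p < P; ++p) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k) acc += kernel[oc*K + k] * col[p*K + k];
            out[oc*P + p] = acc;
        }
    }

    int main(void) {
        enum { IC = 1, IH = 3, IW = 3, KH = 2, KW = 2, OC = 1,
               OH = IH - KH + 1, OW = IW - KW + 1 };
        const float in [IC*IH*IW]    = { 1,2,3, 4,5,6, 7,8,9 };
        const float ker[OC*IC*KH*KW] = { 1,0, 0,1 };    // picks top-left + bottom-right taps
        float col[OH*OW*IC*KH*KW], out[OC*OH*OW];
        im2col(in, col, IC, IH, IW, KH, KW);
        conv_gemm(ker, col, out, OC, OH*OW, IC*KH*KW);
        for (int i = 0; i < OH*OW; ++i) printf("%g ", out[i]);  // prints: 6 8 12 14
        printf("\n");
        return 0;
    }
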
@@ -5411,7 +5338,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
5411
5338
 
5412
5339
  // ggml_pool_*
5413
5340
 
5414
- static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
5341
+ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
5415
5342
  return (ins + 2 * p - ks) / s + 1;
5416
5343
  }
5417
5344
 
@@ -5458,8 +5385,8 @@ struct ggml_tensor * ggml_pool_2d(
5458
5385
  int k1,
5459
5386
  int s0,
5460
5387
  int s1,
5461
- int p0,
5462
- int p1) {
5388
+ float p0,
5389
+ float p1) {
5463
5390
 
5464
5391
  bool is_node = false;
5465
5392
 
@@ -8921,6 +8848,48 @@ static void ggml_compute_forward_silu(
8921
8848
  }
8922
8849
  }
8923
8850
 
8851
+ // ggml_compute_forward_leaky
8852
+
8853
+ static void ggml_compute_forward_leaky_f32(
8854
+ const struct ggml_compute_params * params,
8855
+ const struct ggml_tensor * src0,
8856
+ struct ggml_tensor * dst) {
8857
+ assert(params->ith == 0);
8858
+ assert(ggml_are_same_shape(src0, dst));
8859
+
8860
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8861
+ return;
8862
+ }
8863
+
8864
+ const int n = ggml_nrows(src0);
8865
+ const int nc = src0->ne[0];
8866
+
8867
+ assert(dst->nb[0] == sizeof(float));
8868
+ assert(src0->nb[0] == sizeof(float));
8869
+
8870
+ for (int i = 0; i < n; i++) {
8871
+ ggml_vec_leaky_f32(nc,
8872
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
8873
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
8874
+ }
8875
+ }
8876
+
8877
+ static void ggml_compute_forward_leaky(
8878
+ const struct ggml_compute_params * params,
8879
+ const struct ggml_tensor * src0,
8880
+ struct ggml_tensor * dst) {
8881
+ switch (src0->type) {
8882
+ case GGML_TYPE_F32:
8883
+ {
8884
+ ggml_compute_forward_leaky_f32(params, src0, dst);
8885
+ } break;
8886
+ default:
8887
+ {
8888
+ GGML_ASSERT(false);
8889
+ } break;
8890
+ }
8891
+ }
8892
+
8924
8893
  // ggml_compute_forward_silu_back
8925
8894
 
8926
8895
  static void ggml_compute_forward_silu_back_f32(
@@ -9404,6 +9373,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9404
9373
  // TODO: find the optimal values for these
9405
9374
  if (ggml_is_contiguous(src0) &&
9406
9375
  ggml_is_contiguous(src1) &&
9376
+ src0->type == GGML_TYPE_F32 &&
9377
+ src1->type == GGML_TYPE_F32 &&
9407
9378
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
9408
9379
 
9409
9380
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9442,7 +9413,7 @@ static void ggml_compute_forward_mul_mat(
9442
9413
 
9443
9414
  // we don't support permuted src0 or src1
9444
9415
  GGML_ASSERT(nb00 == ggml_type_size(type));
9445
- GGML_ASSERT(nb10 == sizeof(float));
9416
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
9446
9417
 
9447
9418
  // dst cannot be transposed or permuted
9448
9419
  GGML_ASSERT(nb0 == sizeof(float));
@@ -9640,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
9640
9611
  const int ith = params->ith;
9641
9612
  const int nth = params->nth;
9642
9613
 
9614
+ GGML_ASSERT(ne0 == ne00);
9615
+ GGML_ASSERT(ne1 == ne10);
9616
+ GGML_ASSERT(ne2 == ne02);
9643
9617
  GGML_ASSERT(ne02 == ne12);
9644
- GGML_ASSERT(ne03 == ne13);
9645
- GGML_ASSERT(ne2 == ne12);
9646
9618
  GGML_ASSERT(ne3 == ne13);
9619
+ GGML_ASSERT(ne03 == ne13);
9647
9620
 
9648
9621
  // we don't support permuted src0 or src1
9649
9622
  GGML_ASSERT(nb00 == sizeof(float));
@@ -9654,18 +9627,25 @@ static void ggml_compute_forward_out_prod_f32(
9654
9627
  // GGML_ASSERT(nb1 <= nb2);
9655
9628
  // GGML_ASSERT(nb2 <= nb3);
9656
9629
 
9657
- GGML_ASSERT(ne0 == ne00);
9658
- GGML_ASSERT(ne1 == ne10);
9659
- GGML_ASSERT(ne2 == ne02);
9660
- GGML_ASSERT(ne3 == ne03);
9661
-
9662
9630
  // nb01 >= nb00 - src0 is not transposed
9663
9631
  // compute by src0 rows
9664
9632
 
9665
9633
  // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
9666
- // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
9634
+ // TODO: #if defined(GGML_USE_CLBLAST)
9635
+
9636
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9637
+ bool use_blas = ggml_is_matrix(src0) &&
9638
+ ggml_is_matrix(src1) &&
9639
+ ggml_is_contiguous(src0) &&
9640
+ (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
9641
+ #endif
9667
9642
 
9668
9643
  if (params->type == GGML_TASK_INIT) {
9644
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
9645
+ if (use_blas) {
9646
+ return;
9647
+ }
9648
+ #endif
9669
9649
  ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
9670
9650
  return;
9671
9651
  }
@@ -9674,6 +9654,50 @@ static void ggml_compute_forward_out_prod_f32(
9674
9654
  return;
9675
9655
  }
9676
9656
 
9657
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9658
+ if (use_blas) {
9659
+ if (params->ith != 0) { // All threads other than the first do no work.
9660
+ return;
9661
+ }
9662
+ // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
9663
+ // src0: (k,n)
9664
+ // src1: (k,m)
9665
+ // dst: (m,n)
9666
+ //
9667
+ // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
9668
+ // Also expressed as (major,minor)
9669
+ // a: (m,k): so src1 transposed
9670
+ // b: (k,n): so src0
9671
+ // c: (m,n)
9672
+ //
9673
+ // However, if ggml_is_transposed(src1) is true, then
9674
+ // src1->data already contains a transposed version, so sgemm mustn't
9675
+ // transpose it further.
9676
+
9677
+ int n = src0->ne[0];
9678
+ int k = src0->ne[1];
9679
+ int m = src1->ne[0];
9680
+
9681
+ int transposeA, lda;
9682
+
9683
+ if (!ggml_is_transposed(src1)) {
9684
+ transposeA = CblasTrans;
9685
+ lda = m;
9686
+ } else {
9687
+ transposeA = CblasNoTrans;
9688
+ lda = k;
9689
+ }
9690
+
9691
+ float * a = (float *) ((char *) src1->data);
9692
+ float * b = (float *) ((char *) src0->data);
9693
+ float * c = (float *) ((char *) dst->data);
9694
+
9695
+ cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
9696
+
9697
+ return;
9698
+ }
9699
+ #endif
9700
+
9677
9701
  // dst[:,:,:,:] = 0
9678
9702
  // for i2,i3:
9679
9703
  // for i1:
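
The hunk above adds a BLAS fast path to GGML_OP_OUT_PROD for the 2-D case. Following the (major, minor) convention of the comments, src0 is (k, n), src1 is (k, m) and dst is (m, n), i.e. the op computes

    dst[i][j] = sum_k src1[k][i] * src0[k][j]        (dst = src1^T * src0)

so a single cblas_sgemm covers it: src1 is passed as A with CblasTrans unless it is already stored transposed, src0 as B, and beta = 0 lets the GEMM zero dst itself, which is why GGML_TASK_INIT skips the ggml_vec_set_f32 clear when use_blas is set. The scalar row-by-row loop that follows remains the fallback for higher-rank tensors and non-BLAS builds.
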
@@ -11340,9 +11364,9 @@ static void ggml_compute_forward_rope_back(
11340
11364
  }
11341
11365
  }
11342
11366
 
11343
- // ggml_compute_forward_conv_1d
11367
+ // ggml_compute_forward_conv_transpose_1d
11344
11368
 
11345
- static void ggml_compute_forward_conv_1d_f16_f32(
11369
+ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11346
11370
  const struct ggml_compute_params * params,
11347
11371
  const struct ggml_tensor * src0,
11348
11372
  const struct ggml_tensor * src1,
@@ -11359,14 +11383,7 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11359
11383
  const int ith = params->ith;
11360
11384
  const int nth = params->nth;
11361
11385
 
11362
- const int nk = ne00;
11363
-
11364
- // size of the convolution row - the kernel size unrolled across all input channels
11365
- const int ew0 = nk*ne01;
11366
-
11367
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11368
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11369
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11386
+ const int nk = ne00*ne01*ne02;
11370
11387
 
11371
11388
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11372
11389
  GGML_ASSERT(nb10 == sizeof(float));
@@ -11374,23 +11391,37 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11374
11391
  if (params->type == GGML_TASK_INIT) {
11375
11392
  memset(params->wdata, 0, params->wsize);
11376
11393
 
11377
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11394
+ // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11395
+ {
11396
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11397
+
11398
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
11399
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11400
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11401
+ ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11402
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11403
+ dst_data[i00*ne02 + i02] = src[i00];
11404
+ }
11405
+ }
11406
+ }
11407
+ }
11378
11408
 
11379
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11380
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11409
+ // permute source data (src1) from (L x Cin) to (Cin x L)
11410
+ {
11411
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11381
11412
  ggml_fp16_t * dst_data = wdata;
11382
11413
 
11383
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11384
- for (int64_t ik = 0; ik < nk; ik++) {
11385
- const int idx0 = i0*s0 + ik*d0 - p0;
11386
-
11387
- if(!(idx0 < 0 || idx0 >= ne10)) {
11388
- dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
11389
- }
11414
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
11415
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
11416
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
11417
+ dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
11390
11418
  }
11391
11419
  }
11392
11420
  }
11393
11421
 
11422
+ // need to zero dst since we are accumulating into it
11423
+ memset(dst->data, 0, ggml_nbytes(dst));
11424
+
11394
11425
  return;
11395
11426
  }
11396
11427
 
@@ -11398,8 +11429,10 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11398
11429
  return;
11399
11430
  }
11400
11431
 
11432
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11433
+
11401
11434
  // total rows in dst
11402
- const int nr = ne2;
11435
+ const int nr = ne1;
11403
11436
 
11404
11437
  // rows per thread
11405
11438
  const int dr = (nr + nth - 1)/nth;
@@ -11408,22 +11441,26 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11408
11441
  const int ir0 = dr*ith;
11409
11442
  const int ir1 = MIN(ir0 + dr, nr);
11410
11443
 
11411
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11412
-
11413
- for (int i2 = 0; i2 < ne2; i2++) {
11414
- for (int i1 = ir0; i1 < ir1; i1++) {
11415
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11444
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11445
+ ggml_fp16_t * const wdata_src = wdata + nk;
11416
11446
 
11417
- for (int i0 = 0; i0 < ne0; i0++) {
11418
- ggml_vec_dot_f16(ew0, dst_data + i0,
11419
- (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
11420
- (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
11447
+ for (int i1 = ir0; i1 < ir1; i1++) {
11448
+ float * dst_data = (float *)((char *) dst->data + i1*nb1);
11449
+ ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
11450
+ for (int i10 = 0; i10 < ne10; i10++) {
11451
+ const int i1n = i10*ne11;
11452
+ for (int i00 = 0; i00 < ne00; i00++) {
11453
+ float v = 0;
11454
+ ggml_vec_dot_f16(ne02, &v,
11455
+ (ggml_fp16_t *) wdata_src + i1n,
11456
+ (ggml_fp16_t *) wdata_kernel + i00*ne02);
11457
+ dst_data[i10*s0 + i00] += v;
11421
11458
  }
11422
11459
  }
11423
11460
  }
11424
11461
  }
11425
11462
 
11426
- static void ggml_compute_forward_conv_1d_f32(
11463
+ static void ggml_compute_forward_conv_transpose_1d_f32(
11427
11464
  const struct ggml_compute_params * params,
11428
11465
  const struct ggml_tensor * src0,
11429
11466
  const struct ggml_tensor * src1,
@@ -11440,430 +11477,7 @@ static void ggml_compute_forward_conv_1d_f32(
11440
11477
  const int ith = params->ith;
11441
11478
  const int nth = params->nth;
11442
11479
 
11443
- const int nk = ne00;
11444
-
11445
- const int ew0 = nk*ne01;
11446
-
11447
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11448
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11449
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11450
-
11451
- GGML_ASSERT(nb00 == sizeof(float));
11452
- GGML_ASSERT(nb10 == sizeof(float));
11453
-
11454
- if (params->type == GGML_TASK_INIT) {
11455
- memset(params->wdata, 0, params->wsize);
11456
-
11457
- float * const wdata = (float *) params->wdata + 0;
11458
-
11459
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11460
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11461
- float * dst_data = wdata;
11462
-
11463
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11464
- for (int64_t ik = 0; ik < nk; ik++) {
11465
- const int idx0 = i0*s0 + ik*d0 - p0;
11466
-
11467
- if(!(idx0 < 0 || idx0 >= ne10)) {
11468
- dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
11469
- }
11470
- }
11471
- }
11472
- }
11473
-
11474
- return;
11475
- }
11476
-
11477
- if (params->type == GGML_TASK_FINALIZE) {
11478
- return;
11479
- }
11480
-
11481
- // total rows in dst
11482
- const int nr = ne02;
11483
-
11484
- // rows per thread
11485
- const int dr = (nr + nth - 1)/nth;
11486
-
11487
- // row range for this thread
11488
- const int ir0 = dr*ith;
11489
- const int ir1 = MIN(ir0 + dr, nr);
11490
-
11491
- float * const wdata = (float *) params->wdata + 0;
11492
-
11493
- for (int i2 = 0; i2 < ne2; i2++) {
11494
- for (int i1 = ir0; i1 < ir1; i1++) {
11495
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11496
-
11497
- for (int i0 = 0; i0 < ne0; i0++) {
11498
- ggml_vec_dot_f32(ew0, dst_data + i0,
11499
- (float *) ((char *) src0->data + i1*nb02),
11500
- (float *) wdata + i2*nb2 + i0*ew0);
11501
- }
11502
- }
11503
- }
11504
- }
11505
-
11506
- // TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
11507
- static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
11508
- ggml_fp16_t * A,
11509
- ggml_fp16_t * B,
11510
- float * C,
11511
- const int ith, const int nth) {
11512
- // does not seem to make a difference
11513
- int64_t m0, m1, n0, n1;
11514
- // patches per thread
11515
- if (m > n) {
11516
- n0 = 0;
11517
- n1 = n;
11518
-
11519
- // total patches in dst
11520
- const int np = m;
11521
-
11522
- // patches per thread
11523
- const int dp = (np + nth - 1)/nth;
11524
-
11525
- // patch range for this thread
11526
- m0 = dp*ith;
11527
- m1 = MIN(m0 + dp, np);
11528
- } else {
11529
- m0 = 0;
11530
- m1 = m;
11531
-
11532
- // total patches in dst
11533
- const int np = n;
11534
-
11535
- // patches per thread
11536
- const int dp = (np + nth - 1)/nth;
11537
-
11538
- // patch range for this thread
11539
- n0 = dp*ith;
11540
- n1 = MIN(n0 + dp, np);
11541
- }
11542
-
11543
- // block-tiling attempt
11544
- int64_t blck_n = 16;
11545
- int64_t blck_m = 16;
11546
-
11547
- // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
11548
- // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
11549
- // if (blck_size > 0) {
11550
- // blck_0 = 4;
11551
- // blck_1 = blck_size / blck_0;
11552
- // if (blck_1 < 0) {
11553
- // blck_1 = 1;
11554
- // }
11555
- // // blck_0 = (int64_t)sqrt(blck_size);
11556
- // // blck_1 = blck_0;
11557
- // }
11558
- // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
11559
-
11560
- for (int j = n0; j < n1; j+=blck_n) {
11561
- for (int i = m0; i < m1; i+=blck_m) {
11562
- // printf("i j k => %d %d %d\n", i, j, K);
11563
- for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
11564
- for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
11565
- ggml_vec_dot_f16(k,
11566
- C + ii*n + jj,
11567
- A + ii * k,
11568
- B + jj * k);
11569
- }
11570
- }
11571
- }
11572
- }
11573
- }
11574
-
11575
- // src0: kernel [OC, IC, K]
11576
- // src1: signal [N, IC, IL]
11577
- // dst: result [N, OL, IC*K]
11578
- static void ggml_compute_forward_conv_1d_stage_0_f32(
11579
- const struct ggml_compute_params * params,
11580
- const struct ggml_tensor * src0,
11581
- const struct ggml_tensor * src1,
11582
- struct ggml_tensor * dst) {
11583
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11584
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11585
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
11586
-
11587
- int64_t t0 = ggml_perf_time_us();
11588
- UNUSED(t0);
11589
-
11590
- GGML_TENSOR_BINARY_OP_LOCALS;
11591
-
11592
- const int64_t N = ne12;
11593
- const int64_t IC = ne11;
11594
- const int64_t IL = ne10;
11595
-
11596
- const int64_t K = ne00;
11597
-
11598
- const int64_t OL = ne1;
11599
-
11600
- const int ith = params->ith;
11601
- const int nth = params->nth;
11602
-
11603
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11604
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11605
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11606
-
11607
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11608
- GGML_ASSERT(nb10 == sizeof(float));
11609
-
11610
- if (params->type == GGML_TASK_INIT) {
11611
- memset(dst->data, 0, ggml_nbytes(dst));
11612
- return;
11613
- }
11614
-
11615
- if (params->type == GGML_TASK_FINALIZE) {
11616
- return;
11617
- }
11618
-
11619
- // im2col: [N, IC, IL] => [N, OL, IC*K]
11620
- {
11621
- ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
11622
-
11623
- for (int64_t in = 0; in < N; in++) {
11624
- for (int64_t iol = 0; iol < OL; iol++) {
11625
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11626
-
11627
- // micro kernel
11628
- ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
11629
- const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
11630
-
11631
- for (int64_t ik = 0; ik < K; ik++) {
11632
- const int64_t iil = iol*s0 + ik*d0 - p0;
11633
-
11634
- if (!(iil < 0 || iil >= IL)) {
11635
- dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
11636
- }
11637
- }
11638
- }
11639
- }
11640
- }
11641
- }
11642
- }
11643
-
11644
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11645
- // src0: [OC, IC, K]
11646
- // src1: [N, OL, IC * K]
11647
- // result: [N, OC, OL]
11648
- static void ggml_compute_forward_conv_1d_stage_1_f16(
11649
- const struct ggml_compute_params * params,
11650
- const struct ggml_tensor * src0,
11651
- const struct ggml_tensor * src1,
11652
- struct ggml_tensor * dst) {
11653
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11654
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
11655
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11656
-
11657
- int64_t t0 = ggml_perf_time_us();
11658
- UNUSED(t0);
11659
-
11660
- if (params->type == GGML_TASK_INIT) {
11661
- return;
11662
- }
11663
-
11664
- if (params->type == GGML_TASK_FINALIZE) {
11665
- return;
11666
- }
11667
-
11668
- GGML_TENSOR_BINARY_OP_LOCALS;
11669
-
11670
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11671
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
11672
- GGML_ASSERT(nb0 == sizeof(float));
11673
-
11674
- const int N = ne12;
11675
- const int OL = ne11;
11676
-
11677
- const int OC = ne02;
11678
- const int IC = ne01;
11679
- const int K = ne00;
11680
-
11681
- const int ith = params->ith;
11682
- const int nth = params->nth;
11683
-
11684
- int64_t m = OC;
11685
- int64_t n = OL;
11686
- int64_t k = IC * K;
11687
-
11688
- // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11689
- for (int i = 0; i < N; i++) {
11690
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
11691
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
11692
- float * C = (float *)dst->data + i * m * n; // [m, n]
11693
-
11694
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
11695
- }
11696
- }
11697
-
11698
- static void ggml_compute_forward_conv_1d(
11699
- const struct ggml_compute_params * params,
11700
- const struct ggml_tensor * src0,
11701
- const struct ggml_tensor * src1,
11702
- struct ggml_tensor * dst) {
11703
- switch(src0->type) {
11704
- case GGML_TYPE_F16:
11705
- {
11706
- ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
11707
- } break;
11708
- case GGML_TYPE_F32:
11709
- {
11710
- ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
11711
- } break;
11712
- default:
11713
- {
11714
- GGML_ASSERT(false);
11715
- } break;
11716
- }
11717
- }
11718
-
11719
- static void ggml_compute_forward_conv_1d_stage_0(
11720
- const struct ggml_compute_params * params,
11721
- const struct ggml_tensor * src0,
11722
- const struct ggml_tensor * src1,
11723
- struct ggml_tensor * dst) {
11724
- switch(src0->type) {
11725
- case GGML_TYPE_F16:
11726
- {
11727
- ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
11728
- } break;
11729
- default:
11730
- {
11731
- GGML_ASSERT(false);
11732
- } break;
11733
- }
11734
- }
11735
-
11736
- static void ggml_compute_forward_conv_1d_stage_1(
11737
- const struct ggml_compute_params * params,
11738
- const struct ggml_tensor * src0,
11739
- const struct ggml_tensor * src1,
11740
- struct ggml_tensor * dst) {
11741
- switch(src0->type) {
11742
- case GGML_TYPE_F16:
11743
- {
11744
- ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
11745
- } break;
11746
- default:
11747
- {
11748
- GGML_ASSERT(false);
11749
- } break;
11750
- }
11751
- }
11752
-
11753
- // ggml_compute_forward_conv_transpose_1d
11754
-
11755
- static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11756
- const struct ggml_compute_params * params,
11757
- const struct ggml_tensor * src0,
11758
- const struct ggml_tensor * src1,
11759
- struct ggml_tensor * dst) {
11760
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11761
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11762
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11763
-
11764
- int64_t t0 = ggml_perf_time_us();
11765
- UNUSED(t0);
11766
-
11767
- GGML_TENSOR_BINARY_OP_LOCALS
11768
-
11769
- const int ith = params->ith;
11770
- const int nth = params->nth;
11771
-
11772
- const int nk = ne00*ne01*ne02;
11773
-
11774
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11775
- GGML_ASSERT(nb10 == sizeof(float));
11776
-
11777
- if (params->type == GGML_TASK_INIT) {
11778
- memset(params->wdata, 0, params->wsize);
11779
-
11780
- // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11781
- {
11782
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11783
-
11784
- for (int64_t i02 = 0; i02 < ne02; i02++) {
11785
- for (int64_t i01 = 0; i01 < ne01; i01++) {
11786
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11787
- ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11788
- for (int64_t i00 = 0; i00 < ne00; i00++) {
11789
- dst_data[i00*ne02 + i02] = src[i00];
11790
- }
11791
- }
11792
- }
11793
- }
11794
-
11795
- // permute source data (src1) from (L x Cin) to (Cin x L)
11796
- {
11797
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11798
- ggml_fp16_t * dst_data = wdata;
11799
-
11800
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11801
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11802
- for (int64_t i10 = 0; i10 < ne10; i10++) {
11803
- dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
11804
- }
11805
- }
11806
- }
11807
-
11808
- // need to zero dst since we are accumulating into it
11809
- memset(dst->data, 0, ggml_nbytes(dst));
11810
-
11811
- return;
11812
- }
11813
-
11814
- if (params->type == GGML_TASK_FINALIZE) {
11815
- return;
11816
- }
11817
-
11818
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11819
-
11820
- // total rows in dst
11821
- const int nr = ne1;
11822
-
11823
- // rows per thread
11824
- const int dr = (nr + nth - 1)/nth;
11825
-
11826
- // row range for this thread
11827
- const int ir0 = dr*ith;
11828
- const int ir1 = MIN(ir0 + dr, nr);
11829
-
11830
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11831
- ggml_fp16_t * const wdata_src = wdata + nk;
11832
-
11833
- for (int i1 = ir0; i1 < ir1; i1++) {
11834
- float * dst_data = (float *)((char *) dst->data + i1*nb1);
11835
- ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
11836
- for (int i10 = 0; i10 < ne10; i10++) {
11837
- const int i1n = i10*ne11;
11838
- for (int i00 = 0; i00 < ne00; i00++) {
11839
- float v = 0;
11840
- ggml_vec_dot_f16(ne02, &v,
11841
- (ggml_fp16_t *) wdata_src + i1n,
11842
- (ggml_fp16_t *) wdata_kernel + i00*ne02);
11843
- dst_data[i10*s0 + i00] += v;
11844
- }
11845
- }
11846
- }
11847
- }
11848
-
11849
- static void ggml_compute_forward_conv_transpose_1d_f32(
11850
- const struct ggml_compute_params * params,
11851
- const struct ggml_tensor * src0,
11852
- const struct ggml_tensor * src1,
11853
- struct ggml_tensor * dst) {
11854
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
11855
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11856
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11857
-
11858
- int64_t t0 = ggml_perf_time_us();
11859
- UNUSED(t0);
11860
-
11861
- GGML_TENSOR_BINARY_OP_LOCALS
11862
-
11863
- const int ith = params->ith;
11864
- const int nth = params->nth;
11865
-
11866
- const int nk = ne00*ne01*ne02;
11480
+ const int nk = ne00*ne01*ne02;
11867
11481
 
11868
11482
  GGML_ASSERT(nb00 == sizeof(float));
11869
11483
  GGML_ASSERT(nb10 == sizeof(float));
@@ -11961,12 +11575,10 @@ static void ggml_compute_forward_conv_transpose_1d(
11961
11575
  }
11962
11576
  }
11963
11577
 
11964
- // ggml_compute_forward_conv_2d
11965
-
11966
11578
  // src0: kernel [OC, IC, KH, KW]
11967
11579
  // src1: image [N, IC, IH, IW]
11968
11580
  // dst: result [N, OH, OW, IC*KH*KW]
11969
- static void ggml_compute_forward_conv_2d_stage_0_f32(
11581
+ static void ggml_compute_forward_im2col_f16(
11970
11582
  const struct ggml_compute_params * params,
11971
11583
  const struct ggml_tensor * src0,
11972
11584
  const struct ggml_tensor * src1,
@@ -11980,34 +11592,35 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
11980
11592
 
11981
11593
  GGML_TENSOR_BINARY_OP_LOCALS;
11982
11594
 
11983
- const int64_t N = ne13;
11984
- const int64_t IC = ne12;
11985
- const int64_t IH = ne11;
11595
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
11596
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
11597
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
11598
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
11599
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
11600
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
11601
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
11602
+
11603
+ const int ith = params->ith;
11604
+ const int nth = params->nth;
11605
+
11606
+ const int64_t N = is_2D ? ne13 : ne12;
11607
+ const int64_t IC = is_2D ? ne12 : ne11;
11608
+ const int64_t IH = is_2D ? ne11 : 1;
11986
11609
  const int64_t IW = ne10;
11987
11610
 
11988
- // const int64_t OC = ne03;
11989
- // const int64_t IC = ne02;
11990
- const int64_t KH = ne01;
11611
+ const int64_t KH = is_2D ? ne01 : 1;
11991
11612
  const int64_t KW = ne00;
11992
11613
 
11993
- const int64_t OH = ne2;
11614
+ const int64_t OH = is_2D ? ne2 : 1;
11994
11615
  const int64_t OW = ne1;
11995
11616
 
11996
- const int ith = params->ith;
11997
- const int nth = params->nth;
11998
-
11999
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12000
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12001
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12002
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12003
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12004
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
11617
+ int ofs0 = is_2D ? nb13 : nb12;
11618
+ int ofs1 = is_2D ? nb12 : nb11;
12005
11619
 
12006
11620
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12007
11621
  GGML_ASSERT(nb10 == sizeof(float));
12008
11622
 
12009
11623
  if (params->type == GGML_TASK_INIT) {
12010
- memset(dst->data, 0, ggml_nbytes(dst));
12011
11624
  return;
12012
11625
  }
12013
11626
 
@@ -12020,20 +11633,22 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12020
11633
  ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
12021
11634
 
12022
11635
  for (int64_t in = 0; in < N; in++) {
12023
- for (int64_t ioh = 0; ioh < OH; ioh++) {
11636
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12024
11637
  for (int64_t iow = 0; iow < OW; iow++) {
12025
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11638
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12026
11639
 
12027
11640
  // micro kernel
12028
11641
  ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12029
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
11642
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12030
11643
 
12031
- for (int64_t ikh = 0; ikh < KH; ikh++) {
11644
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12032
11645
  for (int64_t ikw = 0; ikw < KW; ikw++) {
12033
11646
  const int64_t iiw = iow*s0 + ikw*d0 - p0;
12034
11647
  const int64_t iih = ioh*s1 + ikh*d1 - p1;
12035
11648
 
12036
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
11649
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
11650
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
11651
+ } else {
12037
11652
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12038
11653
  }
12039
11654
  }
@@ -12045,180 +11660,7 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12045
11660
  }
12046
11661
  }
12047
11662
 
12048
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12049
- // src0: [OC, IC, KH, KW]
12050
- // src1: [N, OH, OW, IC * KH * KW]
12051
- // result: [N, OC, OH, OW]
12052
- static void ggml_compute_forward_conv_2d_stage_1_f16(
12053
- const struct ggml_compute_params * params,
12054
- const struct ggml_tensor * src0,
12055
- const struct ggml_tensor * src1,
12056
- struct ggml_tensor * dst) {
12057
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12058
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
12059
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12060
-
12061
- int64_t t0 = ggml_perf_time_us();
12062
- UNUSED(t0);
12063
-
12064
- if (params->type == GGML_TASK_INIT) {
12065
- return;
12066
- }
12067
-
12068
- if (params->type == GGML_TASK_FINALIZE) {
12069
- return;
12070
- }
12071
-
12072
- GGML_TENSOR_BINARY_OP_LOCALS;
12073
-
12074
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12075
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
12076
- GGML_ASSERT(nb0 == sizeof(float));
12077
-
12078
- const int N = ne13;
12079
- const int OH = ne12;
12080
- const int OW = ne11;
12081
-
12082
- const int OC = ne03;
12083
- const int IC = ne02;
12084
- const int KH = ne01;
12085
- const int KW = ne00;
12086
-
12087
- const int ith = params->ith;
12088
- const int nth = params->nth;
12089
-
12090
- int64_t m = OC;
12091
- int64_t n = OH * OW;
12092
- int64_t k = IC * KH * KW;
12093
-
12094
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12095
- for (int i = 0; i < N; i++) {
12096
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12097
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
12098
- float * C = (float *)dst->data + i * m * n; // [m, n]
12099
-
12100
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12101
- }
12102
- }
12103
-
12104
- static void ggml_compute_forward_conv_2d_f16_f32(
12105
- const struct ggml_compute_params * params,
12106
- const struct ggml_tensor * src0,
12107
- const struct ggml_tensor * src1,
12108
- struct ggml_tensor * dst) {
12109
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12110
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12111
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12112
-
12113
- int64_t t0 = ggml_perf_time_us();
12114
- UNUSED(t0);
12115
-
12116
- GGML_TENSOR_BINARY_OP_LOCALS
12117
-
12118
- // src1: image [N, IC, IH, IW]
12119
- // src0: kernel [OC, IC, KH, KW]
12120
- // dst: result [N, OC, OH, OW]
12121
- // ne12: IC
12122
- // ne0: OW
12123
- // ne1: OH
12124
- // nk0: KW
12125
- // nk1: KH
12126
- // ne13: N
12127
-
12128
- const int N = ne13;
12129
- const int IC = ne12;
12130
- const int IH = ne11;
12131
- const int IW = ne10;
12132
-
12133
- const int OC = ne03;
12134
- // const int IC = ne02;
12135
- const int KH = ne01;
12136
- const int KW = ne00;
12137
-
12138
- const int OH = ne1;
12139
- const int OW = ne0;
12140
-
12141
- const int ith = params->ith;
12142
- const int nth = params->nth;
12143
-
12144
- // const int nk0 = ne00;
12145
- // const int nk1 = ne01;
12146
-
12147
- // size of the convolution row - the kernel size unrolled across all channels
12148
- // const int ew0 = nk0*nk1*ne02;
12149
- // ew0: IC*KH*KW
12150
-
12151
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12152
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12153
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12154
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12155
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12156
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
12157
-
12158
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12159
- GGML_ASSERT(nb10 == sizeof(float));
12160
-
12161
- if (params->type == GGML_TASK_INIT) {
12162
- memset(params->wdata, 0, params->wsize);
12163
-
12164
- // prepare source data (src1)
12165
- // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
12166
-
12167
- {
12168
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12169
-
12170
- for (int in = 0; in < N; in++) {
12171
- for (int iic = 0; iic < IC; iic++) {
12172
- for (int ioh = 0; ioh < OH; ioh++) {
12173
- for (int iow = 0; iow < OW; iow++) {
12174
-
12175
- // micro kernel
12176
- ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12177
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
12178
-
12179
- for (int ikh = 0; ikh < KH; ikh++) {
12180
- for (int ikw = 0; ikw < KW; ikw++) {
12181
- const int iiw = iow*s0 + ikw*d0 - p0;
12182
- const int iih = ioh*s1 + ikh*d1 - p1;
12183
-
12184
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
12185
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12186
- }
12187
- }
12188
- }
12189
- }
12190
- }
12191
- }
12192
- }
12193
- }
12194
-
12195
- return;
12196
- }
12197
-
12198
- if (params->type == GGML_TASK_FINALIZE) {
12199
- return;
12200
- }
12201
-
12202
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12203
- // wdata: [N*OH*OW, IC*KH*KW]
12204
- // dst: result [N, OC, OH, OW]
12205
- // src0: kernel [OC, IC, KH, KW]
12206
-
12207
- int64_t m = OC;
12208
- int64_t n = OH * OW;
12209
- int64_t k = IC * KH * KW;
12210
-
12211
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12212
- for (int i = 0; i < N; i++) {
12213
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12214
- ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
12215
- float * C = (float *)dst->data + i * m * n; // [m * k]
12216
-
12217
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12218
- }
12219
- }
12220
-
12221
- static void ggml_compute_forward_conv_2d(
11663
+ static void ggml_compute_forward_im2col(
12222
11664
  const struct ggml_compute_params * params,
12223
11665
  const struct ggml_tensor * src0,
12224
11666
  const struct ggml_tensor * src1,
@@ -12226,50 +11668,7 @@ static void ggml_compute_forward_conv_2d(
12226
11668
  switch (src0->type) {
12227
11669
  case GGML_TYPE_F16:
12228
11670
  {
12229
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
12230
- } break;
12231
- case GGML_TYPE_F32:
12232
- {
12233
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
12234
- GGML_ASSERT(false);
12235
- } break;
12236
- default:
12237
- {
12238
- GGML_ASSERT(false);
12239
- } break;
12240
- }
12241
- }
12242
-
12243
- static void ggml_compute_forward_conv_2d_stage_0(
12244
- const struct ggml_compute_params * params,
12245
- const struct ggml_tensor * src0,
12246
- const struct ggml_tensor * src1,
12247
- struct ggml_tensor * dst) {
12248
- switch (src0->type) {
12249
- case GGML_TYPE_F16:
12250
- {
12251
- ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
12252
- } break;
12253
- case GGML_TYPE_F32:
12254
- {
12255
- GGML_ASSERT(false);
12256
- } break;
12257
- default:
12258
- {
12259
- GGML_ASSERT(false);
12260
- } break;
12261
- }
12262
- }
12263
-
12264
- static void ggml_compute_forward_conv_2d_stage_1(
12265
- const struct ggml_compute_params * params,
12266
- const struct ggml_tensor * src0,
12267
- const struct ggml_tensor * src1,
12268
- struct ggml_tensor * dst) {
12269
- switch (src0->type) {
12270
- case GGML_TYPE_F16:
12271
- {
12272
- ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
11671
+ ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12273
11672
  } break;
12274
11673
  case GGML_TYPE_F32:
12275
11674
  {
@@ -12454,14 +11853,11 @@ static void ggml_compute_forward_pool_1d(
12454
11853
  ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
12455
11854
  }
12456
11855
 
12457
- // ggml_compute_forward_pool_2d_sk_p0
11856
+ // ggml_compute_forward_pool_2d
12458
11857
 
12459
- static void ggml_compute_forward_pool_2d_sk_p0(
11858
+ static void ggml_compute_forward_pool_2d(
12460
11859
  const struct ggml_compute_params * params,
12461
- const enum ggml_op_pool op,
12462
11860
  const struct ggml_tensor * src,
12463
- const int k0,
12464
- const int k1,
12465
11861
  struct ggml_tensor * dst) {
12466
11862
  assert(src->type == GGML_TYPE_F32);
12467
11863
  assert(params->ith == 0);
@@ -12470,6 +11866,14 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12470
11866
  return;
12471
11867
  }
12472
11868
 
11869
+ const int32_t * opts = (const int32_t *)dst->op_params;
11870
+ enum ggml_op_pool op = opts[0];
11871
+ const int k0 = opts[1];
11872
+ const int k1 = opts[2];
11873
+ const int s0 = opts[3];
11874
+ const int s1 = opts[4];
11875
+ const int p0 = opts[5];
11876
+ const int p1 = opts[6];
12473
11877
  const char * cdata = (const char*)src->data;
12474
11878
  const char * const data_end = cdata + ggml_nbytes(src);
12475
11879
 
@@ -12480,6 +11884,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12480
11884
  float * dplane = (float *)dst->data;
12481
11885
 
12482
11886
  const int ka = k0 * k1;
11887
+ const int offset0 = -p0;
11888
+ const int offset1 = -p1;
12483
11889
 
12484
11890
  while (cdata < data_end) {
12485
11891
  for (int oy = 0; oy < py; ++oy) {
@@ -12492,13 +11898,15 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12492
11898
  case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
12493
11899
  }
12494
11900
 
12495
- const int ix = ox * k0;
12496
- const int iy = oy * k1;
11901
+ const int ix = offset0 + ox * s0;
11902
+ const int iy = offset1 + oy * s1;
12497
11903
 
12498
11904
  for (int ky = 0; ky < k1; ++ky) {
11905
+ if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
12499
11906
  const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
12500
11907
  for (int kx = 0; kx < k0; ++kx) {
12501
11908
  int j = ix + kx;
11909
+ if (j < 0 || j >= src->ne[0]) continue;
12502
11910
  switch (op) {
12503
11911
  case GGML_OP_POOL_AVG: *out += srow[j]; break;
12504
11912
  case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
@@ -12519,29 +11927,6 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12519
11927
  }
12520
11928
  }
12521
11929
 
12522
- // ggml_compute_forward_pool_2d
12523
-
12524
- static void ggml_compute_forward_pool_2d(
12525
- const struct ggml_compute_params * params,
12526
- const struct ggml_tensor * src0,
12527
- struct ggml_tensor * dst) {
12528
-
12529
- const int32_t * opts = (const int32_t *)dst->op_params;
12530
- enum ggml_op_pool op = opts[0];
12531
- const int k0 = opts[1];
12532
- const int k1 = opts[2];
12533
- const int s0 = opts[3];
12534
- const int s1 = opts[4];
12535
- const int p0 = opts[5];
12536
- const int p1 = opts[6];
12537
- GGML_ASSERT(p0 == 0);
12538
- GGML_ASSERT(p1 == 0); // padding not supported
12539
- GGML_ASSERT(k0 == s0);
12540
- GGML_ASSERT(k1 == s1); // only s = k supported
12541
-
12542
- ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
12543
- }
12544
-
12545
11930
  // ggml_compute_forward_upscale
12546
11931
 
12547
11932
  static void ggml_compute_forward_upscale_f32(
@@ -13743,6 +13128,10 @@ static void ggml_compute_forward_unary(
13743
13128
  {
13744
13129
  ggml_compute_forward_silu(params, src0, dst);
13745
13130
  } break;
13131
+ case GGML_UNARY_OP_LEAKY:
13132
+ {
13133
+ ggml_compute_forward_leaky(params, src0, dst);
13134
+ } break;
13746
13135
  default:
13747
13136
  {
13748
13137
  GGML_ASSERT(false);
@@ -14496,33 +13885,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14496
13885
  {
14497
13886
  ggml_compute_forward_clamp(params, tensor->src[0], tensor);
14498
13887
  } break;
14499
- case GGML_OP_CONV_1D:
14500
- {
14501
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
14502
- } break;
14503
- case GGML_OP_CONV_1D_STAGE_0:
14504
- {
14505
- ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14506
- } break;
14507
- case GGML_OP_CONV_1D_STAGE_1:
14508
- {
14509
- ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
14510
- } break;
14511
13888
  case GGML_OP_CONV_TRANSPOSE_1D:
14512
13889
  {
14513
13890
  ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
14514
13891
  } break;
14515
- case GGML_OP_CONV_2D:
13892
+ case GGML_OP_IM2COL:
14516
13893
  {
14517
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14518
- } break;
14519
- case GGML_OP_CONV_2D_STAGE_0:
14520
- {
14521
- ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14522
- } break;
14523
- case GGML_OP_CONV_2D_STAGE_1:
14524
- {
14525
- ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
13894
+ ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
14526
13895
  } break;
14527
13896
  case GGML_OP_CONV_TRANSPOSE_2D:
14528
13897
  {
@@ -14651,62 +14020,109 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14651
14020
 
14652
14021
  ////////////////////////////////////////////////////////////////////////////////
14653
14022
 
14654
- static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
14023
+ static size_t ggml_hash_size(size_t min_sz) {
14024
+ // next primes after powers of two
14025
+ static const size_t primes[] = {
14026
+ 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
14027
+ 2053, 4099, 8209, 16411, 32771, 65537, 131101,
14028
+ 262147, 524309, 1048583, 2097169, 4194319, 8388617,
14029
+ 16777259, 33554467, 67108879, 134217757, 268435459,
14030
+ 536870923, 1073741827, 2147483659
14031
+ };
14032
+ static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
14033
+
14034
+ // find the smallest prime that is larger or equal to min_sz
14035
+ size_t l = 0;
14036
+ size_t r = n_primes;
14037
+ while (l < r) {
14038
+ size_t m = (l + r)/2;
14039
+ if (primes[m] < min_sz) {
14040
+ l = m + 1;
14041
+ } else {
14042
+ r = m;
14043
+ }
14044
+ }
14045
+ size_t sz = l < n_primes ? primes[l] : min_sz | 1;
14046
+ return sz;
14047
+ }
14655
14048
 
14656
- static size_t hash(void * p) {
14657
- return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
14049
+ static size_t ggml_hash(const void * p) {
14050
+ return (size_t)p;
14658
14051
  }
14659
14052
 
14660
- static size_t hash_find(void * hash_table[], void * p) {
14661
- size_t h = hash(p);
14053
+ size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14054
+ size_t h = ggml_hash(key) % hash_set.size;
14662
14055
 
14663
14056
  // linear probing
14664
14057
  size_t i = h;
14665
- while (hash_table[i] != NULL && hash_table[i] != p) {
14666
- i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
14058
+ while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
14059
+ i = (i + 1) % hash_set.size;
14667
14060
  if (i == h) {
14668
14061
  // visited all hash table entries -> not found
14669
- return GGML_GRAPH_HASHTABLE_SIZE;
14062
+ return GGML_HASHTABLE_FULL;
14670
14063
  }
14671
14064
  }
14672
14065
  return i;
14673
14066
  }
14674
14067
 
14675
- static bool hash_insert(void * hash_table[], void * p) {
14676
- size_t i = hash_find(hash_table, p);
14068
+ bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14069
+ size_t i = ggml_hash_find(hash_set, key);
14070
+ return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
14071
+ }
14072
+
14073
+ size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14074
+ size_t i = ggml_hash_find(hash_set, key);
14677
14075
 
14678
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14076
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14679
14077
 
14680
- if (hash_table[i] == p) {
14681
- return true;
14078
+ if (hash_set.keys[i] == key) {
14079
+ return GGML_HASHTABLE_ALREADY_EXISTS;
14682
14080
  }
14683
14081
 
14684
14082
  // insert
14685
- GGML_ASSERT(hash_table[i] == NULL);
14686
- hash_table[i] = p;
14687
- return false;
14083
+ GGML_ASSERT(hash_set.keys[i] == NULL);
14084
+ hash_set.keys[i] = key;
14085
+ return i;
14086
+ }
14087
+
14088
+ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14089
+ size_t i = ggml_hash_find(hash_set, key);
14090
+
14091
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14092
+
14093
+ hash_set.keys[i] = key;
14094
+ return i;
14095
+ }
14096
+
14097
+ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
14098
+ size = ggml_hash_size(size);
14099
+ struct ggml_hash_set result;
14100
+ result.size = size;
14101
+ result.keys = malloc(sizeof(struct ggml_tensor *) * size);
14102
+ memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
14103
+ return result;
14688
14104
  }
14689
14105
 
14690
- static bool hash_contains(void * hash_table[], void * p) {
14691
- size_t i = hash_find(hash_table, p);
14692
- return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
14106
+ static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
14107
+ free(hash_set.keys);
14693
14108
  }
14694
14109
 
14695
14110
  struct hash_map {
14696
- void * keys[GGML_GRAPH_HASHTABLE_SIZE];
14697
- void * vals[GGML_GRAPH_HASHTABLE_SIZE];
14111
+ struct ggml_hash_set set;
14112
+ struct ggml_tensor ** vals;
14698
14113
  };
14699
14114
 
14700
- static struct hash_map * new_hash_map(void) {
14115
+ static struct hash_map * ggml_new_hash_map(size_t size) {
14701
14116
  struct hash_map * result = malloc(sizeof(struct hash_map));
14702
- for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
14703
- result->keys[i] = NULL;
14704
- result->vals[i] = NULL;
14705
- }
14117
+ result->set = ggml_hash_set_new(size);
14118
+ result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
14119
+ memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
14706
14120
  return result;
14707
14121
  }
14708
14122
 
14709
- static void free_hash_map(struct hash_map * map) {
14123
+ static void ggml_hash_map_free(struct hash_map * map) {
14124
+ ggml_hash_set_free(map->set);
14125
+ free(map->vals);
14710
14126
  free(map);
14711
14127
  }
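The fixed GGML_GRAPH_HASHTABLE_SIZE table becomes a sized struct ggml_hash_set: capacity is rounded up to a prime by ggml_hash_size, ggml_hash_insert returns either the slot index or GGML_HASHTABLE_ALREADY_EXISTS, and an exhausted table reports GGML_HASHTABLE_FULL. A sketch of the intended usage pattern inside ggml.c (ggml_hash_set_new/ggml_hash_set_free are static, so this is internal-only; n_tensors and tensors are placeholder names):

    // sketch: visit each tensor exactly once with the new hash set
    struct ggml_hash_set visited = ggml_hash_set_new(2*n_tensors); // rounded up to a prime
    for (int i = 0; i < n_tensors; ++i) {
        if (ggml_hash_insert(visited, tensors[i]) == GGML_HASHTABLE_ALREADY_EXISTS) {
            continue; // seen before
        }
        // ... first visit of tensors[i] ...
    }
    GGML_ASSERT(ggml_hash_contains(visited, tensors[0]));
    ggml_hash_set_free(visited);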
14712
14128
 
@@ -14726,7 +14142,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14726
14142
  return node;
14727
14143
  }
14728
14144
 
14729
- if (!hash_contains(graph->visited_hash_table, node)) {
14145
+ if (!ggml_hash_contains(graph->visited_hash_table, node)) {
14730
14146
  return node;
14731
14147
  }
14732
14148
 
@@ -14741,17 +14157,17 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14741
14157
  return node;
14742
14158
  }
14743
14159
 
14744
- size_t i = hash_find(replacements->keys, node);
14745
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14746
- if (replacements->keys[i] == node) {
14747
- return (struct ggml_tensor *) replacements->vals[i];
14160
+ size_t i = ggml_hash_find(replacements->set, node);
14161
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
14162
+ if (replacements->set.keys[i] == node) {
14163
+ return replacements->vals[i];
14748
14164
  }
14749
14165
 
14750
14166
  struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14751
14167
 
14752
14168
  // insert clone into replacements
14753
- GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
14754
- replacements->keys[i] = node;
14169
+ GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
14170
+ replacements->set.keys[i] = node;
14755
14171
  replacements->vals[i] = clone;
14756
14172
 
14757
14173
  clone->op = node->op;
@@ -14788,26 +14204,26 @@ void ggml_build_backward_gradient_checkpointing(
14788
14204
  struct ggml_cgraph * gb_tmp,
14789
14205
  struct ggml_tensor * * checkpoints,
14790
14206
  int n_checkpoints) {
14791
- *gb_tmp = *gf;
14207
+ ggml_graph_cpy(gf, gb_tmp);
14792
14208
  ggml_build_backward_expand(ctx, gf, gb_tmp, true);
14793
14209
 
14794
14210
  if (n_checkpoints <= 0) {
14795
- *gb = *gb_tmp;
14211
+ ggml_graph_cpy(gb_tmp, gb);
14796
14212
  return;
14797
14213
  }
14798
14214
 
14799
- struct hash_map * replacements = new_hash_map();
14215
+ struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
14800
14216
 
14801
14217
  // insert checkpoints in replacements
14802
14218
  for (int i = 0; i < n_checkpoints; ++i) {
14803
- size_t k = hash_find(replacements->keys, checkpoints[i]);
14804
- GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14805
- GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
14806
- replacements->keys[k] = checkpoints[i];
14807
- replacements->vals[k] = checkpoints[i];
14219
+ size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
14220
+ GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
14221
+ GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
14222
+ replacements->set.keys[k] = checkpoints[i];
14223
+ replacements->vals[k] = checkpoints[i];
14808
14224
  }
14809
14225
 
14810
- *gb = *gf;
14226
+ ggml_graph_cpy(gf, gb);
14811
14227
  // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
14812
14228
  // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
14813
14229
  // by recomputing them from checkpoints
@@ -14824,21 +14240,21 @@ void ggml_build_backward_gradient_checkpointing(
14824
14240
  ggml_build_forward_expand(gb, node);
14825
14241
  }
14826
14242
 
14827
- free_hash_map(replacements);
14243
+ ggml_hash_map_free(replacements);
14828
14244
  }
14829
14245
 
14830
14246
  // functions to change gradients considering the case that input a might be initial gradient with zero value
14831
14247
 
14832
- static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14833
- if (hash_contains(zero_table, a)) {
14248
+ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14249
+ if (ggml_hash_contains(zero_table, a)) {
14834
14250
  return b;
14835
14251
  } else {
14836
14252
  return ggml_add_impl(ctx, a, b, false);
14837
14253
  }
14838
14254
  }
14839
14255
 
14840
- static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) {
14841
- if (hash_contains(zero_table, a)) {
14256
+ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14257
+ if (ggml_hash_contains(zero_table, a)) {
14842
14258
  struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
14843
14259
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
14844
14260
  } else {
@@ -14846,23 +14262,23 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
14846
14262
  }
14847
14263
  }
14848
14264
 
14849
- static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14850
- if (hash_contains(zero_table, a)) {
14265
+ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14266
+ if (ggml_hash_contains(zero_table, a)) {
14851
14267
  return ggml_repeat(ctx, b, a);
14852
14268
  } else {
14853
14269
  return ggml_add1_impl(ctx, a, b, false);
14854
14270
  }
14855
14271
  }
14856
14272
 
14857
- static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14858
- if (hash_contains(zero_table, a)) {
14273
+ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14274
+ if (ggml_hash_contains(zero_table, a)) {
14859
14275
  return ggml_neg(ctx, b);
14860
14276
  } else {
14861
14277
  return ggml_sub_impl(ctx, a, b, false);
14862
14278
  }
14863
14279
  }
14864
14280
 
14865
- static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) {
14281
+ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
14866
14282
  struct ggml_tensor * src0 = tensor->src[0];
14867
14283
  struct ggml_tensor * src1 = tensor->src[1];
14868
14284
 
@@ -15457,31 +14873,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15457
14873
  {
15458
14874
  GGML_ASSERT(false); // TODO: not implemented
15459
14875
  } break;
15460
- case GGML_OP_CONV_1D:
15461
- {
15462
- GGML_ASSERT(false); // TODO: not implemented
15463
- } break;
15464
- case GGML_OP_CONV_1D_STAGE_0:
15465
- {
15466
- GGML_ASSERT(false); // TODO: not implemented
15467
- } break;
15468
- case GGML_OP_CONV_1D_STAGE_1:
15469
- {
15470
- GGML_ASSERT(false); // TODO: not implemented
15471
- } break;
15472
14876
  case GGML_OP_CONV_TRANSPOSE_1D:
15473
14877
  {
15474
14878
  GGML_ASSERT(false); // TODO: not implemented
15475
14879
  } break;
15476
- case GGML_OP_CONV_2D:
15477
- {
15478
- GGML_ASSERT(false); // TODO: not implemented
15479
- } break;
15480
- case GGML_OP_CONV_2D_STAGE_0:
15481
- {
15482
- GGML_ASSERT(false); // TODO: not implemented
15483
- } break;
15484
- case GGML_OP_CONV_2D_STAGE_1:
14880
+ case GGML_OP_IM2COL:
15485
14881
  {
15486
14882
  GGML_ASSERT(false); // TODO: not implemented
15487
14883
  } break;
@@ -15695,7 +15091,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15695
15091
  }
15696
15092
 
15697
15093
  // check if already visited
15698
- if (hash_insert(cgraph->visited_hash_table, node)) {
15094
+ if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
15699
15095
  return;
15700
15096
  }
15701
15097
 
@@ -15711,7 +15107,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15711
15107
 
15712
15108
  if (node->op == GGML_OP_NONE && node->grad == NULL) {
15713
15109
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
15714
- GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15110
+ GGML_ASSERT(cgraph->n_leafs < cgraph->size);
15715
15111
 
15716
15112
  if (strlen(node->name) == 0) {
15717
15113
  ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
@@ -15720,22 +15116,24 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15720
15116
  cgraph->leafs[cgraph->n_leafs] = node;
15721
15117
  cgraph->n_leafs++;
15722
15118
  } else {
15723
- GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15119
+ GGML_ASSERT(cgraph->n_nodes < cgraph->size);
15724
15120
 
15725
15121
  if (strlen(node->name) == 0) {
15726
15122
  ggml_format_name(node, "node_%d", cgraph->n_nodes);
15727
15123
  }
15728
15124
 
15729
15125
  cgraph->nodes[cgraph->n_nodes] = node;
15730
- cgraph->grads[cgraph->n_nodes] = node->grad;
15126
+ if (cgraph->grads) {
15127
+ cgraph->grads[cgraph->n_nodes] = node->grad;
15128
+ }
15731
15129
  cgraph->n_nodes++;
15732
15130
  }
15733
15131
  }
15734
15132
 
15735
15133
  static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
15736
15134
  if (!expand) {
15737
- cgraph->n_nodes = 0;
15738
- cgraph->n_leafs = 0;
15135
+ // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
15136
+ ggml_graph_clear(cgraph);
15739
15137
  }
15740
15138
 
15741
15139
  const int n0 = cgraph->n_nodes;
@@ -15756,25 +15154,6 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
15756
15154
  ggml_build_forward_impl(cgraph, tensor, true);
15757
15155
  }
15758
15156
 
15759
- struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
15760
- struct ggml_cgraph result = {
15761
- /*.n_nodes =*/ 0,
15762
- /*.n_leafs =*/ 0,
15763
- /*.nodes =*/ { NULL },
15764
- /*.grads =*/ { NULL },
15765
- /*.leafs =*/ { NULL },
15766
- /*.hash_table =*/ { NULL },
15767
- /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15768
- /*.perf_runs =*/ 0,
15769
- /*.perf_cycles =*/ 0,
15770
- /*.perf_time_us =*/ 0,
15771
- };
15772
-
15773
- ggml_build_forward_impl(&result, tensor, false);
15774
-
15775
- return result;
15776
- }
15777
-
15778
15157
  void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
15779
15158
  GGML_ASSERT(gf->n_nodes > 0);
15780
15159
 
@@ -15791,11 +15170,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15791
15170
  }
15792
15171
 
15793
15172
  // remember original gradients which start with zero values
15794
- void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE);
15795
- memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE);
15173
+ struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
15796
15174
  for (int i = 0; i < gf->n_nodes; i++) {
15797
15175
  if (gf->grads[i]) {
15798
- hash_insert(zero_table, gf->grads[i]);
15176
+ ggml_hash_insert(zero_table, gf->grads[i]);
15799
15177
  }
15800
15178
  }
15801
15179
 
@@ -15818,26 +15196,54 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15818
15196
  }
15819
15197
  }
15820
15198
 
15821
- free(zero_table);
15199
+ ggml_hash_set_free(zero_table);
15822
15200
  }
15823
15201
 
15824
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
15825
- struct ggml_cgraph result = *gf;
15826
- ggml_build_backward_expand(ctx, gf, &result, keep);
15827
- return result;
15202
+ static size_t ggml_graph_nbytes(size_t size, bool grads) {
15203
+ size_t nbytes = sizeof(struct ggml_cgraph);
15204
+ nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
15205
+ if (grads) {
15206
+ nbytes += size * sizeof(struct ggml_tensor *); // grads
15207
+ }
15208
+ nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
15209
+ return nbytes;
15828
15210
  }
15829
15211
 
15830
- struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15831
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15212
+ size_t ggml_graph_overhead_custom(size_t size, bool grads) {
15213
+ return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
15214
+ }
15215
+
15216
+ size_t ggml_graph_overhead(void) {
15217
+ return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
15218
+ }
15219
+
15220
+ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
15221
+ const size_t obj_size = ggml_graph_nbytes(size, grads);
15222
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15832
15223
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15833
15224
 
15225
+ struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
15226
+
15227
+ size_t hash_size = ggml_hash_size(size * 2);
15228
+ struct ggml_tensor ** nodes_ptr = data_start;
15229
+ struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
15230
+ struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
15231
+ struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
15232
+
15233
+ // check that we allocated the correct amount of memory
15234
+ assert(obj_size == (size_t) (
15235
+ (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
15236
+
15237
+ memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
15238
+
15834
15239
  *cgraph = (struct ggml_cgraph) {
15240
+ /*.size =*/ size,
15835
15241
  /*.n_nodes =*/ 0,
15836
15242
  /*.n_leafs =*/ 0,
15837
- /*.nodes =*/ { NULL },
15838
- /*.grads =*/ { NULL },
15839
- /*.leafs =*/ { NULL },
15840
- /*.hash_table =*/ { NULL },
15243
+ /*.nodes =*/ nodes_ptr,
15244
+ /*.grads =*/ grads_ptr,
15245
+ /*.leafs =*/ leafs_ptr,
15246
+ /*.hash_table =*/ { hash_size, hash_keys_ptr },
15841
15247
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15842
15248
  /*.perf_runs =*/ 0,
15843
15249
  /*.perf_cycles =*/ 0,
@@ -15847,14 +15253,85 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15847
15253
  return cgraph;
15848
15254
  }
15849
15255
 
15850
- struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15851
- struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15852
- ggml_build_forward_impl(cgraph, tensor, false);
15256
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15257
+ return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15258
+ }
15259
+
15260
+ struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15261
+ const size_t obj_size = sizeof(struct ggml_cgraph);
15262
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15263
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15264
+
15265
+ *cgraph = (struct ggml_cgraph) {
15266
+ /*.size =*/ 0,
15267
+ /*.n_nodes =*/ i1 - i0,
15268
+ /*.n_leafs =*/ 0,
15269
+ /*.nodes =*/ cgraph0->nodes + i0,
15270
+ /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
15271
+ /*.leafs =*/ NULL,
15272
+ /*.hash_table =*/ { 0, NULL },
15273
+ /*.order =*/ cgraph0->order,
15274
+ /*.perf_runs =*/ 0,
15275
+ /*.perf_cycles =*/ 0,
15276
+ /*.perf_time_us =*/ 0,
15277
+ };
15278
+
15853
15279
  return cgraph;
15854
15280
  }
15855
15281
 
15856
- size_t ggml_graph_overhead(void) {
15857
- return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15282
+ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
15283
+ GGML_ASSERT(dst->size >= src->n_leafs);
15284
+ GGML_ASSERT(dst->size >= src->n_nodes);
15285
+ GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
15286
+
15287
+ dst->n_leafs = src->n_leafs;
15288
+ dst->n_nodes = src->n_nodes;
15289
+ dst->order = src->order;
15290
+
15291
+ for (int i = 0; i < src->n_leafs; ++i) {
15292
+ dst->leafs[i] = src->leafs[i];
15293
+ }
15294
+
15295
+ for (int i = 0; i < src->n_nodes; ++i) {
15296
+ dst->nodes[i] = src->nodes[i];
15297
+ }
15298
+
15299
+ if (src->grads) {
15300
+ GGML_ASSERT(dst->grads != NULL);
15301
+ for (int i = 0; i < src->n_nodes; ++i) {
15302
+ dst->grads[i] = src->grads[i];
15303
+ }
15304
+ }
15305
+
15306
+ for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
15307
+ if (src->visited_hash_table.keys[i]) {
15308
+ ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
15309
+ }
15310
+ }
15311
+ }
15312
+
15313
+ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
15314
+ struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
15315
+ ggml_graph_cpy(cgraph, result);
15316
+ return result;
15317
+ }
15318
+
15319
+ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
15320
+ GGML_ASSERT(cgraph->grads != NULL);
15321
+
15322
+ for (int i = 0; i < cgraph->n_nodes; i++) {
15323
+ struct ggml_tensor * grad = cgraph->grads[i];
15324
+
15325
+ if (grad) {
15326
+ ggml_set_zero(grad);
15327
+ }
15328
+ }
15329
+ }
15330
+
15331
+ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
15332
+ cgraph->n_leafs = 0;
15333
+ cgraph->n_nodes = 0;
15334
+ memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
15858
15335
  }
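Graph capacity is now chosen at allocation time: the node, leaf, grad and hash-key arrays live directly behind the ggml_cgraph header in a single GGML_OBJECT_GRAPH object, so the owning context must reserve ggml_graph_overhead_custom(size, grads) bytes for each graph it will hold. A sketch of allocating an oversized training graph (the node count and extra tensor budget are illustrative):

    // sketch: reserve room for a large graph with gradients, then build into it
    const size_t max_nodes = 8192;                       // illustrative
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_graph_overhead_custom(max_nodes, true) + 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, max_nodes, /*grads =*/ true);
    // ... ggml_build_forward_expand(gf, loss); ...
    ggml_graph_reset(gf);   // zero every gradient tensor
    ggml_graph_clear(gf);   // forget nodes/leafs and wipe the visited hash table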
15859
15336
 
15860
15337
  //
@@ -15966,45 +15443,266 @@ static void clear_numa_thread_affinity(void) {
15966
15443
  strerror(rv));
15967
15444
  }
15968
15445
 
15969
- CPU_FREE(cpus);
15970
- }
15971
- #else
15972
- // TODO: Windows etc.
15973
- // (the linux implementation may also work on BSD, someone should test)
15974
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15975
- static void clear_numa_thread_affinity(void) {}
15976
- #endif
15977
-
15978
- struct ggml_compute_state_shared {
15979
- const struct ggml_cgraph * cgraph;
15980
- const struct ggml_cplan * cplan;
15981
-
15982
- int64_t perf_node_start_cycles;
15983
- int64_t perf_node_start_time_us;
15984
-
15985
- const int n_threads;
15986
-
15987
- // synchronization primitives
15988
- atomic_int n_active; // num active threads
15989
- atomic_int node_n; // active graph node
15990
-
15991
- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15992
- void * abort_callback_data;
15993
- };
15994
-
15995
- struct ggml_compute_state {
15996
- ggml_thread_t thrd;
15997
- int ith;
15998
- struct ggml_compute_state_shared * shared;
15999
- };
16000
-
16001
- static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16002
- int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16003
- int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15446
+ CPU_FREE(cpus);
15447
+ }
15448
+ #else
15449
+ // TODO: Windows etc.
15450
+ // (the linux implementation may also work on BSD, someone should test)
15451
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15452
+ static void clear_numa_thread_affinity(void) {}
15453
+ #endif
15454
+
15455
+ struct ggml_compute_state_shared {
15456
+ const struct ggml_cgraph * cgraph;
15457
+ const struct ggml_cplan * cplan;
15458
+
15459
+ int64_t perf_node_start_cycles;
15460
+ int64_t perf_node_start_time_us;
15461
+
15462
+ const int n_threads;
15463
+
15464
+ // synchronization primitives
15465
+ atomic_int n_active; // num active threads
15466
+ atomic_int node_n; // active graph node
15467
+
15468
+ bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15469
+ void * abort_callback_data;
15470
+ };
15471
+
15472
+ struct ggml_compute_state {
15473
+ ggml_thread_t thrd;
15474
+ int ith;
15475
+ struct ggml_compute_state_shared * shared;
15476
+ };
15477
+
15478
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
15479
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
15480
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15481
+
15482
+ node->perf_runs++;
15483
+ node->perf_cycles += cycles_cur;
15484
+ node->perf_time_us += time_us_cur;
15485
+ }
15486
+
15487
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15488
+ int n_tasks = 0;
15489
+
15490
+ switch (node->op) {
15491
+ case GGML_OP_CPY:
15492
+ case GGML_OP_DUP:
15493
+ case GGML_OP_ADD:
15494
+ case GGML_OP_ADD1:
15495
+ case GGML_OP_ACC:
15496
+ {
15497
+ n_tasks = n_threads;
15498
+ } break;
15499
+ case GGML_OP_SUB:
15500
+ case GGML_OP_DIV:
15501
+ case GGML_OP_SQR:
15502
+ case GGML_OP_SQRT:
15503
+ case GGML_OP_LOG:
15504
+ case GGML_OP_SUM:
15505
+ case GGML_OP_SUM_ROWS:
15506
+ case GGML_OP_MEAN:
15507
+ case GGML_OP_ARGMAX:
15508
+ case GGML_OP_REPEAT:
15509
+ case GGML_OP_REPEAT_BACK:
15510
+ {
15511
+ n_tasks = 1;
15512
+ } break;
15513
+ case GGML_OP_UNARY:
15514
+ switch (ggml_get_unary_op(node)) {
15515
+ case GGML_UNARY_OP_ABS:
15516
+ case GGML_UNARY_OP_SGN:
15517
+ case GGML_UNARY_OP_NEG:
15518
+ case GGML_UNARY_OP_STEP:
15519
+ case GGML_UNARY_OP_TANH:
15520
+ case GGML_UNARY_OP_ELU:
15521
+ case GGML_UNARY_OP_RELU:
15522
+ case GGML_UNARY_OP_LEAKY:
15523
+ {
15524
+ n_tasks = 1;
15525
+ } break;
15526
+
15527
+ case GGML_UNARY_OP_GELU:
15528
+ case GGML_UNARY_OP_GELU_QUICK:
15529
+ case GGML_UNARY_OP_SILU:
15530
+ {
15531
+ n_tasks = n_threads;
15532
+ } break;
15533
+ }
15534
+ break;
15535
+ case GGML_OP_SILU_BACK:
15536
+ case GGML_OP_MUL:
15537
+ case GGML_OP_NORM:
15538
+ case GGML_OP_RMS_NORM:
15539
+ case GGML_OP_RMS_NORM_BACK:
15540
+ case GGML_OP_GROUP_NORM:
15541
+ case GGML_OP_CONCAT:
15542
+ {
15543
+ n_tasks = n_threads;
15544
+ } break;
15545
+ case GGML_OP_MUL_MAT:
15546
+ {
15547
+ n_tasks = n_threads;
15548
+
15549
+ // TODO: use different scheduling for different matrix sizes
15550
+ //const int nr0 = ggml_nrows(node->src[0]);
15551
+ //const int nr1 = ggml_nrows(node->src[1]);
15552
+
15553
+ //n_tasks = MIN(n_threads, MAX(1, nr0/128));
15554
+ //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
15555
+
15556
+ #if defined(GGML_USE_CUBLAS)
15557
+ if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
15558
+ n_tasks = 1; // TODO: this actually is doing nothing
15559
+ // the threads are still spinning
15560
+ }
15561
+ #elif defined(GGML_USE_CLBLAST)
15562
+ if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
15563
+ n_tasks = 1; // TODO: this actually is doing nothing
15564
+ // the threads are still spinning
15565
+ }
15566
+ #endif
15567
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
15568
+ if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
15569
+ n_tasks = 1; // TODO: this actually is doing nothing
15570
+ // the threads are still spinning
15571
+ }
15572
+ #endif
15573
+ } break;
15574
+ case GGML_OP_OUT_PROD:
15575
+ {
15576
+ n_tasks = n_threads;
15577
+ } break;
15578
+ case GGML_OP_SCALE:
15579
+ case GGML_OP_SET:
15580
+ case GGML_OP_CONT:
15581
+ case GGML_OP_RESHAPE:
15582
+ case GGML_OP_VIEW:
15583
+ case GGML_OP_PERMUTE:
15584
+ case GGML_OP_TRANSPOSE:
15585
+ case GGML_OP_GET_ROWS:
15586
+ case GGML_OP_GET_ROWS_BACK:
15587
+ case GGML_OP_DIAG:
15588
+ {
15589
+ n_tasks = 1;
15590
+ } break;
15591
+ case GGML_OP_DIAG_MASK_ZERO:
15592
+ case GGML_OP_DIAG_MASK_INF:
15593
+ case GGML_OP_SOFT_MAX:
15594
+ case GGML_OP_SOFT_MAX_BACK:
15595
+ case GGML_OP_ROPE:
15596
+ case GGML_OP_ROPE_BACK:
15597
+ case GGML_OP_ADD_REL_POS:
15598
+ {
15599
+ n_tasks = n_threads;
15600
+ } break;
15601
+ case GGML_OP_ALIBI:
15602
+ {
15603
+ n_tasks = 1; //TODO
15604
+ } break;
15605
+ case GGML_OP_CLAMP:
15606
+ {
15607
+ n_tasks = 1; //TODO
15608
+ } break;
15609
+ case GGML_OP_CONV_TRANSPOSE_1D:
15610
+ {
15611
+ n_tasks = n_threads;
15612
+ } break;
15613
+ case GGML_OP_IM2COL:
15614
+ {
15615
+ n_tasks = n_threads;
15616
+ } break;
15617
+ case GGML_OP_CONV_TRANSPOSE_2D:
15618
+ {
15619
+ n_tasks = n_threads;
15620
+ } break;
15621
+ case GGML_OP_POOL_1D:
15622
+ case GGML_OP_POOL_2D:
15623
+ {
15624
+ n_tasks = 1;
15625
+ } break;
15626
+ case GGML_OP_UPSCALE:
15627
+ {
15628
+ n_tasks = n_threads;
15629
+ } break;
15630
+ case GGML_OP_FLASH_ATTN:
15631
+ {
15632
+ n_tasks = n_threads;
15633
+ } break;
15634
+ case GGML_OP_FLASH_FF:
15635
+ {
15636
+ n_tasks = n_threads;
15637
+ } break;
15638
+ case GGML_OP_FLASH_ATTN_BACK:
15639
+ {
15640
+ n_tasks = n_threads;
15641
+ } break;
15642
+ case GGML_OP_WIN_PART:
15643
+ case GGML_OP_WIN_UNPART:
15644
+ case GGML_OP_GET_REL_POS:
15645
+ case GGML_OP_MAP_UNARY:
15646
+ case GGML_OP_MAP_BINARY:
15647
+ case GGML_OP_MAP_CUSTOM1_F32:
15648
+ case GGML_OP_MAP_CUSTOM2_F32:
15649
+ case GGML_OP_MAP_CUSTOM3_F32:
15650
+ {
15651
+ n_tasks = 1;
15652
+ } break;
15653
+ case GGML_OP_MAP_CUSTOM1:
15654
+ {
15655
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
15656
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15657
+ n_tasks = n_threads;
15658
+ } else {
15659
+ n_tasks = MIN(p->n_tasks, n_threads);
15660
+ }
15661
+ } break;
15662
+ case GGML_OP_MAP_CUSTOM2:
15663
+ {
15664
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
15665
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15666
+ n_tasks = n_threads;
15667
+ } else {
15668
+ n_tasks = MIN(p->n_tasks, n_threads);
15669
+ }
15670
+ } break;
15671
+ case GGML_OP_MAP_CUSTOM3:
15672
+ {
15673
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
15674
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15675
+ n_tasks = n_threads;
15676
+ } else {
15677
+ n_tasks = MIN(p->n_tasks, n_threads);
15678
+ }
15679
+ } break;
15680
+ case GGML_OP_CROSS_ENTROPY_LOSS:
15681
+ {
15682
+ n_tasks = n_threads;
15683
+ } break;
15684
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
15685
+ {
15686
+ n_tasks = n_threads;
15687
+ } break;
15688
+ case GGML_OP_NONE:
15689
+ {
15690
+ n_tasks = 1;
15691
+ } break;
15692
+ case GGML_OP_COUNT:
15693
+ {
15694
+ GGML_ASSERT(false);
15695
+ } break;
15696
+ default:
15697
+ {
15698
+ printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
15699
+ GGML_ASSERT(false);
15700
+ } break;
15701
+ }
15702
+
15703
+ assert(n_tasks > 0);
16004
15704
 
16005
- node->perf_runs++;
16006
- node->perf_cycles += cycles_cur;
16007
- node->perf_time_us += time_us_cur;
15705
+ return n_tasks;
16008
15706
  }
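With the per-node task counts moving out of ggml_cplan, ggml_get_n_tasks is consulted on the fly here and in the worker threads; the plan now only carries the scratch-buffer size, which the caller must still back with memory before computing. A sketch of the unchanged caller-side flow (thread count is illustrative, gf is an already-built graph):

    // sketch: plan, hand over a work buffer, then compute
    struct ggml_cplan cplan = ggml_graph_plan(gf, /*n_threads =*/ 4);
    if (cplan.work_size > 0) {
        cplan.work_data = malloc(cplan.work_size);   // caller-owned scratch memory
    }
    ggml_graph_compute(gf, &cplan);
    free(cplan.work_data);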
16009
15707
 
16010
15708
  static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -16013,7 +15711,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16013
15711
  const struct ggml_cgraph * cgraph = state->shared->cgraph;
16014
15712
  const struct ggml_cplan * cplan = state->shared->cplan;
16015
15713
 
16016
- const int * n_tasks_arr = cplan->n_tasks;
16017
15714
  const int n_threads = state->shared->n_threads;
16018
15715
 
16019
15716
  set_numa_thread_affinity(state->ith, n_threads);
@@ -16038,9 +15735,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16038
15735
 
16039
15736
  if (node_n != -1) {
16040
15737
  /* FINALIZE */
16041
- struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
15738
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16042
15739
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16043
- params.nth = n_tasks_arr[node_n];
15740
+ params.nth = ggml_get_n_tasks(node, n_threads);
16044
15741
  ggml_compute_forward(&params, node);
16045
15742
  }
16046
15743
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -16051,7 +15748,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16051
15748
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16052
15749
 
16053
15750
  struct ggml_tensor * node = cgraph->nodes[node_n];
16054
- const int n_tasks = n_tasks_arr[node_n];
15751
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16055
15752
 
16056
15753
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16057
15754
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -16109,7 +15806,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16109
15806
 
16110
15807
  /* COMPUTE */
16111
15808
  struct ggml_tensor * node = cgraph->nodes[node_n];
16112
- const int n_tasks = n_tasks_arr[node_n];
15809
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16113
15810
 
16114
15811
  struct ggml_compute_params params = {
16115
15812
  /*.type =*/ GGML_TASK_COMPUTE,
@@ -16143,121 +15840,46 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16143
15840
 
16144
15841
  struct ggml_tensor * node = cgraph->nodes[i];
16145
15842
 
15843
+ size_t cur = 0;
15844
+
16146
15845
  switch (node->op) {
16147
15846
  case GGML_OP_CPY:
16148
15847
  case GGML_OP_DUP:
16149
15848
  {
16150
15849
  n_tasks = n_threads;
16151
15850
 
16152
- size_t cur = 0;
16153
15851
  if (ggml_is_quantized(node->type)) {
16154
15852
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16155
15853
  }
16156
-
16157
- work_size = MAX(work_size, cur);
16158
15854
  } break;
16159
15855
  case GGML_OP_ADD:
16160
15856
  case GGML_OP_ADD1:
16161
15857
  {
16162
15858
  n_tasks = n_threads;
16163
15859
 
16164
- size_t cur = 0;
16165
-
16166
15860
  if (ggml_is_quantized(node->src[0]->type)) {
16167
15861
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16168
15862
  }
16169
-
16170
- work_size = MAX(work_size, cur);
16171
15863
  } break;
16172
15864
  case GGML_OP_ACC:
16173
15865
  {
16174
15866
  n_tasks = n_threads;
16175
15867
 
16176
- size_t cur = 0;
16177
-
16178
15868
  if (ggml_is_quantized(node->src[0]->type)) {
16179
15869
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16180
15870
  }
16181
-
16182
- work_size = MAX(work_size, cur);
16183
- } break;
16184
- case GGML_OP_SUB:
16185
- case GGML_OP_DIV:
16186
- case GGML_OP_SQR:
16187
- case GGML_OP_SQRT:
16188
- case GGML_OP_LOG:
16189
- case GGML_OP_SUM:
16190
- case GGML_OP_SUM_ROWS:
16191
- case GGML_OP_MEAN:
16192
- case GGML_OP_ARGMAX:
16193
- case GGML_OP_REPEAT:
16194
- case GGML_OP_REPEAT_BACK:
16195
- {
16196
- n_tasks = 1;
16197
- } break;
16198
-
16199
- case GGML_OP_UNARY:
16200
- {
16201
- switch (ggml_get_unary_op(node)) {
16202
- case GGML_UNARY_OP_ABS:
16203
- case GGML_UNARY_OP_SGN:
16204
- case GGML_UNARY_OP_NEG:
16205
- case GGML_UNARY_OP_STEP:
16206
- case GGML_UNARY_OP_TANH:
16207
- case GGML_UNARY_OP_ELU:
16208
- case GGML_UNARY_OP_RELU:
16209
- {
16210
- n_tasks = 1;
16211
- } break;
16212
-
16213
- case GGML_UNARY_OP_GELU:
16214
- case GGML_UNARY_OP_GELU_QUICK:
16215
- case GGML_UNARY_OP_SILU:
16216
- {
16217
- n_tasks = n_threads;
16218
- } break;
16219
- }
16220
- } break;
16221
- case GGML_OP_SILU_BACK:
16222
- case GGML_OP_MUL:
16223
- case GGML_OP_NORM:
16224
- case GGML_OP_RMS_NORM:
16225
- case GGML_OP_RMS_NORM_BACK:
16226
- case GGML_OP_GROUP_NORM:
16227
- {
16228
- n_tasks = n_threads;
16229
15871
  } break;
16230
- case GGML_OP_CONCAT:
16231
15872
  case GGML_OP_MUL_MAT:
16232
15873
  {
16233
- n_tasks = n_threads;
16234
-
16235
- // TODO: use different scheduling for different matrix sizes
16236
- //const int nr0 = ggml_nrows(node->src[0]);
16237
- //const int nr1 = ggml_nrows(node->src[1]);
16238
-
16239
- //n_tasks = MIN(n_threads, MAX(1, nr0/128));
16240
- //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
16241
-
16242
- size_t cur = 0;
16243
15874
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
16244
15875
 
16245
- #if defined(GGML_USE_CUBLAS)
16246
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
16247
- n_tasks = 1; // TODO: this actually is doing nothing
16248
- // the threads are still spinning
16249
- } else
16250
- #elif defined(GGML_USE_CLBLAST)
15876
+ #if defined(GGML_USE_CLBLAST)
16251
15877
  if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
16252
- n_tasks = 1; // TODO: this actually is doing nothing
16253
- // the threads are still spinning
16254
15878
  cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
16255
15879
  } else
16256
15880
  #endif
16257
15881
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16258
15882
  if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16259
- n_tasks = 1; // TODO: this actually is doing nothing
16260
- // the threads are still spinning
16261
15883
  if (node->src[0]->type != GGML_TYPE_F32) {
16262
15884
  // here we need memory just for single 2D matrix from src0
16263
15885
  cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -16266,108 +15888,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16266
15888
  #endif
16267
15889
  if (node->src[1]->type != vec_dot_type) {
16268
15890
  cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16269
- } else {
16270
- cur = 0;
16271
15891
  }
16272
-
16273
- work_size = MAX(work_size, cur);
16274
15892
  } break;
16275
15893
  case GGML_OP_OUT_PROD:
16276
15894
  {
16277
15895
  n_tasks = n_threads;
16278
15896
 
16279
- size_t cur = 0;
16280
-
16281
15897
  if (ggml_is_quantized(node->src[0]->type)) {
16282
15898
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16283
15899
  }
16284
-
16285
- work_size = MAX(work_size, cur);
16286
- } break;
16287
- case GGML_OP_SCALE:
16288
- {
16289
- n_tasks = 1;
16290
- } break;
16291
- case GGML_OP_SET:
16292
- case GGML_OP_CONT:
16293
- case GGML_OP_RESHAPE:
16294
- case GGML_OP_VIEW:
16295
- case GGML_OP_PERMUTE:
16296
- case GGML_OP_TRANSPOSE:
16297
- case GGML_OP_GET_ROWS:
16298
- case GGML_OP_GET_ROWS_BACK:
16299
- case GGML_OP_DIAG:
16300
- {
16301
- n_tasks = 1;
16302
- } break;
16303
- case GGML_OP_DIAG_MASK_ZERO:
16304
- case GGML_OP_DIAG_MASK_INF:
16305
- case GGML_OP_SOFT_MAX:
16306
- case GGML_OP_SOFT_MAX_BACK:
16307
- case GGML_OP_ROPE:
16308
- case GGML_OP_ROPE_BACK:
16309
- case GGML_OP_ADD_REL_POS:
16310
- {
16311
- n_tasks = n_threads;
16312
- } break;
16313
- case GGML_OP_ALIBI:
16314
- {
16315
- n_tasks = 1; //TODO
16316
- } break;
16317
- case GGML_OP_CLAMP:
16318
- {
16319
- n_tasks = 1; //TODO
16320
- } break;
16321
- case GGML_OP_CONV_1D:
16322
- {
16323
- n_tasks = n_threads;
16324
-
16325
- GGML_ASSERT(node->src[0]->ne[3] == 1);
16326
- GGML_ASSERT(node->src[1]->ne[2] == 1);
16327
- GGML_ASSERT(node->src[1]->ne[3] == 1);
16328
-
16329
- const int64_t ne00 = node->src[0]->ne[0];
16330
- const int64_t ne01 = node->src[0]->ne[1];
16331
- const int64_t ne02 = node->src[0]->ne[2];
16332
-
16333
- const int64_t ne10 = node->src[1]->ne[0];
16334
- const int64_t ne11 = node->src[1]->ne[1];
16335
-
16336
- const int64_t ne0 = node->ne[0];
16337
- const int64_t ne1 = node->ne[1];
16338
- const int64_t nk = ne00;
16339
- const int64_t ew0 = nk * ne01;
16340
-
16341
- UNUSED(ne02);
16342
- UNUSED(ne10);
16343
- UNUSED(ne11);
16344
-
16345
- size_t cur = 0;
16346
-
16347
- if (node->src[0]->type == GGML_TYPE_F16 &&
16348
- node->src[1]->type == GGML_TYPE_F32) {
16349
- cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
16350
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16351
- node->src[1]->type == GGML_TYPE_F32) {
16352
- cur = sizeof(float)*(ne0*ne1*ew0);
16353
- } else {
16354
- GGML_ASSERT(false);
16355
- }
16356
-
16357
- work_size = MAX(work_size, cur);
16358
- } break;
16359
- case GGML_OP_CONV_1D_STAGE_0:
16360
- {
16361
- n_tasks = n_threads;
16362
- } break;
16363
- case GGML_OP_CONV_1D_STAGE_1:
16364
- {
16365
- n_tasks = n_threads;
16366
15900
  } break;
16367
15901
  case GGML_OP_CONV_TRANSPOSE_1D:
16368
15902
  {
16369
- n_tasks = n_threads;
16370
-
16371
15903
  GGML_ASSERT(node->src[0]->ne[3] == 1);
16372
15904
  GGML_ASSERT(node->src[1]->ne[2] == 1);
16373
15905
  GGML_ASSERT(node->src[1]->ne[3] == 1);
@@ -16379,7 +15911,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16379
15911
  const int64_t ne10 = node->src[1]->ne[0]; // L
16380
15912
  const int64_t ne11 = node->src[1]->ne[1]; // Cin
16381
15913
 
16382
- size_t cur = 0;
16383
15914
  if (node->src[0]->type == GGML_TYPE_F16 &&
16384
15915
  node->src[1]->type == GGML_TYPE_F32) {
16385
15916
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
@@ -16391,59 +15922,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16391
15922
  } else {
16392
15923
  GGML_ASSERT(false);
16393
15924
  }
16394
-
16395
- work_size = MAX(work_size, cur);
16396
- } break;
16397
- case GGML_OP_CONV_2D:
16398
- {
16399
- n_tasks = n_threads;
16400
-
16401
- const int64_t ne00 = node->src[0]->ne[0]; // W
16402
- const int64_t ne01 = node->src[0]->ne[1]; // H
16403
- const int64_t ne02 = node->src[0]->ne[2]; // C
16404
- const int64_t ne03 = node->src[0]->ne[3]; // N
16405
-
16406
- const int64_t ne10 = node->src[1]->ne[0]; // W
16407
- const int64_t ne11 = node->src[1]->ne[1]; // H
16408
- const int64_t ne12 = node->src[1]->ne[2]; // C
16409
-
16410
- const int64_t ne0 = node->ne[0];
16411
- const int64_t ne1 = node->ne[1];
16412
- const int64_t ne2 = node->ne[2];
16413
- const int64_t ne3 = node->ne[3];
16414
- const int64_t nk = ne00*ne01;
16415
- const int64_t ew0 = nk * ne02;
16416
-
16417
- UNUSED(ne03);
16418
- UNUSED(ne2);
16419
-
16420
- size_t cur = 0;
16421
-
16422
- if (node->src[0]->type == GGML_TYPE_F16 &&
16423
- node->src[1]->type == GGML_TYPE_F32) {
16424
- // im2col: [N*OH*OW, IC*KH*KW]
16425
- cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
16426
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16427
- node->src[1]->type == GGML_TYPE_F32) {
16428
- cur = sizeof(float)* (ne10*ne11*ne12);
16429
- } else {
16430
- GGML_ASSERT(false);
16431
- }
16432
-
16433
- work_size = MAX(work_size, cur);
16434
- } break;
16435
- case GGML_OP_CONV_2D_STAGE_0:
16436
- {
16437
- n_tasks = n_threads;
16438
15925
  } break;
16439
- case GGML_OP_CONV_2D_STAGE_1:
15926
+ case GGML_OP_IM2COL:
16440
15927
  {
16441
15928
  n_tasks = n_threads;
16442
15929
  } break;
16443
15930
  case GGML_OP_CONV_TRANSPOSE_2D:
16444
15931
  {
16445
- n_tasks = n_threads;
16446
-
16447
15932
  const int64_t ne00 = node->src[0]->ne[0]; // W
16448
15933
  const int64_t ne01 = node->src[0]->ne[1]; // H
16449
15934
  const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
@@ -16453,141 +15938,66 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16453
15938
  const int64_t ne11 = node->src[1]->ne[1]; // H
16454
15939
  const int64_t ne12 = node->src[1]->ne[2]; // Channels In
16455
15940
 
16456
- size_t cur = 0;
16457
15941
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
16458
15942
  cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
16459
-
16460
- work_size = MAX(work_size, cur);
16461
- } break;
16462
- case GGML_OP_POOL_1D:
16463
- case GGML_OP_POOL_2D:
16464
- {
16465
- n_tasks = 1;
16466
- } break;
16467
- case GGML_OP_UPSCALE:
16468
- {
16469
- n_tasks = n_threads;
16470
15943
  } break;
16471
15944
  case GGML_OP_FLASH_ATTN:
16472
15945
  {
16473
15946
  n_tasks = n_threads;
16474
15947
 
16475
- size_t cur = 0;
16476
-
16477
15948
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16478
15949
 
16479
15950
  if (node->src[1]->type == GGML_TYPE_F32) {
16480
15951
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16481
15952
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16482
- }
16483
-
16484
- if (node->src[1]->type == GGML_TYPE_F16) {
15953
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16485
15954
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16486
15955
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16487
15956
  }
16488
-
16489
- work_size = MAX(work_size, cur);
16490
15957
  } break;
16491
15958
  case GGML_OP_FLASH_FF:
16492
15959
  {
16493
15960
  n_tasks = n_threads;
16494
15961
 
16495
- size_t cur = 0;
16496
-
16497
15962
  if (node->src[1]->type == GGML_TYPE_F32) {
16498
15963
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16499
15964
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16500
- }
16501
-
16502
- if (node->src[1]->type == GGML_TYPE_F16) {
15965
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16503
15966
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16504
15967
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16505
15968
  }
16506
-
16507
- work_size = MAX(work_size, cur);
16508
15969
  } break;
16509
15970
  case GGML_OP_FLASH_ATTN_BACK:
16510
15971
  {
16511
15972
  n_tasks = n_threads;
16512
15973
 
16513
- size_t cur = 0;
16514
-
16515
15974
  const int64_t D = node->src[0]->ne[0];
16516
15975
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16517
15976
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
16518
15977
  if (node->src[1]->type == GGML_TYPE_F32) {
16519
15978
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16520
15979
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16521
- }
16522
-
16523
- if (node->src[1]->type == GGML_TYPE_F16) {
15980
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16524
15981
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16525
15982
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16526
15983
  }
16527
-
16528
- work_size = MAX(work_size, cur);
16529
- } break;
16530
- case GGML_OP_WIN_PART:
16531
- case GGML_OP_WIN_UNPART:
16532
- case GGML_OP_GET_REL_POS:
16533
- case GGML_OP_MAP_UNARY:
16534
- case GGML_OP_MAP_BINARY:
16535
- case GGML_OP_MAP_CUSTOM1_F32:
16536
- case GGML_OP_MAP_CUSTOM2_F32:
16537
- case GGML_OP_MAP_CUSTOM3_F32:
16538
- {
16539
- n_tasks = 1;
16540
- } break;
16541
- case GGML_OP_MAP_CUSTOM1:
16542
- {
16543
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
16544
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16545
- n_tasks = n_threads;
16546
- } else {
16547
- n_tasks = MIN(p->n_tasks, n_threads);
16548
- }
16549
- } break;
16550
- case GGML_OP_MAP_CUSTOM2:
16551
- {
16552
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
16553
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16554
- n_tasks = n_threads;
16555
- } else {
16556
- n_tasks = MIN(p->n_tasks, n_threads);
16557
- }
16558
- } break;
16559
- case GGML_OP_MAP_CUSTOM3:
16560
- {
16561
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
16562
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16563
- n_tasks = n_threads;
16564
- } else {
16565
- n_tasks = MIN(p->n_tasks, n_threads);
16566
- }
16567
15984
  } break;
15985
+
16568
15986
  case GGML_OP_CROSS_ENTROPY_LOSS:
16569
15987
  {
16570
15988
  n_tasks = n_threads;
16571
15989
 
16572
- size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16573
-
16574
- work_size = MAX(work_size, cur);
16575
- } break;
16576
- case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16577
- {
16578
- n_tasks = n_threads;
16579
- } break;
16580
- case GGML_OP_NONE:
16581
- {
16582
- n_tasks = 1;
15990
+ cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16583
15991
  } break;
16584
15992
  case GGML_OP_COUNT:
16585
15993
  {
16586
15994
  GGML_ASSERT(false);
16587
15995
  } break;
15996
+ default:
15997
+ break;
16588
15998
  }
16589
15999
 
16590
- cplan.n_tasks[i] = n_tasks;
16000
+ work_size = MAX(work_size, cur);
16591
16001
  }
16592
16002
 
16593
16003
  if (work_size > 0) {
@@ -16609,12 +16019,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16609
16019
  if (cplan->work_size > 0) {
16610
16020
  GGML_ASSERT(cplan->work_data);
16611
16021
  }
16612
-
16613
- for (int i = 0; i < cgraph->n_nodes; ++i) {
16614
- if (cgraph->nodes[i]->op != GGML_OP_NONE) {
16615
- GGML_ASSERT(cplan->n_tasks[i] > 0);
16616
- }
16617
- }
16618
16022
  }
16619
16023
 
16620
16024
  const int n_threads = cplan->n_threads;
@@ -16687,16 +16091,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16687
16091
  return compute_status;
16688
16092
  }
16689
16093
 
16690
- void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16691
- for (int i = 0; i < cgraph->n_nodes; i++) {
16692
- struct ggml_tensor * grad = cgraph->grads[i];
16693
-
16694
- if (grad) {
16695
- ggml_set_zero(grad);
16696
- }
16697
- }
16698
- }
16699
-
16700
16094
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16701
16095
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16702
16096
 
@@ -16823,12 +16217,12 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16823
16217
  const uint32_t magic = GGML_FILE_MAGIC;
16824
16218
  const uint32_t version = GGML_FILE_VERSION;
16825
16219
  const uint32_t n_leafs = cgraph->n_leafs;
16826
- const uint32_t nodes = cgraph->n_nodes;
16220
+ const uint32_t n_nodes = cgraph->n_nodes;
16827
16221
 
16828
16222
  fwrite(&magic, sizeof(uint32_t), 1, fout);
16829
16223
  fwrite(&version, sizeof(uint32_t), 1, fout);
16830
16224
  fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
16831
- fwrite(&nodes, sizeof(uint32_t), 1, fout);
16225
+ fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
16832
16226
  fwrite(&size_eval, sizeof(uint64_t), 1, fout);
16833
16227
  }
16834
16228
 
@@ -16916,7 +16310,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16916
16310
  if (idx == -1) {
16917
16311
  for (int k = 0; k < cgraph->n_nodes; ++k) {
16918
16312
  if (args[j] == cgraph->nodes[k]) {
16919
- idx = GGML_MAX_NODES + k;
16313
+ idx = cgraph->n_leafs + k;
16920
16314
  break;
16921
16315
  }
16922
16316
  }
@@ -16943,11 +16337,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16943
16337
  }
16944
16338
  }
16945
16339
 
16946
- struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16340
+ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16947
16341
  assert(*ctx_data == NULL);
16948
16342
  assert(*ctx_eval == NULL);
16949
16343
 
16950
- struct ggml_cgraph result = { 0 };
16344
+ struct ggml_cgraph * result = NULL;
16951
16345
 
16952
16346
  struct ggml_tensor * data = NULL;
16953
16347
 
@@ -17019,13 +16413,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17019
16413
  const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
17020
16414
  const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
17021
16415
  const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
17022
-
17023
- result.n_leafs = n_leafs;
17024
- result.n_nodes = n_nodes;
16416
+ const int graph_size = MAX(n_leafs, n_nodes);
17025
16417
 
17026
16418
  // create the data context
17027
16419
  {
17028
- const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
16420
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
17029
16421
 
17030
16422
  struct ggml_init_params params = {
17031
16423
  .mem_size = size_eval + overhead,
@@ -17041,6 +16433,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17041
16433
  }
17042
16434
  }
17043
16435
 
16436
+ result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
16437
+
16438
+ result->n_leafs = n_leafs;
16439
+ result->n_nodes = n_nodes;
16440
+
16441
+
17044
16442
  // leafs
17045
16443
  {
17046
16444
  uint32_t type;
@@ -17079,7 +16477,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17079
16477
  tensor->nb[j] = nb[j];
17080
16478
  }
17081
16479
 
17082
- result.leafs[i] = tensor;
16480
+ result->leafs[i] = tensor;
17083
16481
 
17084
16482
  ptr += ggml_nbytes(tensor);
17085
16483
 
@@ -17131,10 +16529,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17131
16529
  continue;
17132
16530
  }
17133
16531
 
17134
- if (arg_idx < GGML_MAX_NODES) {
17135
- args[j] = result.leafs[arg_idx];
16532
+ if (arg_idx < result->n_leafs) {
16533
+ args[j] = result->leafs[arg_idx];
17136
16534
  } else {
17137
- args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
16535
+ args[j] = result->nodes[arg_idx - result->n_leafs];
17138
16536
  }
17139
16537
  }
17140
16538
 
@@ -17186,7 +16584,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17186
16584
  tensor->src[j] = args[j];
17187
16585
  }
17188
16586
 
17189
- result.nodes[i] = tensor;
16587
+ result->nodes[i] = tensor;
17190
16588
 
17191
16589
  fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17192
16590
  }
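ggml_graph_import now returns a pointer to a graph allocated inside *ctx_eval (via ggml_new_graph_custom) instead of a by-value ggml_cgraph, and node arguments are indexed relative to n_leafs rather than GGML_MAX_NODES. A sketch of the updated call site (the file name is a placeholder, and the eval context is assumed to have headroom for the work buffer):

    // sketch: load an exported graph with the pointer-returning API and run it
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    struct ggml_cgraph * gf = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);
    ggml_graph_compute_with_ctx(ctx_eval, gf, /*n_threads =*/ 4);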
@@ -18091,10 +17489,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18091
17489
  case GGML_OPT_ADAM:
18092
17490
  {
18093
17491
  result = (struct ggml_opt_params) {
18094
- .type = GGML_OPT_ADAM,
18095
- .n_threads = 1,
18096
- .past = 0,
18097
- .delta = 1e-5f,
17492
+ .type = GGML_OPT_ADAM,
17493
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17494
+ .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
17495
+ .past = 0,
17496
+ .delta = 1e-5f,
18098
17497
 
18099
17498
  .max_no_improvement = 100,
18100
17499
 
@@ -18121,10 +17520,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18121
17520
  case GGML_OPT_LBFGS:
18122
17521
  {
18123
17522
  result = (struct ggml_opt_params) {
18124
- .type = GGML_OPT_LBFGS,
18125
- .n_threads = 1,
18126
- .past = 0,
18127
- .delta = 1e-5f,
17523
+ .type = GGML_OPT_LBFGS,
17524
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17525
+ .n_threads = 1,
17526
+ .past = 0,
17527
+ .delta = 1e-5f,
18128
17528
 
18129
17529
  .max_no_improvement = 0,
18130
17530
 
@@ -18266,14 +17666,11 @@ enum ggml_opt_result ggml_opt_resume(
  struct ggml_tensor * f) {

  // build forward + backward compute graphs
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
-
- struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
- struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
+ ggml_build_forward_expand(gf, f);

- *gf = ggml_build_forward (f);
- *gb = ggml_build_backward(ctx, gf, true);
+ struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
+ ggml_build_backward_expand(ctx, gf, gb, true);

  return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
  }
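
The same replacement pattern in isolation, as a sketch: build the forward graph for the loss tensor, duplicate it, then extend the duplicate with the backward pass. The helper name and comments are illustrative; only the ggml calls come from this diff.

// sketch (not part of the diff): forward + backward graph construction with
// the graph API used above
#include "ggml.h"

static void example_build_graphs(struct ggml_context * ctx, struct ggml_tensor * f,
                                 size_t graph_size,
                                 struct ggml_cgraph ** out_gf, struct ggml_cgraph ** out_gb) {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_size, true);  // true: keep gradients
    ggml_build_forward_expand(gf, f);               // forward pass ending at f

    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
    ggml_build_backward_expand(ctx, gf, gb, true);  // append backward ops to the copy

    *out_gf = gf;
    *out_gb = gb;
}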
@@ -18729,7 +18126,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  {
  ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));

- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
  struct gguf_kv * kv = &ctx->kv[i];

  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18776,7 +18173,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  case GGUF_TYPE_STRING:
  {
  kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
  }
  } break;
@@ -18804,7 +18201,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  {
  ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));

- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  struct gguf_tensor_info * info = &ctx->infos[i];

  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18851,7 +18248,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  // compute the total size of the data section, taking into account the alignment
  {
  ctx->size = 0;
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  struct gguf_tensor_info * info = &ctx->infos[i];

  const int64_t ne =
@@ -18920,7 +18317,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  ggml_set_no_alloc(ctx_data, true);

  // create the tensors
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  const int64_t ne[GGML_MAX_DIMS] = {
  ctx->infos[i].ne[0],
  ctx->infos[i].ne[1],
@@ -19055,24 +18452,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
  }
  }

  const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  return ctx->kv[key_id].key.data;
  }

  enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  return ctx->kv[key_id].type;
  }

  enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  return ctx->kv[key_id].value.arr.type;
  }

  const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  return ctx->kv[key_id].value.arr.data;
  }

  const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  struct gguf_kv * kv = &ctx->kv[key_id];
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
@@ -19080,70 +18482,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
  }

  int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  return ctx->kv[key_id].value.arr.n;
  }

  uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
  return ctx->kv[key_id].value.uint8;
  }

  int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
  return ctx->kv[key_id].value.int8;
  }

  uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
  return ctx->kv[key_id].value.uint16;
  }

  int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
  return ctx->kv[key_id].value.int16;
  }

  uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
  return ctx->kv[key_id].value.uint32;
  }

  int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
  return ctx->kv[key_id].value.int32;
  }

  float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
  return ctx->kv[key_id].value.float32;
  }

  uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
  return ctx->kv[key_id].value.uint64;
  }

  int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
  return ctx->kv[key_id].value.int64;
  }

  double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
  return ctx->kv[key_id].value.float64;
  }

  bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
  return ctx->kv[key_id].value.bool_;
  }

  const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
  return ctx->kv[key_id].value.str.data;
  }

+ const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+ GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
+ GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
+ return &ctx->kv[key_id].value;
+ }
+
  int gguf_get_n_tensors(const struct gguf_context * ctx) {
  return ctx->header.n_tensors;
  }
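
Every gguf_get_* accessor above now asserts that key_id is in range, and a generic gguf_get_val_data() is added for scalar (non-array, non-string) values. A usage sketch; the key name is hypothetical, and the include assumes the gguf API is declared in ggml.h as in this release.

// sketch (not part of the diff): read a hypothetical uint32 key two ways
#include <stdint.h>
#include "ggml.h"  // gguf_* API

static uint32_t example_read_u32(const struct gguf_context * ctx) {
    const int key_id = gguf_find_key(ctx, "example.count");  // hypothetical key name
    if (key_id < 0 || gguf_get_kv_type(ctx, key_id) != GGUF_TYPE_UINT32) {
        return 0;  // missing key or unexpected type
    }
    // typed getter: asserts on the key_id range and on the stored type
    const uint32_t a = gguf_get_val_u32(ctx, key_id);
    // new untyped accessor: pointer into the kv value union, cast by the caller
    const uint32_t b = *(const uint32_t *) gguf_get_val_data(ctx, key_id);
    return (a == b) ? a : 0;
}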