llama_cpp 0.9.2 → 0.9.4

@@ -100,6 +100,49 @@ typedef void * thread_ret_t;
100
100
  #include <hbwmalloc.h>
101
101
  #endif
102
102
 
103
+ #if defined(__APPLE__)
104
+ #include <TargetConditionals.h>
105
+ #endif
106
+
107
+ #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
108
+ (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
109
+
110
+ #include <sys/wait.h>
111
+
112
+ void ggml_print_backtrace(void) {
113
+ /*
114
+ #include <execinfo.h>
115
+ #include <dlfcn.h>
116
+
117
+ void * trace[100];
118
+
119
+ int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
120
+
121
+ backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
122
+ */
123
+
124
+ // backtrack_symbols does not show line numbers, use gdb instead
125
+ char attach[32];
126
+ snprintf(attach, sizeof(attach), "attach %d", getpid());
127
+ int pid = fork();
128
+ if (pid == 0) {
129
+ execlp("gdb", "gdb", "--batch",
130
+ "-ex", "set style enabled on",
131
+ "-ex", attach,
132
+ "-ex", "bt -frame-info source-and-location",
133
+ "-ex", "detach",
134
+ "-ex", "quit",
135
+ NULL);
136
+ } else {
137
+ waitpid(pid, NULL, 0);
138
+ }
139
+ }
140
+ #else
141
+ void ggml_print_backtrace(void) {
142
+ // platform not supported
143
+ }
144
+ #endif
145
+
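The block above (new in the bundled ggml) adds ggml_print_backtrace(), which forks a child process running gdb, attaches it to the current PID and prints a source-annotated backtrace; on unsupported platforms it compiles to a no-op. A minimal sketch of wiring it into a fatal-signal handler — the handler and signal choices are illustrative, not part of this diff, and forking gdb from a signal handler is only reasonable in debug builds:

    #include <signal.h>

    void ggml_print_backtrace(void);   // provided by the ggml.c code above

    // Hypothetical debug helper: dump a backtrace before re-raising the signal.
    static void debug_crash_handler(int sig) {
        ggml_print_backtrace();        // forks gdb, prints "bt" with source info
        signal(sig, SIG_DFL);          // restore default handling
        raise(sig);                    // re-raise so the process still terminates
    }

    int main(void) {
        signal(SIGSEGV, debug_crash_handler);
        signal(SIGABRT, debug_crash_handler);
        // ... run inference ...
        return 0;
    }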
103
146
  /*#define GGML_PERF*/
104
147
  #define GGML_DEBUG 0
105
148
  #define GGML_GELU_FP16
@@ -228,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
228
271
  // floating point type used to accumulate sums
229
272
  typedef double ggml_float;
230
273
 
274
+ #undef MIN
275
+ #undef MAX
276
+
277
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
278
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
279
+
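A note on the MIN/MAX macros above: like any function-like macro they evaluate each argument twice, so arguments with side effects misbehave. A small illustration (not from the diff):

    int i = 0;
    int v[] = { 1, 9, 5 };
    int m = MIN(v[i++], 3);   // expands to ((v[i++]) < (3) ? (v[i++]) : (3));
                              // v[0] == 1 passes the test, so v[i++] is read again:
                              // m == 9 and i == 2, not m == 1 and i == 1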
231
280
  //
232
281
  // global data
233
282
  //
@@ -561,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
561
610
  // simd mappings
562
611
  //
563
612
 
613
+ #if defined(__ARM_NEON)
614
+ #if !defined(__aarch64__)
615
+
616
+ // 64-bit compatibility
617
+
618
+ inline static float vaddvq_f32(float32x4_t v) {
619
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
620
+ }
621
+
622
+ #endif
623
+ #endif
624
+
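The 32-bit ARM fallback above supplies vaddvq_f32 (a horizontal add of the four float32x4_t lanes), which AArch64 provides natively. A hedged sketch of how such a horizontal add typically finishes a NEON dot product — illustrative only, ggml's own kernels go through the GGML_F32_* macro layer defined below:

    #include <arm_neon.h>

    // Sum x[i]*y[i] for i in [0, n); requires NEON.
    static float dot_f32(const float * x, const float * y, int n) {
        float32x4_t acc = vdupq_n_f32(0.0f);
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            acc = vmlaq_f32(acc, vld1q_f32(x + i), vld1q_f32(y + i)); // acc += x*y lane-wise
        }
        float sum = vaddvq_f32(acc);          // horizontal add (native, or the fallback above)
        for (; i < n; ++i) sum += x[i]*y[i];  // scalar tail
        return sum;
    }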
564
625
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
565
626
  // we then implement the fundamental computation operations below using only these macros
566
627
  // adding support for new architectures requires to define the corresponding SIMD macros
@@ -1352,6 +1413,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1352
1413
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1353
1414
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1354
1415
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1416
+ inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
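ggml_vec_leaky_f32 above is a leaky ReLU with a hard-coded negative slope of 0.1 (exposed further down as ggml_leaky / GGML_UNARY_OP_LEAKY). Scalar reference for a single element:

    // f(x) = x        for x > 0
    //      = 0.1 * x  otherwise
    static float leaky_relu(float x) {
        return x > 0.0f ? x : 0.1f * x;
    }
    // leaky_relu( 2.0f) ==  2.0f,  leaky_relu(-2.0f) == -0.2f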
1355
1417
 
1356
1418
  static const float GELU_COEF_A = 0.044715f;
1357
1419
  static const float GELU_QUICK_COEF = -1.702f;
@@ -1572,13 +1634,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1572
1634
  "ROPE_BACK",
1573
1635
  "ALIBI",
1574
1636
  "CLAMP",
1575
- "CONV_1D",
1576
- "CONV_1D_STAGE_0",
1577
- "CONV_1D_STAGE_1",
1578
1637
  "CONV_TRANSPOSE_1D",
1579
- "CONV_2D",
1580
- "CONV_2D_STAGE_0",
1581
- "CONV_2D_STAGE_1",
1638
+ "IM2COL",
1582
1639
  "CONV_TRANSPOSE_2D",
1583
1640
  "POOL_1D",
1584
1641
  "POOL_2D",
@@ -1609,7 +1666,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1609
1666
  "CROSS_ENTROPY_LOSS_BACK",
1610
1667
  };
1611
1668
 
1612
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1669
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1613
1670
 
1614
1671
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1615
1672
  "none",
@@ -1659,13 +1716,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1659
1716
  "rope_back(x)",
1660
1717
  "alibi(x)",
1661
1718
  "clamp(x)",
1662
- "conv_1d(x)",
1663
- "conv_1d_stage_0(x)",
1664
- "conv_1d_stage_1(x)",
1665
1719
  "conv_transpose_1d(x)",
1666
- "conv_2d(x)",
1667
- "conv_2d_stage_0(x)",
1668
- "conv_2d_stage_1(x)",
1720
+ "im2col(x)",
1669
1721
  "conv_transpose_2d(x)",
1670
1722
  "pool_1d(x)",
1671
1723
  "pool_2d(x)",
@@ -1696,7 +1748,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1696
1748
  "cross_entropy_loss_back(x,y)",
1697
1749
  };
1698
1750
 
1699
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1751
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1700
1752
 
1701
1753
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1702
1754
 
@@ -1724,13 +1776,7 @@ static void ggml_setup_op_has_task_pass(void) {
1724
1776
  p[GGML_OP_GET_ROWS_BACK ] = true;
1725
1777
  p[GGML_OP_DIAG_MASK_INF ] = true;
1726
1778
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
1727
- p[GGML_OP_CONV_1D ] = true;
1728
- p[GGML_OP_CONV_1D_STAGE_0 ] = true;
1729
- p[GGML_OP_CONV_1D_STAGE_1 ] = true;
1730
1779
  p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
1731
- p[GGML_OP_CONV_2D ] = true;
1732
- p[GGML_OP_CONV_2D_STAGE_0 ] = true;
1733
- p[GGML_OP_CONV_2D_STAGE_1 ] = true;
1734
1780
  p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
1735
1781
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
1736
1782
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
@@ -3769,6 +3815,14 @@ struct ggml_tensor * ggml_relu_inplace(
3769
3815
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3770
3816
  }
3771
3817
 
3818
+ // ggml_leaky
3819
+
3820
+ struct ggml_tensor * ggml_leaky(
3821
+ struct ggml_context * ctx,
3822
+ struct ggml_tensor * a) {
3823
+ return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3824
+ }
3825
+
3772
3826
  // ggml_gelu
3773
3827
 
3774
3828
  struct ggml_tensor * ggml_gelu(
@@ -5076,82 +5130,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
5076
5130
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
5077
5131
  }
5078
5132
 
5079
- // im2col: [N, IC, IL] => [N, OL, IC*K]
5080
- // a: [OC,IC, K]
5081
- // b: [N, IC, IL]
5082
- // result: [N, OL, IC*K]
5083
- static struct ggml_tensor * ggml_conv_1d_stage_0(
5084
- struct ggml_context * ctx,
5085
- struct ggml_tensor * a,
5086
- struct ggml_tensor * b,
5087
- int s0,
5088
- int p0,
5089
- int d0) {
5090
- GGML_ASSERT(a->ne[1] == b->ne[1]);
5091
- bool is_node = false;
5092
-
5093
- if (a->grad || b->grad) {
5094
- GGML_ASSERT(false); // TODO: implement backward
5095
- is_node = true;
5096
- }
5097
-
5098
- const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5099
-
5100
- const int64_t ne[4] = {
5101
- a->ne[1] * a->ne[0],
5102
- OL,
5103
- b->ne[2],
5104
- 1,
5105
- };
5106
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5107
-
5108
- int32_t params[] = { s0, p0, d0 };
5109
- ggml_set_op_params(result, params, sizeof(params));
5110
-
5111
- result->op = GGML_OP_CONV_1D_STAGE_0;
5112
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5113
- result->src[0] = a;
5114
- result->src[1] = b;
5115
-
5116
- return result;
5117
- }
5118
-
5119
- // ggml_conv_1d_stage_1
5120
-
5121
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
5122
- // a: [OC, IC, K]
5123
- // b: [N, OL, IC * K]
5124
- // result: [N, OC, OL]
5125
- static struct ggml_tensor * ggml_conv_1d_stage_1(
5126
- struct ggml_context * ctx,
5127
- struct ggml_tensor * a,
5128
- struct ggml_tensor * b) {
5129
-
5130
- bool is_node = false;
5131
-
5132
- if (a->grad || b->grad) {
5133
- GGML_ASSERT(false); // TODO: implement backward
5134
- is_node = true;
5135
- }
5136
-
5137
- const int64_t ne[4] = {
5138
- b->ne[1],
5139
- a->ne[2],
5140
- b->ne[2],
5141
- 1,
5142
- };
5143
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5144
-
5145
- result->op = GGML_OP_CONV_1D_STAGE_1;
5146
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5147
- result->src[0] = a;
5148
- result->src[1] = b;
5149
-
5150
- return result;
5151
- }
5152
-
5153
- // ggml_conv_1d
5154
-
5155
5133
  GGML_API struct ggml_tensor * ggml_conv_1d(
5156
5134
  struct ggml_context * ctx,
5157
5135
  struct ggml_tensor * a,
@@ -5159,43 +5137,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5159
5137
  int s0,
5160
5138
  int p0,
5161
5139
  int d0) {
5162
- struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
5163
- result = ggml_conv_1d_stage_1(ctx, a, result);
5164
- return result;
5165
- }
5166
-
5167
- // GGML_API struct ggml_tensor * ggml_conv_1d(
5168
- // struct ggml_context * ctx,
5169
- // struct ggml_tensor * a,
5170
- // struct ggml_tensor * b,
5171
- // int s0,
5172
- // int p0,
5173
- // int d0) {
5174
- // GGML_ASSERT(ggml_is_matrix(b));
5175
- // GGML_ASSERT(a->ne[1] == b->ne[1]);
5176
- // bool is_node = false;
5177
-
5178
- // if (a->grad || b->grad) {
5179
- // GGML_ASSERT(false); // TODO: implement backward
5180
- // is_node = true;
5181
- // }
5140
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5182
5141
 
5183
- // const int64_t ne[4] = {
5184
- // ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
5185
- // a->ne[2], 1, 1,
5186
- // };
5187
- // struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5142
+ struct ggml_tensor * result =
5143
+ ggml_mul_mat(ctx,
5144
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
5145
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
5188
5146
 
5189
- // int32_t params[] = { s0, p0, d0 };
5190
- // ggml_set_op_params(result, params, sizeof(params));
5147
+ result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
5191
5148
 
5192
- // result->op = GGML_OP_CONV_1D;
5193
- // result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5194
- // result->src[0] = a;
5195
- // result->src[1] = b;
5196
-
5197
- // return result;
5198
- // }
5149
+ return result;
5150
+ }
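ggml_conv_1d is now expressed as im2col followed by a single ggml_mul_mat and a reshape, replacing the old CONV_1D_STAGE_0/STAGE_1 operators. A worked shape example using the output-size formula above (numbers are illustrative):

    // a (kernel): [OC=8, IC=4, K=3]
    // b (input) : [N=1,  IC=4, IL=10],  s0=1, p0=1, d0=1
    //
    // OL = (IL + 2*p0 - d0*(K-1) - 1)/s0 + 1 = (10 + 2 - 2 - 1)/1 + 1 = 10
    //
    // im2col : [N=1, OL=10, IC*K=12]
    // gemm   : [N, OC, OL] = [OC, IC*K] x [N*OL, IC*K]
    // reshape: final result [N=1, OC=8, OL=10]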
5199
5151
 
5200
5152
  // ggml_conv_1d_ph
5201
5153
 
@@ -5258,7 +5210,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
5258
5210
  // a: [OC,IC, KH, KW]
5259
5211
  // b: [N, IC, IH, IW]
5260
5212
  // result: [N, OH, OW, IC*KH*KW]
5261
- static struct ggml_tensor * ggml_conv_2d_stage_0(
5213
+ struct ggml_tensor * ggml_im2col(
5262
5214
  struct ggml_context * ctx,
5263
5215
  struct ggml_tensor * a,
5264
5216
  struct ggml_tensor * b,
@@ -5267,9 +5219,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5267
5219
  int p0,
5268
5220
  int p1,
5269
5221
  int d0,
5270
- int d1) {
5222
+ int d1,
5223
+ bool is_2D) {
5271
5224
 
5272
- GGML_ASSERT(a->ne[2] == b->ne[2]);
5225
+ if(is_2D) {
5226
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
5227
+ } else {
5228
+ GGML_ASSERT(a->ne[1] == b->ne[1]);
5229
+ }
5273
5230
  bool is_node = false;
5274
5231
 
5275
5232
  if (a->grad || b->grad) {
@@ -5277,81 +5234,51 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5277
5234
  is_node = true;
5278
5235
  }
5279
5236
 
5280
- const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
5281
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5237
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
5238
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5282
5239
 
5283
5240
  const int64_t ne[4] = {
5284
- a->ne[2] * a->ne[1] * a->ne[0],
5241
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
5285
5242
  OW,
5286
- OH,
5287
- b->ne[3],
5243
+ is_2D ? OH : b->ne[2],
5244
+ is_2D ? b->ne[3] : 1,
5288
5245
  };
5289
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5290
5246
 
5291
- int32_t params[] = { s0, s1, p0, p1, d0, d1 };
5247
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5248
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5292
5249
  ggml_set_op_params(result, params, sizeof(params));
5293
5250
 
5294
- result->op = GGML_OP_CONV_2D_STAGE_0;
5295
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5296
- result->src[0] = a;
5297
- result->src[1] = b;
5298
-
5299
- return result;
5300
-
5301
- }
5302
-
5303
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
5304
- // a: [OC, IC, KH, KW]
5305
- // b: [N, OH, OW, IC * KH * KW]
5306
- // result: [N, OC, OH, OW]
5307
- static struct ggml_tensor * ggml_conv_2d_stage_1(
5308
- struct ggml_context * ctx,
5309
- struct ggml_tensor * a,
5310
- struct ggml_tensor * b) {
5311
-
5312
- bool is_node = false;
5313
-
5314
- if (a->grad || b->grad) {
5315
- GGML_ASSERT(false); // TODO: implement backward
5316
- is_node = true;
5317
- }
5318
-
5319
- const int64_t ne[4] = {
5320
- b->ne[1],
5321
- b->ne[2],
5322
- a->ne[3],
5323
- b->ne[3],
5324
- };
5325
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5326
-
5327
- result->op = GGML_OP_CONV_2D_STAGE_1;
5251
+ result->op = GGML_OP_IM2COL;
5328
5252
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5329
5253
  result->src[0] = a;
5330
5254
  result->src[1] = b;
5331
5255
 
5332
5256
  return result;
5333
-
5334
5257
  }
5335
5258
 
5336
5259
  // a: [OC,IC, KH, KW]
5337
5260
  // b: [N, IC, IH, IW]
5338
5261
  // result: [N, OC, OH, OW]
5339
5262
  struct ggml_tensor * ggml_conv_2d(
5340
- struct ggml_context * ctx,
5341
- struct ggml_tensor * a,
5342
- struct ggml_tensor * b,
5343
- int s0,
5344
- int s1,
5345
- int p0,
5346
- int p1,
5347
- int d0,
5348
- int d1) {
5263
+ struct ggml_context * ctx,
5264
+ struct ggml_tensor * a,
5265
+ struct ggml_tensor * b,
5266
+ int s0,
5267
+ int s1,
5268
+ int p0,
5269
+ int p1,
5270
+ int d0,
5271
+ int d1) {
5272
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5349
5273
 
5350
- struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
5351
- result = ggml_conv_2d_stage_1(ctx, a, result);
5274
+ struct ggml_tensor * result =
5275
+ ggml_mul_mat(ctx,
5276
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
5277
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
5352
5278
 
5353
- return result;
5279
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
5354
5280
 
5281
+ return result;
5355
5282
  }
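ggml_conv_2d follows the same pattern with is_2D = true: one im2col producing [N, OH, OW, IC*KH*KW], one ggml_mul_mat against the kernel viewed as [OC, IC*KH*KW], and a reshape to [N, OC, OH, OW]. A worked example of the output size for a "same" 3x3 convolution (illustrative numbers):

    // a (kernel): [OC=16, IC=3, KH=3, KW=3]
    // b (image) : [N=1,   IC=3, IH=224, IW=224],  s=1, p=1, d=1
    //
    // OH = (IH + 2*p1 - d1*(KH-1) - 1)/s1 + 1 = (224 + 2 - 2 - 1)/1 + 1 = 224
    // OW = 224 likewise
    //
    // im2col : [N=1, OH=224, OW=224, IC*KH*KW=27]
    // gemm   : [N, OC, OH, OW] = [OC, 27] x [N*OH*OW, 27]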
5356
5283
 
5357
5284
  // ggml_conv_2d_sk_p0
@@ -5411,7 +5338,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
5411
5338
 
5412
5339
  // ggml_pool_*
5413
5340
 
5414
- static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
5341
+ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
5415
5342
  return (ins + 2 * p - ks) / s + 1;
5416
5343
  }
5417
5344
 
@@ -5458,8 +5385,8 @@ struct ggml_tensor * ggml_pool_2d(
5458
5385
  int k1,
5459
5386
  int s0,
5460
5387
  int s1,
5461
- int p0,
5462
- int p1) {
5388
+ float p0,
5389
+ float p1) {
5463
5390
 
5464
5391
  bool is_node = false;
5465
5392
 
@@ -8921,6 +8848,48 @@ static void ggml_compute_forward_silu(
8921
8848
  }
8922
8849
  }
8923
8850
 
8851
+ // ggml_compute_forward_leaky
8852
+
8853
+ static void ggml_compute_forward_leaky_f32(
8854
+ const struct ggml_compute_params * params,
8855
+ const struct ggml_tensor * src0,
8856
+ struct ggml_tensor * dst) {
8857
+ assert(params->ith == 0);
8858
+ assert(ggml_are_same_shape(src0, dst));
8859
+
8860
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8861
+ return;
8862
+ }
8863
+
8864
+ const int n = ggml_nrows(src0);
8865
+ const int nc = src0->ne[0];
8866
+
8867
+ assert(dst->nb[0] == sizeof(float));
8868
+ assert(src0->nb[0] == sizeof(float));
8869
+
8870
+ for (int i = 0; i < n; i++) {
8871
+ ggml_vec_leaky_f32(nc,
8872
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
8873
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
8874
+ }
8875
+ }
8876
+
8877
+ static void ggml_compute_forward_leaky(
8878
+ const struct ggml_compute_params * params,
8879
+ const struct ggml_tensor * src0,
8880
+ struct ggml_tensor * dst) {
8881
+ switch (src0->type) {
8882
+ case GGML_TYPE_F32:
8883
+ {
8884
+ ggml_compute_forward_leaky_f32(params, src0, dst);
8885
+ } break;
8886
+ default:
8887
+ {
8888
+ GGML_ASSERT(false);
8889
+ } break;
8890
+ }
8891
+ }
8892
+
8924
8893
  // ggml_compute_forward_silu_back
8925
8894
 
8926
8895
  static void ggml_compute_forward_silu_back_f32(
@@ -9404,6 +9373,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9404
9373
  // TODO: find the optimal values for these
9405
9374
  if (ggml_is_contiguous(src0) &&
9406
9375
  ggml_is_contiguous(src1) &&
9376
+ src0->type == GGML_TYPE_F32 &&
9377
+ src1->type == GGML_TYPE_F32 &&
9407
9378
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
9408
9379
 
9409
9380
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9442,7 +9413,7 @@ static void ggml_compute_forward_mul_mat(
9442
9413
 
9443
9414
  // we don't support permuted src0 or src1
9444
9415
  GGML_ASSERT(nb00 == ggml_type_size(type));
9445
- GGML_ASSERT(nb10 == sizeof(float));
9416
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
9446
9417
 
9447
9418
  // dst cannot be transposed or permuted
9448
9419
  GGML_ASSERT(nb0 == sizeof(float));
@@ -9640,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
9640
9611
  const int ith = params->ith;
9641
9612
  const int nth = params->nth;
9642
9613
 
9614
+ GGML_ASSERT(ne0 == ne00);
9615
+ GGML_ASSERT(ne1 == ne10);
9616
+ GGML_ASSERT(ne2 == ne02);
9643
9617
  GGML_ASSERT(ne02 == ne12);
9644
- GGML_ASSERT(ne03 == ne13);
9645
- GGML_ASSERT(ne2 == ne12);
9646
9618
  GGML_ASSERT(ne3 == ne13);
9619
+ GGML_ASSERT(ne03 == ne13);
9647
9620
 
9648
9621
  // we don't support permuted src0 or src1
9649
9622
  GGML_ASSERT(nb00 == sizeof(float));
@@ -9654,18 +9627,25 @@ static void ggml_compute_forward_out_prod_f32(
9654
9627
  // GGML_ASSERT(nb1 <= nb2);
9655
9628
  // GGML_ASSERT(nb2 <= nb3);
9656
9629
 
9657
- GGML_ASSERT(ne0 == ne00);
9658
- GGML_ASSERT(ne1 == ne10);
9659
- GGML_ASSERT(ne2 == ne02);
9660
- GGML_ASSERT(ne3 == ne03);
9661
-
9662
9630
  // nb01 >= nb00 - src0 is not transposed
9663
9631
  // compute by src0 rows
9664
9632
 
9665
9633
  // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
9666
- // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
9634
+ // TODO: #if defined(GGML_USE_CLBLAST)
9635
+
9636
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9637
+ bool use_blas = ggml_is_matrix(src0) &&
9638
+ ggml_is_matrix(src1) &&
9639
+ ggml_is_contiguous(src0) &&
9640
+ (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
9641
+ #endif
9667
9642
 
9668
9643
  if (params->type == GGML_TASK_INIT) {
9644
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
9645
+ if (use_blas) {
9646
+ return;
9647
+ }
9648
+ #endif
9669
9649
  ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
9670
9650
  return;
9671
9651
  }
@@ -9674,6 +9654,50 @@ static void ggml_compute_forward_out_prod_f32(
9674
9654
  return;
9675
9655
  }
9676
9656
 
9657
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9658
+ if (use_blas) {
9659
+ if (params->ith != 0) { // All threads other than the first do no work.
9660
+ return;
9661
+ }
9662
+ // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
9663
+ // src0: (k,n)
9664
+ // src1: (k,m)
9665
+ // dst: (m,n)
9666
+ //
9667
+ // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
9668
+ // Also expressed as (major,minor)
9669
+ // a: (m,k): so src1 transposed
9670
+ // b: (k,n): so src0
9671
+ // c: (m,n)
9672
+ //
9673
+ // However, if ggml_is_transposed(src1) is true, then
9674
+ // src1->data already contains a transposed version, so sgemm mustn't
9675
+ // transpose it further.
9676
+
9677
+ int n = src0->ne[0];
9678
+ int k = src0->ne[1];
9679
+ int m = src1->ne[0];
9680
+
9681
+ int transposeA, lda;
9682
+
9683
+ if (!ggml_is_transposed(src1)) {
9684
+ transposeA = CblasTrans;
9685
+ lda = m;
9686
+ } else {
9687
+ transposeA = CblasNoTrans;
9688
+ lda = k;
9689
+ }
9690
+
9691
+ float * a = (float *) ((char *) src1->data);
9692
+ float * b = (float *) ((char *) src0->data);
9693
+ float * c = (float *) ((char *) dst->data);
9694
+
9695
+ cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
9696
+
9697
+ return;
9698
+ }
9699
+ #endif
9700
+
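The BLAS branch above maps ggml's outer product onto a single cblas_sgemm. Restating the mapping from the comments for the plain 2-D, non-transposed case — a hedged sketch assuming contiguous row-major buffers, not ggml's actual tensor plumbing:

    #include <cblas.h>

    // src0: (k, n), src1: (k, m), dst: (m, n), all row-major and contiguous.
    // dst = src1^T * src0, which is what GGML_OP_OUT_PROD computes here.
    static void out_prod_sgemm(const float * src0, const float * src1, float * dst,
                               int n, int k, int m) {
        cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                    m, n, k,
                    1.0f, src1, m,   // A = src1 stored (k x m), op(A) = A^T, lda = m
                          src0, n,   // B = src0 stored (k x n), ldb = n
                    0.0f, dst,  n);  // C = dst  (m x n), ldc = n
    }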
9677
9701
  // dst[:,:,:,:] = 0
9678
9702
  // for i2,i3:
9679
9703
  // for i1:
@@ -11340,9 +11364,9 @@ static void ggml_compute_forward_rope_back(
11340
11364
  }
11341
11365
  }
11342
11366
 
11343
- // ggml_compute_forward_conv_1d
11367
+ // ggml_compute_forward_conv_transpose_1d
11344
11368
 
11345
- static void ggml_compute_forward_conv_1d_f16_f32(
11369
+ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11346
11370
  const struct ggml_compute_params * params,
11347
11371
  const struct ggml_tensor * src0,
11348
11372
  const struct ggml_tensor * src1,
@@ -11359,14 +11383,7 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11359
11383
  const int ith = params->ith;
11360
11384
  const int nth = params->nth;
11361
11385
 
11362
- const int nk = ne00;
11363
-
11364
- // size of the convolution row - the kernel size unrolled across all input channels
11365
- const int ew0 = nk*ne01;
11366
-
11367
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11368
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11369
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11386
+ const int nk = ne00*ne01*ne02;
11370
11387
 
11371
11388
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11372
11389
  GGML_ASSERT(nb10 == sizeof(float));
@@ -11374,23 +11391,37 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11374
11391
  if (params->type == GGML_TASK_INIT) {
11375
11392
  memset(params->wdata, 0, params->wsize);
11376
11393
 
11377
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11394
+ // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11395
+ {
11396
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11397
+
11398
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
11399
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11400
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11401
+ ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11402
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11403
+ dst_data[i00*ne02 + i02] = src[i00];
11404
+ }
11405
+ }
11406
+ }
11407
+ }
11378
11408
 
11379
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11380
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11409
+ // permute source data (src1) from (L x Cin) to (Cin x L)
11410
+ {
11411
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11381
11412
  ggml_fp16_t * dst_data = wdata;
11382
11413
 
11383
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11384
- for (int64_t ik = 0; ik < nk; ik++) {
11385
- const int idx0 = i0*s0 + ik*d0 - p0;
11386
-
11387
- if(!(idx0 < 0 || idx0 >= ne10)) {
11388
- dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
11389
- }
11414
+ for (int64_t i11 = 0; i11 < ne11; i11++) {
11415
+ const float * const src = (float *)((char *) src1->data + i11*nb11);
11416
+ for (int64_t i10 = 0; i10 < ne10; i10++) {
11417
+ dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
11390
11418
  }
11391
11419
  }
11392
11420
  }
11393
11421
 
11422
+ // need to zero dst since we are accumulating into it
11423
+ memset(dst->data, 0, ggml_nbytes(dst));
11424
+
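In the INIT pass above, conv_transpose_1d repacks the f16 kernel from (K x Cout x Cin) to (Cin x K x Cout) — i.e. Cin becomes the fastest-varying dimension — and the f32 source from (L x Cin) to (Cin x L) converted to f16; dst is zeroed because the main pass accumulates into it. The flattened index used for the repacked kernel:

    // Repacked kernel index for element (i00 = k, i01 = cout, i02 = cin):
    //   wdata[ i01*(K*Cin) + i00*Cin + i02 ]
    // so ggml_vec_dot_f16(ne02, ...) in the main pass walks Cin contiguously in
    // both the repacked kernel and the repacked source.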
11394
11425
  return;
11395
11426
  }
11396
11427
 
@@ -11398,8 +11429,10 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11398
11429
  return;
11399
11430
  }
11400
11431
 
11432
+ const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11433
+
11401
11434
  // total rows in dst
11402
- const int nr = ne2;
11435
+ const int nr = ne1;
11403
11436
 
11404
11437
  // rows per thread
11405
11438
  const int dr = (nr + nth - 1)/nth;
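The dr = (nr + nth - 1)/nth line above is the usual ceiling division used to split rows across threads; each thread then works on [ir0, ir1) = [dr*ith, MIN(ir0 + dr, nr)). Worked example (illustrative):

    // nr = 10 rows, nth = 4 threads  ->  dr = (10 + 3)/4 = 3
    //   thread 0: rows [0, 3)
    //   thread 1: rows [3, 6)
    //   thread 2: rows [6, 9)
    //   thread 3: rows [9, 10)   (MIN clamps the last range)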
@@ -11408,22 +11441,26 @@ static void ggml_compute_forward_conv_1d_f16_f32(
11408
11441
  const int ir0 = dr*ith;
11409
11442
  const int ir1 = MIN(ir0 + dr, nr);
11410
11443
 
11411
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11412
-
11413
- for (int i2 = 0; i2 < ne2; i2++) {
11414
- for (int i1 = ir0; i1 < ir1; i1++) {
11415
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11444
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11445
+ ggml_fp16_t * const wdata_src = wdata + nk;
11416
11446
 
11417
- for (int i0 = 0; i0 < ne0; i0++) {
11418
- ggml_vec_dot_f16(ew0, dst_data + i0,
11419
- (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
11420
- (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
11447
+ for (int i1 = ir0; i1 < ir1; i1++) {
11448
+ float * dst_data = (float *)((char *) dst->data + i1*nb1);
11449
+ ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
11450
+ for (int i10 = 0; i10 < ne10; i10++) {
11451
+ const int i1n = i10*ne11;
11452
+ for (int i00 = 0; i00 < ne00; i00++) {
11453
+ float v = 0;
11454
+ ggml_vec_dot_f16(ne02, &v,
11455
+ (ggml_fp16_t *) wdata_src + i1n,
11456
+ (ggml_fp16_t *) wdata_kernel + i00*ne02);
11457
+ dst_data[i10*s0 + i00] += v;
11421
11458
  }
11422
11459
  }
11423
11460
  }
11424
11461
  }
11425
11462
 
11426
- static void ggml_compute_forward_conv_1d_f32(
11463
+ static void ggml_compute_forward_conv_transpose_1d_f32(
11427
11464
  const struct ggml_compute_params * params,
11428
11465
  const struct ggml_tensor * src0,
11429
11466
  const struct ggml_tensor * src1,
@@ -11440,430 +11477,7 @@ static void ggml_compute_forward_conv_1d_f32(
11440
11477
  const int ith = params->ith;
11441
11478
  const int nth = params->nth;
11442
11479
 
11443
- const int nk = ne00;
11444
-
11445
- const int ew0 = nk*ne01;
11446
-
11447
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11448
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11449
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11450
-
11451
- GGML_ASSERT(nb00 == sizeof(float));
11452
- GGML_ASSERT(nb10 == sizeof(float));
11453
-
11454
- if (params->type == GGML_TASK_INIT) {
11455
- memset(params->wdata, 0, params->wsize);
11456
-
11457
- float * const wdata = (float *) params->wdata + 0;
11458
-
11459
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11460
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11461
- float * dst_data = wdata;
11462
-
11463
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11464
- for (int64_t ik = 0; ik < nk; ik++) {
11465
- const int idx0 = i0*s0 + ik*d0 - p0;
11466
-
11467
- if(!(idx0 < 0 || idx0 >= ne10)) {
11468
- dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
11469
- }
11470
- }
11471
- }
11472
- }
11473
-
11474
- return;
11475
- }
11476
-
11477
- if (params->type == GGML_TASK_FINALIZE) {
11478
- return;
11479
- }
11480
-
11481
- // total rows in dst
11482
- const int nr = ne02;
11483
-
11484
- // rows per thread
11485
- const int dr = (nr + nth - 1)/nth;
11486
-
11487
- // row range for this thread
11488
- const int ir0 = dr*ith;
11489
- const int ir1 = MIN(ir0 + dr, nr);
11490
-
11491
- float * const wdata = (float *) params->wdata + 0;
11492
-
11493
- for (int i2 = 0; i2 < ne2; i2++) {
11494
- for (int i1 = ir0; i1 < ir1; i1++) {
11495
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11496
-
11497
- for (int i0 = 0; i0 < ne0; i0++) {
11498
- ggml_vec_dot_f32(ew0, dst_data + i0,
11499
- (float *) ((char *) src0->data + i1*nb02),
11500
- (float *) wdata + i2*nb2 + i0*ew0);
11501
- }
11502
- }
11503
- }
11504
- }
11505
-
11506
- // TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
11507
- static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
11508
- ggml_fp16_t * A,
11509
- ggml_fp16_t * B,
11510
- float * C,
11511
- const int ith, const int nth) {
11512
- // does not seem to make a difference
11513
- int64_t m0, m1, n0, n1;
11514
- // patches per thread
11515
- if (m > n) {
11516
- n0 = 0;
11517
- n1 = n;
11518
-
11519
- // total patches in dst
11520
- const int np = m;
11521
-
11522
- // patches per thread
11523
- const int dp = (np + nth - 1)/nth;
11524
-
11525
- // patch range for this thread
11526
- m0 = dp*ith;
11527
- m1 = MIN(m0 + dp, np);
11528
- } else {
11529
- m0 = 0;
11530
- m1 = m;
11531
-
11532
- // total patches in dst
11533
- const int np = n;
11534
-
11535
- // patches per thread
11536
- const int dp = (np + nth - 1)/nth;
11537
-
11538
- // patch range for this thread
11539
- n0 = dp*ith;
11540
- n1 = MIN(n0 + dp, np);
11541
- }
11542
-
11543
- // block-tiling attempt
11544
- int64_t blck_n = 16;
11545
- int64_t blck_m = 16;
11546
-
11547
- // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
11548
- // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
11549
- // if (blck_size > 0) {
11550
- // blck_0 = 4;
11551
- // blck_1 = blck_size / blck_0;
11552
- // if (blck_1 < 0) {
11553
- // blck_1 = 1;
11554
- // }
11555
- // // blck_0 = (int64_t)sqrt(blck_size);
11556
- // // blck_1 = blck_0;
11557
- // }
11558
- // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
11559
-
11560
- for (int j = n0; j < n1; j+=blck_n) {
11561
- for (int i = m0; i < m1; i+=blck_m) {
11562
- // printf("i j k => %d %d %d\n", i, j, K);
11563
- for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
11564
- for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
11565
- ggml_vec_dot_f16(k,
11566
- C + ii*n + jj,
11567
- A + ii * k,
11568
- B + jj * k);
11569
- }
11570
- }
11571
- }
11572
- }
11573
- }
11574
-
11575
- // src0: kernel [OC, IC, K]
11576
- // src1: signal [N, IC, IL]
11577
- // dst: result [N, OL, IC*K]
11578
- static void ggml_compute_forward_conv_1d_stage_0_f32(
11579
- const struct ggml_compute_params * params,
11580
- const struct ggml_tensor * src0,
11581
- const struct ggml_tensor * src1,
11582
- struct ggml_tensor * dst) {
11583
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11584
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11585
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
11586
-
11587
- int64_t t0 = ggml_perf_time_us();
11588
- UNUSED(t0);
11589
-
11590
- GGML_TENSOR_BINARY_OP_LOCALS;
11591
-
11592
- const int64_t N = ne12;
11593
- const int64_t IC = ne11;
11594
- const int64_t IL = ne10;
11595
-
11596
- const int64_t K = ne00;
11597
-
11598
- const int64_t OL = ne1;
11599
-
11600
- const int ith = params->ith;
11601
- const int nth = params->nth;
11602
-
11603
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11604
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11605
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11606
-
11607
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11608
- GGML_ASSERT(nb10 == sizeof(float));
11609
-
11610
- if (params->type == GGML_TASK_INIT) {
11611
- memset(dst->data, 0, ggml_nbytes(dst));
11612
- return;
11613
- }
11614
-
11615
- if (params->type == GGML_TASK_FINALIZE) {
11616
- return;
11617
- }
11618
-
11619
- // im2col: [N, IC, IL] => [N, OL, IC*K]
11620
- {
11621
- ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
11622
-
11623
- for (int64_t in = 0; in < N; in++) {
11624
- for (int64_t iol = 0; iol < OL; iol++) {
11625
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11626
-
11627
- // micro kernel
11628
- ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
11629
- const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
11630
-
11631
- for (int64_t ik = 0; ik < K; ik++) {
11632
- const int64_t iil = iol*s0 + ik*d0 - p0;
11633
-
11634
- if (!(iil < 0 || iil >= IL)) {
11635
- dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
11636
- }
11637
- }
11638
- }
11639
- }
11640
- }
11641
- }
11642
- }
11643
-
11644
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11645
- // src0: [OC, IC, K]
11646
- // src1: [N, OL, IC * K]
11647
- // result: [N, OC, OL]
11648
- static void ggml_compute_forward_conv_1d_stage_1_f16(
11649
- const struct ggml_compute_params * params,
11650
- const struct ggml_tensor * src0,
11651
- const struct ggml_tensor * src1,
11652
- struct ggml_tensor * dst) {
11653
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11654
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
11655
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11656
-
11657
- int64_t t0 = ggml_perf_time_us();
11658
- UNUSED(t0);
11659
-
11660
- if (params->type == GGML_TASK_INIT) {
11661
- return;
11662
- }
11663
-
11664
- if (params->type == GGML_TASK_FINALIZE) {
11665
- return;
11666
- }
11667
-
11668
- GGML_TENSOR_BINARY_OP_LOCALS;
11669
-
11670
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11671
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
11672
- GGML_ASSERT(nb0 == sizeof(float));
11673
-
11674
- const int N = ne12;
11675
- const int OL = ne11;
11676
-
11677
- const int OC = ne02;
11678
- const int IC = ne01;
11679
- const int K = ne00;
11680
-
11681
- const int ith = params->ith;
11682
- const int nth = params->nth;
11683
-
11684
- int64_t m = OC;
11685
- int64_t n = OL;
11686
- int64_t k = IC * K;
11687
-
11688
- // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11689
- for (int i = 0; i < N; i++) {
11690
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
11691
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
11692
- float * C = (float *)dst->data + i * m * n; // [m, n]
11693
-
11694
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
11695
- }
11696
- }
11697
-
11698
- static void ggml_compute_forward_conv_1d(
11699
- const struct ggml_compute_params * params,
11700
- const struct ggml_tensor * src0,
11701
- const struct ggml_tensor * src1,
11702
- struct ggml_tensor * dst) {
11703
- switch(src0->type) {
11704
- case GGML_TYPE_F16:
11705
- {
11706
- ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
11707
- } break;
11708
- case GGML_TYPE_F32:
11709
- {
11710
- ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
11711
- } break;
11712
- default:
11713
- {
11714
- GGML_ASSERT(false);
11715
- } break;
11716
- }
11717
- }
11718
-
11719
- static void ggml_compute_forward_conv_1d_stage_0(
11720
- const struct ggml_compute_params * params,
11721
- const struct ggml_tensor * src0,
11722
- const struct ggml_tensor * src1,
11723
- struct ggml_tensor * dst) {
11724
- switch(src0->type) {
11725
- case GGML_TYPE_F16:
11726
- {
11727
- ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
11728
- } break;
11729
- default:
11730
- {
11731
- GGML_ASSERT(false);
11732
- } break;
11733
- }
11734
- }
11735
-
11736
- static void ggml_compute_forward_conv_1d_stage_1(
11737
- const struct ggml_compute_params * params,
11738
- const struct ggml_tensor * src0,
11739
- const struct ggml_tensor * src1,
11740
- struct ggml_tensor * dst) {
11741
- switch(src0->type) {
11742
- case GGML_TYPE_F16:
11743
- {
11744
- ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
11745
- } break;
11746
- default:
11747
- {
11748
- GGML_ASSERT(false);
11749
- } break;
11750
- }
11751
- }
11752
-
11753
- // ggml_compute_forward_conv_transpose_1d
11754
-
11755
- static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11756
- const struct ggml_compute_params * params,
11757
- const struct ggml_tensor * src0,
11758
- const struct ggml_tensor * src1,
11759
- struct ggml_tensor * dst) {
11760
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11761
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11762
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11763
-
11764
- int64_t t0 = ggml_perf_time_us();
11765
- UNUSED(t0);
11766
-
11767
- GGML_TENSOR_BINARY_OP_LOCALS
11768
-
11769
- const int ith = params->ith;
11770
- const int nth = params->nth;
11771
-
11772
- const int nk = ne00*ne01*ne02;
11773
-
11774
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11775
- GGML_ASSERT(nb10 == sizeof(float));
11776
-
11777
- if (params->type == GGML_TASK_INIT) {
11778
- memset(params->wdata, 0, params->wsize);
11779
-
11780
- // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11781
- {
11782
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11783
-
11784
- for (int64_t i02 = 0; i02 < ne02; i02++) {
11785
- for (int64_t i01 = 0; i01 < ne01; i01++) {
11786
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11787
- ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11788
- for (int64_t i00 = 0; i00 < ne00; i00++) {
11789
- dst_data[i00*ne02 + i02] = src[i00];
11790
- }
11791
- }
11792
- }
11793
- }
11794
-
11795
- // permute source data (src1) from (L x Cin) to (Cin x L)
11796
- {
11797
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11798
- ggml_fp16_t * dst_data = wdata;
11799
-
11800
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11801
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11802
- for (int64_t i10 = 0; i10 < ne10; i10++) {
11803
- dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
11804
- }
11805
- }
11806
- }
11807
-
11808
- // need to zero dst since we are accumulating into it
11809
- memset(dst->data, 0, ggml_nbytes(dst));
11810
-
11811
- return;
11812
- }
11813
-
11814
- if (params->type == GGML_TASK_FINALIZE) {
11815
- return;
11816
- }
11817
-
11818
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11819
-
11820
- // total rows in dst
11821
- const int nr = ne1;
11822
-
11823
- // rows per thread
11824
- const int dr = (nr + nth - 1)/nth;
11825
-
11826
- // row range for this thread
11827
- const int ir0 = dr*ith;
11828
- const int ir1 = MIN(ir0 + dr, nr);
11829
-
11830
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11831
- ggml_fp16_t * const wdata_src = wdata + nk;
11832
-
11833
- for (int i1 = ir0; i1 < ir1; i1++) {
11834
- float * dst_data = (float *)((char *) dst->data + i1*nb1);
11835
- ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
11836
- for (int i10 = 0; i10 < ne10; i10++) {
11837
- const int i1n = i10*ne11;
11838
- for (int i00 = 0; i00 < ne00; i00++) {
11839
- float v = 0;
11840
- ggml_vec_dot_f16(ne02, &v,
11841
- (ggml_fp16_t *) wdata_src + i1n,
11842
- (ggml_fp16_t *) wdata_kernel + i00*ne02);
11843
- dst_data[i10*s0 + i00] += v;
11844
- }
11845
- }
11846
- }
11847
- }
11848
-
11849
- static void ggml_compute_forward_conv_transpose_1d_f32(
11850
- const struct ggml_compute_params * params,
11851
- const struct ggml_tensor * src0,
11852
- const struct ggml_tensor * src1,
11853
- struct ggml_tensor * dst) {
11854
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
11855
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11856
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11857
-
11858
- int64_t t0 = ggml_perf_time_us();
11859
- UNUSED(t0);
11860
-
11861
- GGML_TENSOR_BINARY_OP_LOCALS
11862
-
11863
- const int ith = params->ith;
11864
- const int nth = params->nth;
11865
-
11866
- const int nk = ne00*ne01*ne02;
11480
+ const int nk = ne00*ne01*ne02;
11867
11481
 
11868
11482
  GGML_ASSERT(nb00 == sizeof(float));
11869
11483
  GGML_ASSERT(nb10 == sizeof(float));
@@ -11961,12 +11575,10 @@ static void ggml_compute_forward_conv_transpose_1d(
11961
11575
  }
11962
11576
  }
11963
11577
 
11964
- // ggml_compute_forward_conv_2d
11965
-
11966
11578
  // src0: kernel [OC, IC, KH, KW]
11967
11579
  // src1: image [N, IC, IH, IW]
11968
11580
  // dst: result [N, OH, OW, IC*KH*KW]
11969
- static void ggml_compute_forward_conv_2d_stage_0_f32(
11581
+ static void ggml_compute_forward_im2col_f16(
11970
11582
  const struct ggml_compute_params * params,
11971
11583
  const struct ggml_tensor * src0,
11972
11584
  const struct ggml_tensor * src1,
@@ -11980,34 +11592,35 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
11980
11592
 
11981
11593
  GGML_TENSOR_BINARY_OP_LOCALS;
11982
11594
 
11983
- const int64_t N = ne13;
11984
- const int64_t IC = ne12;
11985
- const int64_t IH = ne11;
11595
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
11596
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
11597
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
11598
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
11599
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
11600
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
11601
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
11602
+
11603
+ const int ith = params->ith;
11604
+ const int nth = params->nth;
11605
+
11606
+ const int64_t N = is_2D ? ne13 : ne12;
11607
+ const int64_t IC = is_2D ? ne12 : ne11;
11608
+ const int64_t IH = is_2D ? ne11 : 1;
11986
11609
  const int64_t IW = ne10;
11987
11610
 
11988
- // const int64_t OC = ne03;
11989
- // const int64_t IC = ne02;
11990
- const int64_t KH = ne01;
11611
+ const int64_t KH = is_2D ? ne01 : 1;
11991
11612
  const int64_t KW = ne00;
11992
11613
 
11993
- const int64_t OH = ne2;
11614
+ const int64_t OH = is_2D ? ne2 : 1;
11994
11615
  const int64_t OW = ne1;
11995
11616
 
11996
- const int ith = params->ith;
11997
- const int nth = params->nth;
11998
-
11999
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12000
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12001
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12002
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12003
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12004
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
11617
+ int ofs0 = is_2D ? nb13 : nb12;
11618
+ int ofs1 = is_2D ? nb12 : nb11;
12005
11619
 
12006
11620
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12007
11621
  GGML_ASSERT(nb10 == sizeof(float));
12008
11622
 
12009
11623
  if (params->type == GGML_TASK_INIT) {
12010
- memset(dst->data, 0, ggml_nbytes(dst));
12011
11624
  return;
12012
11625
  }
12013
11626
 
@@ -12020,20 +11633,22 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12020
11633
  ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
12021
11634
 
12022
11635
  for (int64_t in = 0; in < N; in++) {
12023
- for (int64_t ioh = 0; ioh < OH; ioh++) {
11636
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12024
11637
  for (int64_t iow = 0; iow < OW; iow++) {
12025
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11638
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12026
11639
 
12027
11640
  // micro kernel
12028
11641
  ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12029
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
11642
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12030
11643
 
12031
- for (int64_t ikh = 0; ikh < KH; ikh++) {
11644
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12032
11645
  for (int64_t ikw = 0; ikw < KW; ikw++) {
12033
11646
  const int64_t iiw = iow*s0 + ikw*d0 - p0;
12034
11647
  const int64_t iih = ioh*s1 + ikh*d1 - p1;
12035
11648
 
12036
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
11649
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
11650
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
11651
+ } else {
12037
11652
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12038
11653
  }
12039
11654
  }
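The im2col micro-kernel above computes, for output position (ioh, iow) and kernel tap (ikh, ikw), the input coordinates iih = ioh*s1 + ikh*d1 - p1 and iiw = iow*s0 + ikw*d0 - p0; taps that fall outside the input are now written as explicit zeros instead of relying on a pre-zeroed destination. Worked arithmetic for one output column (illustrative):

    // s0 = 1, d0 = 1, p0 = 1, KW = 3, IW = 5, output column iow = 0:
    //   ikw = 0 -> iiw = 0*1 + 0*1 - 1 = -1  -> out of range, patch value = 0 (padding)
    //   ikw = 1 -> iiw =  0                  -> src_data[iih*IW + 0]
    //   ikw = 2 -> iiw =  1                  -> src_data[iih*IW + 1]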
@@ -12045,180 +11660,7 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12045
11660
  }
12046
11661
  }
12047
11662
 
12048
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12049
- // src0: [OC, IC, KH, KW]
12050
- // src1: [N, OH, OW, IC * KH * KW]
12051
- // result: [N, OC, OH, OW]
12052
- static void ggml_compute_forward_conv_2d_stage_1_f16(
12053
- const struct ggml_compute_params * params,
12054
- const struct ggml_tensor * src0,
12055
- const struct ggml_tensor * src1,
12056
- struct ggml_tensor * dst) {
12057
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12058
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
12059
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12060
-
12061
- int64_t t0 = ggml_perf_time_us();
12062
- UNUSED(t0);
12063
-
12064
- if (params->type == GGML_TASK_INIT) {
12065
- return;
12066
- }
12067
-
12068
- if (params->type == GGML_TASK_FINALIZE) {
12069
- return;
12070
- }
12071
-
12072
- GGML_TENSOR_BINARY_OP_LOCALS;
12073
-
12074
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12075
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
12076
- GGML_ASSERT(nb0 == sizeof(float));
12077
-
12078
- const int N = ne13;
12079
- const int OH = ne12;
12080
- const int OW = ne11;
12081
-
12082
- const int OC = ne03;
12083
- const int IC = ne02;
12084
- const int KH = ne01;
12085
- const int KW = ne00;
12086
-
12087
- const int ith = params->ith;
12088
- const int nth = params->nth;
12089
-
12090
- int64_t m = OC;
12091
- int64_t n = OH * OW;
12092
- int64_t k = IC * KH * KW;
12093
-
12094
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12095
- for (int i = 0; i < N; i++) {
12096
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12097
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
12098
- float * C = (float *)dst->data + i * m * n; // [m, n]
12099
-
12100
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12101
- }
12102
- }
12103
-
12104
- static void ggml_compute_forward_conv_2d_f16_f32(
12105
- const struct ggml_compute_params * params,
12106
- const struct ggml_tensor * src0,
12107
- const struct ggml_tensor * src1,
12108
- struct ggml_tensor * dst) {
12109
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12110
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12111
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12112
-
12113
- int64_t t0 = ggml_perf_time_us();
12114
- UNUSED(t0);
12115
-
12116
- GGML_TENSOR_BINARY_OP_LOCALS
12117
-
12118
- // src1: image [N, IC, IH, IW]
12119
- // src0: kernel [OC, IC, KH, KW]
12120
- // dst: result [N, OC, OH, OW]
12121
- // ne12: IC
12122
- // ne0: OW
12123
- // ne1: OH
12124
- // nk0: KW
12125
- // nk1: KH
12126
- // ne13: N
12127
-
12128
- const int N = ne13;
12129
- const int IC = ne12;
12130
- const int IH = ne11;
12131
- const int IW = ne10;
12132
-
12133
- const int OC = ne03;
12134
- // const int IC = ne02;
12135
- const int KH = ne01;
12136
- const int KW = ne00;
12137
-
12138
- const int OH = ne1;
12139
- const int OW = ne0;
12140
-
12141
- const int ith = params->ith;
12142
- const int nth = params->nth;
12143
-
12144
- // const int nk0 = ne00;
12145
- // const int nk1 = ne01;
12146
-
12147
- // size of the convolution row - the kernel size unrolled across all channels
12148
- // const int ew0 = nk0*nk1*ne02;
12149
- // ew0: IC*KH*KW
12150
-
12151
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12152
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12153
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12154
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12155
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12156
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
12157
-
12158
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12159
- GGML_ASSERT(nb10 == sizeof(float));
12160
-
12161
- if (params->type == GGML_TASK_INIT) {
12162
- memset(params->wdata, 0, params->wsize);
12163
-
12164
- // prepare source data (src1)
12165
- // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
12166
-
12167
- {
12168
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12169
-
12170
- for (int in = 0; in < N; in++) {
12171
- for (int iic = 0; iic < IC; iic++) {
12172
- for (int ioh = 0; ioh < OH; ioh++) {
12173
- for (int iow = 0; iow < OW; iow++) {
12174
-
12175
- // micro kernel
12176
- ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12177
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
12178
-
12179
- for (int ikh = 0; ikh < KH; ikh++) {
12180
- for (int ikw = 0; ikw < KW; ikw++) {
12181
- const int iiw = iow*s0 + ikw*d0 - p0;
12182
- const int iih = ioh*s1 + ikh*d1 - p1;
12183
-
12184
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
12185
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12186
- }
12187
- }
12188
- }
12189
- }
12190
- }
12191
- }
12192
- }
12193
- }
12194
-
12195
- return;
12196
- }
12197
-
12198
- if (params->type == GGML_TASK_FINALIZE) {
12199
- return;
12200
- }
12201
-
12202
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12203
- // wdata: [N*OH*OW, IC*KH*KW]
12204
- // dst: result [N, OC, OH, OW]
12205
- // src0: kernel [OC, IC, KH, KW]
12206
-
12207
- int64_t m = OC;
12208
- int64_t n = OH * OW;
12209
- int64_t k = IC * KH * KW;
12210
-
12211
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12212
- for (int i = 0; i < N; i++) {
12213
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12214
- ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
12215
- float * C = (float *)dst->data + i * m * n; // [m * k]
12216
-
12217
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12218
- }
12219
- }
12220
-
12221
- static void ggml_compute_forward_conv_2d(
11663
+ static void ggml_compute_forward_im2col(
12222
11664
  const struct ggml_compute_params * params,
12223
11665
  const struct ggml_tensor * src0,
12224
11666
  const struct ggml_tensor * src1,
@@ -12226,50 +11668,7 @@ static void ggml_compute_forward_conv_2d(
12226
11668
  switch (src0->type) {
12227
11669
  case GGML_TYPE_F16:
12228
11670
  {
12229
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
12230
- } break;
12231
- case GGML_TYPE_F32:
12232
- {
12233
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
12234
- GGML_ASSERT(false);
12235
- } break;
12236
- default:
12237
- {
12238
- GGML_ASSERT(false);
12239
- } break;
12240
- }
12241
- }
12242
-
12243
- static void ggml_compute_forward_conv_2d_stage_0(
12244
- const struct ggml_compute_params * params,
12245
- const struct ggml_tensor * src0,
12246
- const struct ggml_tensor * src1,
12247
- struct ggml_tensor * dst) {
12248
- switch (src0->type) {
12249
- case GGML_TYPE_F16:
12250
- {
12251
- ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
12252
- } break;
12253
- case GGML_TYPE_F32:
12254
- {
12255
- GGML_ASSERT(false);
12256
- } break;
12257
- default:
12258
- {
12259
- GGML_ASSERT(false);
12260
- } break;
12261
- }
12262
- }
12263
-
12264
- static void ggml_compute_forward_conv_2d_stage_1(
12265
- const struct ggml_compute_params * params,
12266
- const struct ggml_tensor * src0,
12267
- const struct ggml_tensor * src1,
12268
- struct ggml_tensor * dst) {
12269
- switch (src0->type) {
12270
- case GGML_TYPE_F16:
12271
- {
12272
- ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
11671
+ ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12273
11672
  } break;
12274
11673
  case GGML_TYPE_F32:
12275
11674
  {
@@ -12454,14 +11853,11 @@ static void ggml_compute_forward_pool_1d(
12454
11853
  ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
12455
11854
  }
12456
11855
 
12457
- // ggml_compute_forward_pool_2d_sk_p0
11856
+ // ggml_compute_forward_pool_2d
12458
11857
 
12459
- static void ggml_compute_forward_pool_2d_sk_p0(
11858
+ static void ggml_compute_forward_pool_2d(
12460
11859
  const struct ggml_compute_params * params,
12461
- const enum ggml_op_pool op,
12462
11860
  const struct ggml_tensor * src,
12463
- const int k0,
12464
- const int k1,
12465
11861
  struct ggml_tensor * dst) {
12466
11862
  assert(src->type == GGML_TYPE_F32);
12467
11863
  assert(params->ith == 0);
@@ -12470,6 +11866,14 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12470
11866
  return;
12471
11867
  }
12472
11868
 
11869
+ const int32_t * opts = (const int32_t *)dst->op_params;
11870
+ enum ggml_op_pool op = opts[0];
11871
+ const int k0 = opts[1];
11872
+ const int k1 = opts[2];
11873
+ const int s0 = opts[3];
11874
+ const int s1 = opts[4];
11875
+ const int p0 = opts[5];
11876
+ const int p1 = opts[6];
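ggml_compute_forward_pool_2d now reads the kernel size, stride and padding directly from op_params (the old sk_p0 wrapper with its k == s, p == 0 asserts is removed further below). The hunks that follow turn the padding into negative start offsets and skip out-of-range taps, as sketched here:

    // ix = -p0 + ox*s0,  iy = -p1 + oy*s1; taps with ix+kx or iy+ky outside
    // the input are skipped. Example (illustrative): k0 = 2, s0 = 2, p0 = 1, input width 4:
    //   ox = 0 -> ix = -1 -> taps at x = -1 (skipped) and x = 0
    //   ox = 1 -> ix =  1 -> taps at x = 1, 2
    //   ox = 2 -> ix =  3 -> taps at x = 3, 4 (skipped)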
12473
11877
  const char * cdata = (const char*)src->data;
12474
11878
  const char * const data_end = cdata + ggml_nbytes(src);
12475
11879
 
@@ -12480,6 +11884,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12480
11884
  float * dplane = (float *)dst->data;
12481
11885
 
12482
11886
  const int ka = k0 * k1;
11887
+ const int offset0 = -p0;
11888
+ const int offset1 = -p1;
12483
11889
 
12484
11890
  while (cdata < data_end) {
12485
11891
  for (int oy = 0; oy < py; ++oy) {
@@ -12492,13 +11898,15 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12492
11898
  case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
12493
11899
  }
12494
11900
 
12495
- const int ix = ox * k0;
12496
- const int iy = oy * k1;
11901
+ const int ix = offset0 + ox * s0;
11902
+ const int iy = offset1 + oy * s1;
12497
11903
 
12498
11904
  for (int ky = 0; ky < k1; ++ky) {
11905
+ if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
12499
11906
  const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
12500
11907
  for (int kx = 0; kx < k0; ++kx) {
12501
11908
  int j = ix + kx;
11909
+ if (j < 0 || j >= src->ne[0]) continue;
12502
11910
  switch (op) {
12503
11911
  case GGML_OP_POOL_AVG: *out += srow[j]; break;
12504
11912
  case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
@@ -12519,29 +11927,6 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12519
11927
  }
12520
11928
  }
12521
11929
 
12522
- // ggml_compute_forward_pool_2d
12523
-
12524
- static void ggml_compute_forward_pool_2d(
12525
- const struct ggml_compute_params * params,
12526
- const struct ggml_tensor * src0,
12527
- struct ggml_tensor * dst) {
12528
-
12529
- const int32_t * opts = (const int32_t *)dst->op_params;
12530
- enum ggml_op_pool op = opts[0];
12531
- const int k0 = opts[1];
12532
- const int k1 = opts[2];
12533
- const int s0 = opts[3];
12534
- const int s1 = opts[4];
12535
- const int p0 = opts[5];
12536
- const int p1 = opts[6];
12537
- GGML_ASSERT(p0 == 0);
12538
- GGML_ASSERT(p1 == 0); // padding not supported
12539
- GGML_ASSERT(k0 == s0);
12540
- GGML_ASSERT(k1 == s1); // only s = k supported
12541
-
12542
- ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
12543
- }
12544
-
12545
11930
  // ggml_compute_forward_upscale
12546
11931
 
12547
11932
  static void ggml_compute_forward_upscale_f32(
@@ -13743,6 +13128,10 @@ static void ggml_compute_forward_unary(
13743
13128
  {
13744
13129
  ggml_compute_forward_silu(params, src0, dst);
13745
13130
  } break;
13131
+ case GGML_UNARY_OP_LEAKY:
13132
+ {
13133
+ ggml_compute_forward_leaky(params, src0, dst);
13134
+ } break;
13746
13135
  default:
13747
13136
  {
13748
13137
  GGML_ASSERT(false);
@@ -14496,33 +13885,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14496
13885
  {
14497
13886
  ggml_compute_forward_clamp(params, tensor->src[0], tensor);
14498
13887
  } break;
14499
- case GGML_OP_CONV_1D:
14500
- {
14501
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
14502
- } break;
14503
- case GGML_OP_CONV_1D_STAGE_0:
14504
- {
14505
- ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14506
- } break;
14507
- case GGML_OP_CONV_1D_STAGE_1:
14508
- {
14509
- ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
14510
- } break;
14511
13888
  case GGML_OP_CONV_TRANSPOSE_1D:
14512
13889
  {
14513
13890
  ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
14514
13891
  } break;
14515
- case GGML_OP_CONV_2D:
13892
+ case GGML_OP_IM2COL:
14516
13893
  {
14517
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14518
- } break;
14519
- case GGML_OP_CONV_2D_STAGE_0:
14520
- {
14521
- ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14522
- } break;
14523
- case GGML_OP_CONV_2D_STAGE_1:
14524
- {
14525
- ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
13894
+ ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
14526
13895
  } break;
14527
13896
  case GGML_OP_CONV_TRANSPOSE_2D:
14528
13897
  {
@@ -14651,62 +14020,109 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14651
14020
 
14652
14021
  ////////////////////////////////////////////////////////////////////////////////
14653
14022
 
14654
- static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
14023
+ static size_t ggml_hash_size(size_t min_sz) {
14024
+ // next primes after powers of two
14025
+ static const size_t primes[] = {
14026
+ 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
14027
+ 2053, 4099, 8209, 16411, 32771, 65537, 131101,
14028
+ 262147, 524309, 1048583, 2097169, 4194319, 8388617,
14029
+ 16777259, 33554467, 67108879, 134217757, 268435459,
14030
+ 536870923, 1073741827, 2147483659
14031
+ };
14032
+ static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
14033
+
14034
+ // find the smallest prime that is larger or equal to min_sz
14035
+ size_t l = 0;
14036
+ size_t r = n_primes;
14037
+ while (l < r) {
14038
+ size_t m = (l + r)/2;
14039
+ if (primes[m] < min_sz) {
14040
+ l = m + 1;
14041
+ } else {
14042
+ r = m;
14043
+ }
14044
+ }
14045
+ size_t sz = l < n_primes ? primes[l] : min_sz | 1;
14046
+ return sz;
14047
+ }
14655
14048
 
14656
- static size_t hash(void * p) {
14657
- return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
14049
+ static size_t ggml_hash(const void * p) {
14050
+ return (size_t)p;
14658
14051
  }
14659
14052
 
14660
- static size_t hash_find(void * hash_table[], void * p) {
14661
- size_t h = hash(p);
14053
+ size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14054
+ size_t h = ggml_hash(key) % hash_set.size;
14662
14055
 
14663
14056
  // linear probing
14664
14057
  size_t i = h;
14665
- while (hash_table[i] != NULL && hash_table[i] != p) {
14666
- i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
14058
+ while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
14059
+ i = (i + 1) % hash_set.size;
14667
14060
  if (i == h) {
14668
14061
  // visited all hash table entries -> not found
14669
- return GGML_GRAPH_HASHTABLE_SIZE;
14062
+ return GGML_HASHTABLE_FULL;
14670
14063
  }
14671
14064
  }
14672
14065
  return i;
14673
14066
  }
14674
14067
 
14675
- static bool hash_insert(void * hash_table[], void * p) {
14676
- size_t i = hash_find(hash_table, p);
14068
+ bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14069
+ size_t i = ggml_hash_find(hash_set, key);
14070
+ return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
14071
+ }
14072
+
14073
+ size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14074
+ size_t i = ggml_hash_find(hash_set, key);
14677
14075
 
14678
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14076
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14679
14077
 
14680
- if (hash_table[i] == p) {
14681
- return true;
14078
+ if (hash_set.keys[i] == key) {
14079
+ return GGML_HASHTABLE_ALREADY_EXISTS;
14682
14080
  }
14683
14081
 
14684
14082
  // insert
14685
- GGML_ASSERT(hash_table[i] == NULL);
14686
- hash_table[i] = p;
14687
- return false;
14083
+ GGML_ASSERT(hash_set.keys[i] == NULL);
14084
+ hash_set.keys[i] = key;
14085
+ return i;
14086
+ }
14087
+
14088
+ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14089
+ size_t i = ggml_hash_find(hash_set, key);
14090
+
14091
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14092
+
14093
+ hash_set.keys[i] = key;
14094
+ return i;
14095
+ }
14096
+
14097
+ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
14098
+ size = ggml_hash_size(size);
14099
+ struct ggml_hash_set result;
14100
+ result.size = size;
14101
+ result.keys = malloc(sizeof(struct ggml_tensor *) * size);
14102
+ memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
14103
+ return result;
14688
14104
  }
14689
14105
 
14690
- static bool hash_contains(void * hash_table[], void * p) {
14691
- size_t i = hash_find(hash_table, p);
14692
- return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
14106
+ static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
14107
+ free(hash_set.keys);
14693
14108
  }
14694
14109
 
14695
14110
  struct hash_map {
14696
- void * keys[GGML_GRAPH_HASHTABLE_SIZE];
14697
- void * vals[GGML_GRAPH_HASHTABLE_SIZE];
14111
+ struct ggml_hash_set set;
14112
+ struct ggml_tensor ** vals;
14698
14113
  };
14699
14114
 
14700
- static struct hash_map * new_hash_map(void) {
14115
+ static struct hash_map * ggml_new_hash_map(size_t size) {
14701
14116
  struct hash_map * result = malloc(sizeof(struct hash_map));
14702
- for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
14703
- result->keys[i] = NULL;
14704
- result->vals[i] = NULL;
14705
- }
14117
+ result->set = ggml_hash_set_new(size);
14118
+ result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
14119
+ memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
14706
14120
  return result;
14707
14121
  }
14708
14122
 
14709
- static void free_hash_map(struct hash_map * map) {
14123
+ static void ggml_hash_map_free(struct hash_map * map) {
14124
+ ggml_hash_set_free(map->set);
14125
+ free(map->vals);
14710
14126
  free(map);
14711
14127
  }
14712
14128
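The functions above replace the old fixed-size visited hash table with a sized, prime-capacity ggml_hash_set using linear probing, plus a hash_map that pairs the set with a value array. A minimal sketch of the usage pattern the graph code follows (illustrative; tensors and n are hypothetical, and ggml_hash_set_new is file-local in this version):

// deduplicate tensors the way the graph code does with its visited/zero tables
struct ggml_hash_set seen = ggml_hash_set_new(2*n);              // rounded up to the next prime internally
for (int i = 0; i < n; ++i) {
    if (ggml_hash_insert(seen, tensors[i]) == GGML_HASHTABLE_ALREADY_EXISTS) {
        continue;                                                // already recorded
    }
    // first occurrence: the slot index returned by ggml_hash_insert can index a
    // parallel value array, which is exactly what struct hash_map does
}
ggml_hash_set_free(seen);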
 
@@ -14726,7 +14142,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14726
14142
  return node;
14727
14143
  }
14728
14144
 
14729
- if (!hash_contains(graph->visited_hash_table, node)) {
14145
+ if (!ggml_hash_contains(graph->visited_hash_table, node)) {
14730
14146
  return node;
14731
14147
  }
14732
14148
 
@@ -14741,17 +14157,17 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14741
14157
  return node;
14742
14158
  }
14743
14159
 
14744
- size_t i = hash_find(replacements->keys, node);
14745
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14746
- if (replacements->keys[i] == node) {
14747
- return (struct ggml_tensor *) replacements->vals[i];
14160
+ size_t i = ggml_hash_find(replacements->set, node);
14161
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
14162
+ if (replacements->set.keys[i] == node) {
14163
+ return replacements->vals[i];
14748
14164
  }
14749
14165
 
14750
14166
  struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14751
14167
 
14752
14168
  // insert clone into replacements
14753
- GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
14754
- replacements->keys[i] = node;
14169
+ GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
14170
+ replacements->set.keys[i] = node;
14755
14171
  replacements->vals[i] = clone;
14756
14172
 
14757
14173
  clone->op = node->op;
@@ -14788,26 +14204,26 @@ void ggml_build_backward_gradient_checkpointing(
14788
14204
  struct ggml_cgraph * gb_tmp,
14789
14205
  struct ggml_tensor * * checkpoints,
14790
14206
  int n_checkpoints) {
14791
- *gb_tmp = *gf;
14207
+ ggml_graph_cpy(gf, gb_tmp);
14792
14208
  ggml_build_backward_expand(ctx, gf, gb_tmp, true);
14793
14209
 
14794
14210
  if (n_checkpoints <= 0) {
14795
- *gb = *gb_tmp;
14211
+ ggml_graph_cpy(gb_tmp, gb);
14796
14212
  return;
14797
14213
  }
14798
14214
 
14799
- struct hash_map * replacements = new_hash_map();
14215
+ struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
14800
14216
 
14801
14217
  // insert checkpoints in replacements
14802
14218
  for (int i = 0; i < n_checkpoints; ++i) {
14803
- size_t k = hash_find(replacements->keys, checkpoints[i]);
14804
- GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14805
- GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
14806
- replacements->keys[k] = checkpoints[i];
14807
- replacements->vals[k] = checkpoints[i];
14219
+ size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
14220
+ GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
14221
+ GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
14222
+ replacements->set.keys[k] = checkpoints[i];
14223
+ replacements->vals[k] = checkpoints[i];
14808
14224
  }
14809
14225
 
14810
- *gb = *gf;
14226
+ ggml_graph_cpy(gf, gb);
14811
14227
  // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
14812
14228
  // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
14813
14229
  // by recomputing them from checkpoints
@@ -14824,21 +14240,21 @@ void ggml_build_backward_gradient_checkpointing(
14824
14240
  ggml_build_forward_expand(gb, node);
14825
14241
  }
14826
14242
 
14827
- free_hash_map(replacements);
14243
+ ggml_hash_map_free(replacements);
14828
14244
  }
14829
14245
 
14830
14246
  // functions to change gradients considering the case that input a might be initial gradient with zero value
14831
14247
 
14832
- static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14833
- if (hash_contains(zero_table, a)) {
14248
+ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14249
+ if (ggml_hash_contains(zero_table, a)) {
14834
14250
  return b;
14835
14251
  } else {
14836
14252
  return ggml_add_impl(ctx, a, b, false);
14837
14253
  }
14838
14254
  }
14839
14255
 
14840
- static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) {
14841
- if (hash_contains(zero_table, a)) {
14256
+ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14257
+ if (ggml_hash_contains(zero_table, a)) {
14842
14258
  struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
14843
14259
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
14844
14260
  } else {
@@ -14846,23 +14262,23 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
14846
14262
  }
14847
14263
  }
14848
14264
 
14849
- static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14850
- if (hash_contains(zero_table, a)) {
14265
+ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14266
+ if (ggml_hash_contains(zero_table, a)) {
14851
14267
  return ggml_repeat(ctx, b, a);
14852
14268
  } else {
14853
14269
  return ggml_add1_impl(ctx, a, b, false);
14854
14270
  }
14855
14271
  }
14856
14272
 
14857
- static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
14858
- if (hash_contains(zero_table, a)) {
14273
+ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14274
+ if (ggml_hash_contains(zero_table, a)) {
14859
14275
  return ggml_neg(ctx, b);
14860
14276
  } else {
14861
14277
  return ggml_sub_impl(ctx, a, b, false);
14862
14278
  }
14863
14279
  }
14864
14280
 
14865
- static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) {
14281
+ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
14866
14282
  struct ggml_tensor * src0 = tensor->src[0];
14867
14283
  struct ggml_tensor * src1 = tensor->src[1];
14868
14284
 
@@ -15457,31 +14873,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15457
14873
  {
15458
14874
  GGML_ASSERT(false); // TODO: not implemented
15459
14875
  } break;
15460
- case GGML_OP_CONV_1D:
15461
- {
15462
- GGML_ASSERT(false); // TODO: not implemented
15463
- } break;
15464
- case GGML_OP_CONV_1D_STAGE_0:
15465
- {
15466
- GGML_ASSERT(false); // TODO: not implemented
15467
- } break;
15468
- case GGML_OP_CONV_1D_STAGE_1:
15469
- {
15470
- GGML_ASSERT(false); // TODO: not implemented
15471
- } break;
15472
14876
  case GGML_OP_CONV_TRANSPOSE_1D:
15473
14877
  {
15474
14878
  GGML_ASSERT(false); // TODO: not implemented
15475
14879
  } break;
15476
- case GGML_OP_CONV_2D:
15477
- {
15478
- GGML_ASSERT(false); // TODO: not implemented
15479
- } break;
15480
- case GGML_OP_CONV_2D_STAGE_0:
15481
- {
15482
- GGML_ASSERT(false); // TODO: not implemented
15483
- } break;
15484
- case GGML_OP_CONV_2D_STAGE_1:
14880
+ case GGML_OP_IM2COL:
15485
14881
  {
15486
14882
  GGML_ASSERT(false); // TODO: not implemented
15487
14883
  } break;
@@ -15695,7 +15091,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15695
15091
  }
15696
15092
 
15697
15093
  // check if already visited
15698
- if (hash_insert(cgraph->visited_hash_table, node)) {
15094
+ if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
15699
15095
  return;
15700
15096
  }
15701
15097
 
@@ -15711,7 +15107,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15711
15107
 
15712
15108
  if (node->op == GGML_OP_NONE && node->grad == NULL) {
15713
15109
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
15714
- GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15110
+ GGML_ASSERT(cgraph->n_leafs < cgraph->size);
15715
15111
 
15716
15112
  if (strlen(node->name) == 0) {
15717
15113
  ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
@@ -15720,22 +15116,24 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15720
15116
  cgraph->leafs[cgraph->n_leafs] = node;
15721
15117
  cgraph->n_leafs++;
15722
15118
  } else {
15723
- GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15119
+ GGML_ASSERT(cgraph->n_nodes < cgraph->size);
15724
15120
 
15725
15121
  if (strlen(node->name) == 0) {
15726
15122
  ggml_format_name(node, "node_%d", cgraph->n_nodes);
15727
15123
  }
15728
15124
 
15729
15125
  cgraph->nodes[cgraph->n_nodes] = node;
15730
- cgraph->grads[cgraph->n_nodes] = node->grad;
15126
+ if (cgraph->grads) {
15127
+ cgraph->grads[cgraph->n_nodes] = node->grad;
15128
+ }
15731
15129
  cgraph->n_nodes++;
15732
15130
  }
15733
15131
  }
15734
15132
 
15735
15133
  static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
15736
15134
  if (!expand) {
15737
- cgraph->n_nodes = 0;
15738
- cgraph->n_leafs = 0;
15135
+ // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
15136
+ ggml_graph_clear(cgraph);
15739
15137
  }
15740
15138
 
15741
15139
  const int n0 = cgraph->n_nodes;
@@ -15756,25 +15154,6 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
15756
15154
  ggml_build_forward_impl(cgraph, tensor, true);
15757
15155
  }
15758
15156
 
15759
- struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
15760
- struct ggml_cgraph result = {
15761
- /*.n_nodes =*/ 0,
15762
- /*.n_leafs =*/ 0,
15763
- /*.nodes =*/ { NULL },
15764
- /*.grads =*/ { NULL },
15765
- /*.leafs =*/ { NULL },
15766
- /*.hash_table =*/ { NULL },
15767
- /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15768
- /*.perf_runs =*/ 0,
15769
- /*.perf_cycles =*/ 0,
15770
- /*.perf_time_us =*/ 0,
15771
- };
15772
-
15773
- ggml_build_forward_impl(&result, tensor, false);
15774
-
15775
- return result;
15776
- }
15777
-
15778
15157
  void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
15779
15158
  GGML_ASSERT(gf->n_nodes > 0);
15780
15159
 
@@ -15791,11 +15170,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15791
15170
  }
15792
15171
 
15793
15172
  // remember original gradients which start with zero values
15794
- void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE);
15795
- memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE);
15173
+ struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
15796
15174
  for (int i = 0; i < gf->n_nodes; i++) {
15797
15175
  if (gf->grads[i]) {
15798
- hash_insert(zero_table, gf->grads[i]);
15176
+ ggml_hash_insert(zero_table, gf->grads[i]);
15799
15177
  }
15800
15178
  }
15801
15179
 
@@ -15818,26 +15196,54 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15818
15196
  }
15819
15197
  }
15820
15198
 
15821
- free(zero_table);
15199
+ ggml_hash_set_free(zero_table);
15822
15200
  }
15823
15201
 
15824
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
15825
- struct ggml_cgraph result = *gf;
15826
- ggml_build_backward_expand(ctx, gf, &result, keep);
15827
- return result;
15202
+ static size_t ggml_graph_nbytes(size_t size, bool grads) {
15203
+ size_t nbytes = sizeof(struct ggml_cgraph);
15204
+ nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
15205
+ if (grads) {
15206
+ nbytes += size * sizeof(struct ggml_tensor *); // grads
15207
+ }
15208
+ nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
15209
+ return nbytes;
15828
15210
  }
15829
15211
 
15830
- struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15831
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15212
+ size_t ggml_graph_overhead_custom(size_t size, bool grads) {
15213
+ return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
15214
+ }
15215
+
15216
+ size_t ggml_graph_overhead(void) {
15217
+ return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
15218
+ }
15219
+
15220
+ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
15221
+ const size_t obj_size = ggml_graph_nbytes(size, grads);
15222
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15832
15223
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15833
15224
 
15225
+ struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
15226
+
15227
+ size_t hash_size = ggml_hash_size(size * 2);
15228
+ struct ggml_tensor ** nodes_ptr = data_start;
15229
+ struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
15230
+ struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
15231
+ struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
15232
+
15233
+ // check that we allocated the correct amount of memory
15234
+ assert(obj_size == (size_t) (
15235
+ (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
15236
+
15237
+ memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
15238
+
15834
15239
  *cgraph = (struct ggml_cgraph) {
15240
+ /*.size =*/ size,
15835
15241
  /*.n_nodes =*/ 0,
15836
15242
  /*.n_leafs =*/ 0,
15837
- /*.nodes =*/ { NULL },
15838
- /*.grads =*/ { NULL },
15839
- /*.leafs =*/ { NULL },
15840
- /*.hash_table =*/ { NULL },
15243
+ /*.nodes =*/ nodes_ptr,
15244
+ /*.grads =*/ grads_ptr,
15245
+ /*.leafs =*/ leafs_ptr,
15246
+ /*.hash_table =*/ { hash_size, hash_keys_ptr },
15841
15247
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15842
15248
  /*.perf_runs =*/ 0,
15843
15249
  /*.perf_cycles =*/ 0,
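ggml_new_graph_custom lays the whole graph out in one object allocation: the ggml_cgraph header, the nodes and leafs pointer arrays, the visited hash-set keys, and optionally the grads array. A stand-alone sketch of the same size computation as ggml_graph_nbytes (illustrative; the 2048 default graph size is an assumption of this sketch):

#include <stdio.h>
#include <stddef.h>

// subset of the prime table used by ggml_hash_size above
static size_t next_prime_demo(size_t min_sz) {
    static const size_t primes[] = { 2053, 4099, 8209, 16411 };
    for (size_t i = 0; i < sizeof(primes)/sizeof(primes[0]); ++i) {
        if (primes[i] >= min_sz) return primes[i];
    }
    return min_sz | 1;
}

int main(void) {
    const size_t size      = 2048;                                    // assumed GGML_DEFAULT_GRAPH_SIZE
    const size_t ptr_bytes = size*sizeof(void *)*2                    // nodes + leafs
                           + next_prime_demo(size*2)*sizeof(void *);  // visited hash-set keys
    printf("pointer arrays without grads: %zu bytes (+ sizeof(struct ggml_cgraph))\n", ptr_bytes);
    printf("grads add another %zu bytes when enabled\n", size*sizeof(void *));
    return 0;
}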
@@ -15847,14 +15253,85 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15847
15253
  return cgraph;
15848
15254
  }
15849
15255
 
15850
- struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
15851
- struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
15852
- ggml_build_forward_impl(cgraph, tensor, false);
15256
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15257
+ return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15258
+ }
15259
+
15260
+ struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15261
+ const size_t obj_size = sizeof(struct ggml_cgraph);
15262
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15263
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15264
+
15265
+ *cgraph = (struct ggml_cgraph) {
15266
+ /*.size =*/ 0,
15267
+ /*.n_nodes =*/ i1 - i0,
15268
+ /*.n_leafs =*/ 0,
15269
+ /*.nodes =*/ cgraph0->nodes + i0,
15270
+ /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
15271
+ /*.leafs =*/ NULL,
15272
+ /*.hash_table =*/ { 0, NULL },
15273
+ /*.order =*/ cgraph0->order,
15274
+ /*.perf_runs =*/ 0,
15275
+ /*.perf_cycles =*/ 0,
15276
+ /*.perf_time_us =*/ 0,
15277
+ };
15278
+
15853
15279
  return cgraph;
15854
15280
  }
15855
15281
 
15856
- size_t ggml_graph_overhead(void) {
15857
- return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15282
+ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
15283
+ GGML_ASSERT(dst->size >= src->n_leafs);
15284
+ GGML_ASSERT(dst->size >= src->n_nodes);
15285
+ GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
15286
+
15287
+ dst->n_leafs = src->n_leafs;
15288
+ dst->n_nodes = src->n_nodes;
15289
+ dst->order = src->order;
15290
+
15291
+ for (int i = 0; i < src->n_leafs; ++i) {
15292
+ dst->leafs[i] = src->leafs[i];
15293
+ }
15294
+
15295
+ for (int i = 0; i < src->n_nodes; ++i) {
15296
+ dst->nodes[i] = src->nodes[i];
15297
+ }
15298
+
15299
+ if (src->grads) {
15300
+ GGML_ASSERT(dst->grads != NULL);
15301
+ for (int i = 0; i < src->n_nodes; ++i) {
15302
+ dst->grads[i] = src->grads[i];
15303
+ }
15304
+ }
15305
+
15306
+ for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
15307
+ if (src->visited_hash_table.keys[i]) {
15308
+ ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
15309
+ }
15310
+ }
15311
+ }
15312
+
15313
+ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
15314
+ struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
15315
+ ggml_graph_cpy(cgraph, result);
15316
+ return result;
15317
+ }
15318
+
15319
+ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
15320
+ GGML_ASSERT(cgraph->grads != NULL);
15321
+
15322
+ for (int i = 0; i < cgraph->n_nodes; i++) {
15323
+ struct ggml_tensor * grad = cgraph->grads[i];
15324
+
15325
+ if (grad) {
15326
+ ggml_set_zero(grad);
15327
+ }
15328
+ }
15329
+ }
15330
+
15331
+ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
15332
+ cgraph->n_leafs = 0;
15333
+ cgraph->n_nodes = 0;
15334
+ memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
15858
15335
  }
15859
15336
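Together with the removal of ggml_build_forward further up, these helpers define the new graph lifecycle: allocate with ggml_new_graph / ggml_new_graph_custom, populate with ggml_build_forward_expand, then duplicate, reset or clear as needed. A migration sketch (illustrative; ctx and the output tensor out are assumed to exist):

// old (removed in this version): struct ggml_cgraph gf = ggml_build_forward(out);
struct ggml_cgraph * gf = ggml_new_graph(ctx);                 // GGML_DEFAULT_GRAPH_SIZE nodes, no grads
ggml_build_forward_expand(gf, out);

// when gradients are needed, allocate the graph with grads and duplicate it for backward
struct ggml_cgraph * gfg = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
ggml_build_forward_expand(gfg, out);
struct ggml_cgraph * gb  = ggml_graph_dup(ctx, gfg);
ggml_build_backward_expand(ctx, gfg, gb, /*keep =*/ true);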
 
15860
15337
  //
@@ -15966,45 +15443,266 @@ static void clear_numa_thread_affinity(void) {
15966
15443
  strerror(rv));
15967
15444
  }
15968
15445
 
15969
- CPU_FREE(cpus);
15970
- }
15971
- #else
15972
- // TODO: Windows etc.
15973
- // (the linux implementation may also work on BSD, someone should test)
15974
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15975
- static void clear_numa_thread_affinity(void) {}
15976
- #endif
15977
-
15978
- struct ggml_compute_state_shared {
15979
- const struct ggml_cgraph * cgraph;
15980
- const struct ggml_cplan * cplan;
15981
-
15982
- int64_t perf_node_start_cycles;
15983
- int64_t perf_node_start_time_us;
15984
-
15985
- const int n_threads;
15986
-
15987
- // synchronization primitives
15988
- atomic_int n_active; // num active threads
15989
- atomic_int node_n; // active graph node
15990
-
15991
- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15992
- void * abort_callback_data;
15993
- };
15994
-
15995
- struct ggml_compute_state {
15996
- ggml_thread_t thrd;
15997
- int ith;
15998
- struct ggml_compute_state_shared * shared;
15999
- };
16000
-
16001
- static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16002
- int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16003
- int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15446
+ CPU_FREE(cpus);
15447
+ }
15448
+ #else
15449
+ // TODO: Windows etc.
15450
+ // (the linux implementation may also work on BSD, someone should test)
15451
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15452
+ static void clear_numa_thread_affinity(void) {}
15453
+ #endif
15454
+
15455
+ struct ggml_compute_state_shared {
15456
+ const struct ggml_cgraph * cgraph;
15457
+ const struct ggml_cplan * cplan;
15458
+
15459
+ int64_t perf_node_start_cycles;
15460
+ int64_t perf_node_start_time_us;
15461
+
15462
+ const int n_threads;
15463
+
15464
+ // synchronization primitives
15465
+ atomic_int n_active; // num active threads
15466
+ atomic_int node_n; // active graph node
15467
+
15468
+ bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15469
+ void * abort_callback_data;
15470
+ };
15471
+
15472
+ struct ggml_compute_state {
15473
+ ggml_thread_t thrd;
15474
+ int ith;
15475
+ struct ggml_compute_state_shared * shared;
15476
+ };
15477
+
15478
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
15479
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
15480
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15481
+
15482
+ node->perf_runs++;
15483
+ node->perf_cycles += cycles_cur;
15484
+ node->perf_time_us += time_us_cur;
15485
+ }
15486
+
15487
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15488
+ int n_tasks = 0;
15489
+
15490
+ switch (node->op) {
15491
+ case GGML_OP_CPY:
15492
+ case GGML_OP_DUP:
15493
+ case GGML_OP_ADD:
15494
+ case GGML_OP_ADD1:
15495
+ case GGML_OP_ACC:
15496
+ {
15497
+ n_tasks = n_threads;
15498
+ } break;
15499
+ case GGML_OP_SUB:
15500
+ case GGML_OP_DIV:
15501
+ case GGML_OP_SQR:
15502
+ case GGML_OP_SQRT:
15503
+ case GGML_OP_LOG:
15504
+ case GGML_OP_SUM:
15505
+ case GGML_OP_SUM_ROWS:
15506
+ case GGML_OP_MEAN:
15507
+ case GGML_OP_ARGMAX:
15508
+ case GGML_OP_REPEAT:
15509
+ case GGML_OP_REPEAT_BACK:
15510
+ {
15511
+ n_tasks = 1;
15512
+ } break;
15513
+ case GGML_OP_UNARY:
15514
+ switch (ggml_get_unary_op(node)) {
15515
+ case GGML_UNARY_OP_ABS:
15516
+ case GGML_UNARY_OP_SGN:
15517
+ case GGML_UNARY_OP_NEG:
15518
+ case GGML_UNARY_OP_STEP:
15519
+ case GGML_UNARY_OP_TANH:
15520
+ case GGML_UNARY_OP_ELU:
15521
+ case GGML_UNARY_OP_RELU:
15522
+ case GGML_UNARY_OP_LEAKY:
15523
+ {
15524
+ n_tasks = 1;
15525
+ } break;
15526
+
15527
+ case GGML_UNARY_OP_GELU:
15528
+ case GGML_UNARY_OP_GELU_QUICK:
15529
+ case GGML_UNARY_OP_SILU:
15530
+ {
15531
+ n_tasks = n_threads;
15532
+ } break;
15533
+ }
15534
+ break;
15535
+ case GGML_OP_SILU_BACK:
15536
+ case GGML_OP_MUL:
15537
+ case GGML_OP_NORM:
15538
+ case GGML_OP_RMS_NORM:
15539
+ case GGML_OP_RMS_NORM_BACK:
15540
+ case GGML_OP_GROUP_NORM:
15541
+ case GGML_OP_CONCAT:
15542
+ {
15543
+ n_tasks = n_threads;
15544
+ } break;
15545
+ case GGML_OP_MUL_MAT:
15546
+ {
15547
+ n_tasks = n_threads;
15548
+
15549
+ // TODO: use different scheduling for different matrix sizes
15550
+ //const int nr0 = ggml_nrows(node->src[0]);
15551
+ //const int nr1 = ggml_nrows(node->src[1]);
15552
+
15553
+ //n_tasks = MIN(n_threads, MAX(1, nr0/128));
15554
+ //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
15555
+
15556
+ #if defined(GGML_USE_CUBLAS)
15557
+ if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
15558
+ n_tasks = 1; // TODO: this actually is doing nothing
15559
+ // the threads are still spinning
15560
+ }
15561
+ #elif defined(GGML_USE_CLBLAST)
15562
+ if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
15563
+ n_tasks = 1; // TODO: this actually is doing nothing
15564
+ // the threads are still spinning
15565
+ }
15566
+ #endif
15567
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
15568
+ if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
15569
+ n_tasks = 1; // TODO: this actually is doing nothing
15570
+ // the threads are still spinning
15571
+ }
15572
+ #endif
15573
+ } break;
15574
+ case GGML_OP_OUT_PROD:
15575
+ {
15576
+ n_tasks = n_threads;
15577
+ } break;
15578
+ case GGML_OP_SCALE:
15579
+ case GGML_OP_SET:
15580
+ case GGML_OP_CONT:
15581
+ case GGML_OP_RESHAPE:
15582
+ case GGML_OP_VIEW:
15583
+ case GGML_OP_PERMUTE:
15584
+ case GGML_OP_TRANSPOSE:
15585
+ case GGML_OP_GET_ROWS:
15586
+ case GGML_OP_GET_ROWS_BACK:
15587
+ case GGML_OP_DIAG:
15588
+ {
15589
+ n_tasks = 1;
15590
+ } break;
15591
+ case GGML_OP_DIAG_MASK_ZERO:
15592
+ case GGML_OP_DIAG_MASK_INF:
15593
+ case GGML_OP_SOFT_MAX:
15594
+ case GGML_OP_SOFT_MAX_BACK:
15595
+ case GGML_OP_ROPE:
15596
+ case GGML_OP_ROPE_BACK:
15597
+ case GGML_OP_ADD_REL_POS:
15598
+ {
15599
+ n_tasks = n_threads;
15600
+ } break;
15601
+ case GGML_OP_ALIBI:
15602
+ {
15603
+ n_tasks = 1; //TODO
15604
+ } break;
15605
+ case GGML_OP_CLAMP:
15606
+ {
15607
+ n_tasks = 1; //TODO
15608
+ } break;
15609
+ case GGML_OP_CONV_TRANSPOSE_1D:
15610
+ {
15611
+ n_tasks = n_threads;
15612
+ } break;
15613
+ case GGML_OP_IM2COL:
15614
+ {
15615
+ n_tasks = n_threads;
15616
+ } break;
15617
+ case GGML_OP_CONV_TRANSPOSE_2D:
15618
+ {
15619
+ n_tasks = n_threads;
15620
+ } break;
15621
+ case GGML_OP_POOL_1D:
15622
+ case GGML_OP_POOL_2D:
15623
+ {
15624
+ n_tasks = 1;
15625
+ } break;
15626
+ case GGML_OP_UPSCALE:
15627
+ {
15628
+ n_tasks = n_threads;
15629
+ } break;
15630
+ case GGML_OP_FLASH_ATTN:
15631
+ {
15632
+ n_tasks = n_threads;
15633
+ } break;
15634
+ case GGML_OP_FLASH_FF:
15635
+ {
15636
+ n_tasks = n_threads;
15637
+ } break;
15638
+ case GGML_OP_FLASH_ATTN_BACK:
15639
+ {
15640
+ n_tasks = n_threads;
15641
+ } break;
15642
+ case GGML_OP_WIN_PART:
15643
+ case GGML_OP_WIN_UNPART:
15644
+ case GGML_OP_GET_REL_POS:
15645
+ case GGML_OP_MAP_UNARY:
15646
+ case GGML_OP_MAP_BINARY:
15647
+ case GGML_OP_MAP_CUSTOM1_F32:
15648
+ case GGML_OP_MAP_CUSTOM2_F32:
15649
+ case GGML_OP_MAP_CUSTOM3_F32:
15650
+ {
15651
+ n_tasks = 1;
15652
+ } break;
15653
+ case GGML_OP_MAP_CUSTOM1:
15654
+ {
15655
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
15656
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15657
+ n_tasks = n_threads;
15658
+ } else {
15659
+ n_tasks = MIN(p->n_tasks, n_threads);
15660
+ }
15661
+ } break;
15662
+ case GGML_OP_MAP_CUSTOM2:
15663
+ {
15664
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
15665
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15666
+ n_tasks = n_threads;
15667
+ } else {
15668
+ n_tasks = MIN(p->n_tasks, n_threads);
15669
+ }
15670
+ } break;
15671
+ case GGML_OP_MAP_CUSTOM3:
15672
+ {
15673
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
15674
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15675
+ n_tasks = n_threads;
15676
+ } else {
15677
+ n_tasks = MIN(p->n_tasks, n_threads);
15678
+ }
15679
+ } break;
15680
+ case GGML_OP_CROSS_ENTROPY_LOSS:
15681
+ {
15682
+ n_tasks = n_threads;
15683
+ } break;
15684
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
15685
+ {
15686
+ n_tasks = n_threads;
15687
+ } break;
15688
+ case GGML_OP_NONE:
15689
+ {
15690
+ n_tasks = 1;
15691
+ } break;
15692
+ case GGML_OP_COUNT:
15693
+ {
15694
+ GGML_ASSERT(false);
15695
+ } break;
15696
+ default:
15697
+ {
15698
+ printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
15699
+ GGML_ASSERT(false);
15700
+ } break;
15701
+ }
15702
+
15703
+ assert(n_tasks > 0);
16004
15704
 
16005
- node->perf_runs++;
16006
- node->perf_cycles += cycles_cur;
16007
- node->perf_time_us += time_us_cur;
15705
+ return n_tasks;
16008
15706
  }
16009
15707
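ggml_get_n_tasks replaces the per-node cplan->n_tasks[] array: the task count is recomputed from the op whenever a node is scheduled and ends up in params.nth. The per-op kernels typically use it to split rows across threads along these lines (illustrative sketch of the common pattern, not code from this diff; nr is a hypothetical row count):

// typical row split inside a kernel; nth comes from ggml_get_n_tasks via params.nth
const int nth = params->nth;                    // number of tasks for this node
const int ith = params->ith;                    // this thread's index, 0 .. nth-1
const int dr  = (nr + nth - 1)/nth;             // rows per thread, rounded up
const int ir0 = dr*ith;                         // first row for this thread
const int ir1 = ir0 + dr < nr ? ir0 + dr : nr;  // one past the last row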
 
16010
15708
  static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -16013,7 +15711,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16013
15711
  const struct ggml_cgraph * cgraph = state->shared->cgraph;
16014
15712
  const struct ggml_cplan * cplan = state->shared->cplan;
16015
15713
 
16016
- const int * n_tasks_arr = cplan->n_tasks;
16017
15714
  const int n_threads = state->shared->n_threads;
16018
15715
 
16019
15716
  set_numa_thread_affinity(state->ith, n_threads);
@@ -16038,9 +15735,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16038
15735
 
16039
15736
  if (node_n != -1) {
16040
15737
  /* FINALIZE */
16041
- struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
15738
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16042
15739
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16043
- params.nth = n_tasks_arr[node_n];
15740
+ params.nth = ggml_get_n_tasks(node, n_threads);
16044
15741
  ggml_compute_forward(&params, node);
16045
15742
  }
16046
15743
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -16051,7 +15748,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16051
15748
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16052
15749
 
16053
15750
  struct ggml_tensor * node = cgraph->nodes[node_n];
16054
- const int n_tasks = n_tasks_arr[node_n];
15751
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16055
15752
 
16056
15753
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16057
15754
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
@@ -16109,7 +15806,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16109
15806
 
16110
15807
  /* COMPUTE */
16111
15808
  struct ggml_tensor * node = cgraph->nodes[node_n];
16112
- const int n_tasks = n_tasks_arr[node_n];
15809
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16113
15810
 
16114
15811
  struct ggml_compute_params params = {
16115
15812
  /*.type =*/ GGML_TASK_COMPUTE,
@@ -16143,121 +15840,46 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16143
15840
 
16144
15841
  struct ggml_tensor * node = cgraph->nodes[i];
16145
15842
 
15843
+ size_t cur = 0;
15844
+
16146
15845
  switch (node->op) {
16147
15846
  case GGML_OP_CPY:
16148
15847
  case GGML_OP_DUP:
16149
15848
  {
16150
15849
  n_tasks = n_threads;
16151
15850
 
16152
- size_t cur = 0;
16153
15851
  if (ggml_is_quantized(node->type)) {
16154
15852
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16155
15853
  }
16156
-
16157
- work_size = MAX(work_size, cur);
16158
15854
  } break;
16159
15855
  case GGML_OP_ADD:
16160
15856
  case GGML_OP_ADD1:
16161
15857
  {
16162
15858
  n_tasks = n_threads;
16163
15859
 
16164
- size_t cur = 0;
16165
-
16166
15860
  if (ggml_is_quantized(node->src[0]->type)) {
16167
15861
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16168
15862
  }
16169
-
16170
- work_size = MAX(work_size, cur);
16171
15863
  } break;
16172
15864
  case GGML_OP_ACC:
16173
15865
  {
16174
15866
  n_tasks = n_threads;
16175
15867
 
16176
- size_t cur = 0;
16177
-
16178
15868
  if (ggml_is_quantized(node->src[0]->type)) {
16179
15869
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16180
15870
  }
16181
-
16182
- work_size = MAX(work_size, cur);
16183
- } break;
16184
- case GGML_OP_SUB:
16185
- case GGML_OP_DIV:
16186
- case GGML_OP_SQR:
16187
- case GGML_OP_SQRT:
16188
- case GGML_OP_LOG:
16189
- case GGML_OP_SUM:
16190
- case GGML_OP_SUM_ROWS:
16191
- case GGML_OP_MEAN:
16192
- case GGML_OP_ARGMAX:
16193
- case GGML_OP_REPEAT:
16194
- case GGML_OP_REPEAT_BACK:
16195
- {
16196
- n_tasks = 1;
16197
- } break;
16198
-
16199
- case GGML_OP_UNARY:
16200
- {
16201
- switch (ggml_get_unary_op(node)) {
16202
- case GGML_UNARY_OP_ABS:
16203
- case GGML_UNARY_OP_SGN:
16204
- case GGML_UNARY_OP_NEG:
16205
- case GGML_UNARY_OP_STEP:
16206
- case GGML_UNARY_OP_TANH:
16207
- case GGML_UNARY_OP_ELU:
16208
- case GGML_UNARY_OP_RELU:
16209
- {
16210
- n_tasks = 1;
16211
- } break;
16212
-
16213
- case GGML_UNARY_OP_GELU:
16214
- case GGML_UNARY_OP_GELU_QUICK:
16215
- case GGML_UNARY_OP_SILU:
16216
- {
16217
- n_tasks = n_threads;
16218
- } break;
16219
- }
16220
- } break;
16221
- case GGML_OP_SILU_BACK:
16222
- case GGML_OP_MUL:
16223
- case GGML_OP_NORM:
16224
- case GGML_OP_RMS_NORM:
16225
- case GGML_OP_RMS_NORM_BACK:
16226
- case GGML_OP_GROUP_NORM:
16227
- {
16228
- n_tasks = n_threads;
16229
15871
  } break;
16230
- case GGML_OP_CONCAT:
16231
15872
  case GGML_OP_MUL_MAT:
16232
15873
  {
16233
- n_tasks = n_threads;
16234
-
16235
- // TODO: use different scheduling for different matrix sizes
16236
- //const int nr0 = ggml_nrows(node->src[0]);
16237
- //const int nr1 = ggml_nrows(node->src[1]);
16238
-
16239
- //n_tasks = MIN(n_threads, MAX(1, nr0/128));
16240
- //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
16241
-
16242
- size_t cur = 0;
16243
15874
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
16244
15875
 
16245
- #if defined(GGML_USE_CUBLAS)
16246
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
16247
- n_tasks = 1; // TODO: this actually is doing nothing
16248
- // the threads are still spinning
16249
- } else
16250
- #elif defined(GGML_USE_CLBLAST)
15876
+ #if defined(GGML_USE_CLBLAST)
16251
15877
  if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
16252
- n_tasks = 1; // TODO: this actually is doing nothing
16253
- // the threads are still spinning
16254
15878
  cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
16255
15879
  } else
16256
15880
  #endif
16257
15881
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16258
15882
  if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16259
- n_tasks = 1; // TODO: this actually is doing nothing
16260
- // the threads are still spinning
16261
15883
  if (node->src[0]->type != GGML_TYPE_F32) {
16262
15884
  // here we need memory just for single 2D matrix from src0
16263
15885
  cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
@@ -16266,108 +15888,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16266
15888
  #endif
16267
15889
  if (node->src[1]->type != vec_dot_type) {
16268
15890
  cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16269
- } else {
16270
- cur = 0;
16271
15891
  }
16272
-
16273
- work_size = MAX(work_size, cur);
16274
15892
  } break;
16275
15893
  case GGML_OP_OUT_PROD:
16276
15894
  {
16277
15895
  n_tasks = n_threads;
16278
15896
 
16279
- size_t cur = 0;
16280
-
16281
15897
  if (ggml_is_quantized(node->src[0]->type)) {
16282
15898
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16283
15899
  }
16284
-
16285
- work_size = MAX(work_size, cur);
16286
- } break;
16287
- case GGML_OP_SCALE:
16288
- {
16289
- n_tasks = 1;
16290
- } break;
16291
- case GGML_OP_SET:
16292
- case GGML_OP_CONT:
16293
- case GGML_OP_RESHAPE:
16294
- case GGML_OP_VIEW:
16295
- case GGML_OP_PERMUTE:
16296
- case GGML_OP_TRANSPOSE:
16297
- case GGML_OP_GET_ROWS:
16298
- case GGML_OP_GET_ROWS_BACK:
16299
- case GGML_OP_DIAG:
16300
- {
16301
- n_tasks = 1;
16302
- } break;
16303
- case GGML_OP_DIAG_MASK_ZERO:
16304
- case GGML_OP_DIAG_MASK_INF:
16305
- case GGML_OP_SOFT_MAX:
16306
- case GGML_OP_SOFT_MAX_BACK:
16307
- case GGML_OP_ROPE:
16308
- case GGML_OP_ROPE_BACK:
16309
- case GGML_OP_ADD_REL_POS:
16310
- {
16311
- n_tasks = n_threads;
16312
- } break;
16313
- case GGML_OP_ALIBI:
16314
- {
16315
- n_tasks = 1; //TODO
16316
- } break;
16317
- case GGML_OP_CLAMP:
16318
- {
16319
- n_tasks = 1; //TODO
16320
- } break;
16321
- case GGML_OP_CONV_1D:
16322
- {
16323
- n_tasks = n_threads;
16324
-
16325
- GGML_ASSERT(node->src[0]->ne[3] == 1);
16326
- GGML_ASSERT(node->src[1]->ne[2] == 1);
16327
- GGML_ASSERT(node->src[1]->ne[3] == 1);
16328
-
16329
- const int64_t ne00 = node->src[0]->ne[0];
16330
- const int64_t ne01 = node->src[0]->ne[1];
16331
- const int64_t ne02 = node->src[0]->ne[2];
16332
-
16333
- const int64_t ne10 = node->src[1]->ne[0];
16334
- const int64_t ne11 = node->src[1]->ne[1];
16335
-
16336
- const int64_t ne0 = node->ne[0];
16337
- const int64_t ne1 = node->ne[1];
16338
- const int64_t nk = ne00;
16339
- const int64_t ew0 = nk * ne01;
16340
-
16341
- UNUSED(ne02);
16342
- UNUSED(ne10);
16343
- UNUSED(ne11);
16344
-
16345
- size_t cur = 0;
16346
-
16347
- if (node->src[0]->type == GGML_TYPE_F16 &&
16348
- node->src[1]->type == GGML_TYPE_F32) {
16349
- cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
16350
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16351
- node->src[1]->type == GGML_TYPE_F32) {
16352
- cur = sizeof(float)*(ne0*ne1*ew0);
16353
- } else {
16354
- GGML_ASSERT(false);
16355
- }
16356
-
16357
- work_size = MAX(work_size, cur);
16358
- } break;
16359
- case GGML_OP_CONV_1D_STAGE_0:
16360
- {
16361
- n_tasks = n_threads;
16362
- } break;
16363
- case GGML_OP_CONV_1D_STAGE_1:
16364
- {
16365
- n_tasks = n_threads;
16366
15900
  } break;
16367
15901
  case GGML_OP_CONV_TRANSPOSE_1D:
16368
15902
  {
16369
- n_tasks = n_threads;
16370
-
16371
15903
  GGML_ASSERT(node->src[0]->ne[3] == 1);
16372
15904
  GGML_ASSERT(node->src[1]->ne[2] == 1);
16373
15905
  GGML_ASSERT(node->src[1]->ne[3] == 1);
@@ -16379,7 +15911,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16379
15911
  const int64_t ne10 = node->src[1]->ne[0]; // L
16380
15912
  const int64_t ne11 = node->src[1]->ne[1]; // Cin
16381
15913
 
16382
- size_t cur = 0;
16383
15914
  if (node->src[0]->type == GGML_TYPE_F16 &&
16384
15915
  node->src[1]->type == GGML_TYPE_F32) {
16385
15916
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
@@ -16391,59 +15922,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16391
15922
  } else {
16392
15923
  GGML_ASSERT(false);
16393
15924
  }
16394
-
16395
- work_size = MAX(work_size, cur);
16396
- } break;
16397
- case GGML_OP_CONV_2D:
16398
- {
16399
- n_tasks = n_threads;
16400
-
16401
- const int64_t ne00 = node->src[0]->ne[0]; // W
16402
- const int64_t ne01 = node->src[0]->ne[1]; // H
16403
- const int64_t ne02 = node->src[0]->ne[2]; // C
16404
- const int64_t ne03 = node->src[0]->ne[3]; // N
16405
-
16406
- const int64_t ne10 = node->src[1]->ne[0]; // W
16407
- const int64_t ne11 = node->src[1]->ne[1]; // H
16408
- const int64_t ne12 = node->src[1]->ne[2]; // C
16409
-
16410
- const int64_t ne0 = node->ne[0];
16411
- const int64_t ne1 = node->ne[1];
16412
- const int64_t ne2 = node->ne[2];
16413
- const int64_t ne3 = node->ne[3];
16414
- const int64_t nk = ne00*ne01;
16415
- const int64_t ew0 = nk * ne02;
16416
-
16417
- UNUSED(ne03);
16418
- UNUSED(ne2);
16419
-
16420
- size_t cur = 0;
16421
-
16422
- if (node->src[0]->type == GGML_TYPE_F16 &&
16423
- node->src[1]->type == GGML_TYPE_F32) {
16424
- // im2col: [N*OH*OW, IC*KH*KW]
16425
- cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
16426
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16427
- node->src[1]->type == GGML_TYPE_F32) {
16428
- cur = sizeof(float)* (ne10*ne11*ne12);
16429
- } else {
16430
- GGML_ASSERT(false);
16431
- }
16432
-
16433
- work_size = MAX(work_size, cur);
16434
- } break;
16435
- case GGML_OP_CONV_2D_STAGE_0:
16436
- {
16437
- n_tasks = n_threads;
16438
15925
  } break;
16439
- case GGML_OP_CONV_2D_STAGE_1:
15926
+ case GGML_OP_IM2COL:
16440
15927
  {
16441
15928
  n_tasks = n_threads;
16442
15929
  } break;
16443
15930
  case GGML_OP_CONV_TRANSPOSE_2D:
16444
15931
  {
16445
- n_tasks = n_threads;
16446
-
16447
15932
  const int64_t ne00 = node->src[0]->ne[0]; // W
16448
15933
  const int64_t ne01 = node->src[0]->ne[1]; // H
16449
15934
  const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
@@ -16453,141 +15938,66 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16453
15938
  const int64_t ne11 = node->src[1]->ne[1]; // H
16454
15939
  const int64_t ne12 = node->src[1]->ne[2]; // Channels In
16455
15940
 
16456
- size_t cur = 0;
16457
15941
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
16458
15942
  cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
16459
-
16460
- work_size = MAX(work_size, cur);
16461
- } break;
16462
- case GGML_OP_POOL_1D:
16463
- case GGML_OP_POOL_2D:
16464
- {
16465
- n_tasks = 1;
16466
- } break;
16467
- case GGML_OP_UPSCALE:
16468
- {
16469
- n_tasks = n_threads;
16470
15943
  } break;
16471
15944
  case GGML_OP_FLASH_ATTN:
16472
15945
  {
16473
15946
  n_tasks = n_threads;
16474
15947
 
16475
- size_t cur = 0;
16476
-
16477
15948
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16478
15949
 
16479
15950
  if (node->src[1]->type == GGML_TYPE_F32) {
16480
15951
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16481
15952
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16482
- }
16483
-
16484
- if (node->src[1]->type == GGML_TYPE_F16) {
15953
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16485
15954
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16486
15955
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16487
15956
  }
16488
-
16489
- work_size = MAX(work_size, cur);
16490
15957
  } break;
16491
15958
  case GGML_OP_FLASH_FF:
16492
15959
  {
16493
15960
  n_tasks = n_threads;
16494
15961
 
16495
- size_t cur = 0;
16496
-
16497
15962
  if (node->src[1]->type == GGML_TYPE_F32) {
16498
15963
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16499
15964
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16500
- }
16501
-
16502
- if (node->src[1]->type == GGML_TYPE_F16) {
15965
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16503
15966
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16504
15967
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16505
15968
  }
16506
-
16507
- work_size = MAX(work_size, cur);
16508
15969
  } break;
16509
15970
  case GGML_OP_FLASH_ATTN_BACK:
16510
15971
  {
16511
15972
  n_tasks = n_threads;
16512
15973
 
16513
- size_t cur = 0;
16514
-
16515
15974
  const int64_t D = node->src[0]->ne[0];
16516
15975
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16517
15976
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
16518
15977
  if (node->src[1]->type == GGML_TYPE_F32) {
16519
15978
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16520
15979
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16521
- }
16522
-
16523
- if (node->src[1]->type == GGML_TYPE_F16) {
15980
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16524
15981
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16525
15982
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16526
15983
  }
16527
-
16528
- work_size = MAX(work_size, cur);
16529
- } break;
16530
- case GGML_OP_WIN_PART:
16531
- case GGML_OP_WIN_UNPART:
16532
- case GGML_OP_GET_REL_POS:
16533
- case GGML_OP_MAP_UNARY:
16534
- case GGML_OP_MAP_BINARY:
16535
- case GGML_OP_MAP_CUSTOM1_F32:
16536
- case GGML_OP_MAP_CUSTOM2_F32:
16537
- case GGML_OP_MAP_CUSTOM3_F32:
16538
- {
16539
- n_tasks = 1;
16540
- } break;
16541
- case GGML_OP_MAP_CUSTOM1:
16542
- {
16543
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
16544
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16545
- n_tasks = n_threads;
16546
- } else {
16547
- n_tasks = MIN(p->n_tasks, n_threads);
16548
- }
16549
- } break;
16550
- case GGML_OP_MAP_CUSTOM2:
16551
- {
16552
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
16553
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16554
- n_tasks = n_threads;
16555
- } else {
16556
- n_tasks = MIN(p->n_tasks, n_threads);
16557
- }
16558
- } break;
16559
- case GGML_OP_MAP_CUSTOM3:
16560
- {
16561
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
16562
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16563
- n_tasks = n_threads;
16564
- } else {
16565
- n_tasks = MIN(p->n_tasks, n_threads);
16566
- }
16567
15984
  } break;
15985
+
16568
15986
  case GGML_OP_CROSS_ENTROPY_LOSS:
16569
15987
  {
16570
15988
  n_tasks = n_threads;
16571
15989
 
16572
- size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16573
-
16574
- work_size = MAX(work_size, cur);
16575
- } break;
16576
- case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16577
- {
16578
- n_tasks = n_threads;
16579
- } break;
16580
- case GGML_OP_NONE:
16581
- {
16582
- n_tasks = 1;
15990
+ cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16583
15991
  } break;
16584
15992
  case GGML_OP_COUNT:
16585
15993
  {
16586
15994
  GGML_ASSERT(false);
16587
15995
  } break;
15996
+ default:
15997
+ break;
16588
15998
  }
16589
15999
 
16590
- cplan.n_tasks[i] = n_tasks;
16000
+ work_size = MAX(work_size, cur);
16591
16001
  }
16592
16002
 
16593
16003
  if (work_size > 0) {
@@ -16609,12 +16019,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16609
16019
  if (cplan->work_size > 0) {
16610
16020
  GGML_ASSERT(cplan->work_data);
16611
16021
  }
16612
-
16613
- for (int i = 0; i < cgraph->n_nodes; ++i) {
16614
- if (cgraph->nodes[i]->op != GGML_OP_NONE) {
16615
- GGML_ASSERT(cplan->n_tasks[i] > 0);
16616
- }
16617
- }
16618
16022
  }
16619
16023
 
16620
16024
  const int n_threads = cplan->n_threads;
@@ -16687,16 +16091,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16687
16091
  return compute_status;
16688
16092
  }
16689
16093
 
16690
- void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16691
- for (int i = 0; i < cgraph->n_nodes; i++) {
16692
- struct ggml_tensor * grad = cgraph->grads[i];
16693
-
16694
- if (grad) {
16695
- ggml_set_zero(grad);
16696
- }
16697
- }
16698
- }
16699
-
16700
16094
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16701
16095
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16702
16096
 
@@ -16823,12 +16217,12 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16823
16217
  const uint32_t magic = GGML_FILE_MAGIC;
16824
16218
  const uint32_t version = GGML_FILE_VERSION;
16825
16219
  const uint32_t n_leafs = cgraph->n_leafs;
16826
- const uint32_t nodes = cgraph->n_nodes;
16220
+ const uint32_t n_nodes = cgraph->n_nodes;
16827
16221
 
16828
16222
  fwrite(&magic, sizeof(uint32_t), 1, fout);
16829
16223
  fwrite(&version, sizeof(uint32_t), 1, fout);
16830
16224
  fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
16831
- fwrite(&nodes, sizeof(uint32_t), 1, fout);
16225
+ fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
16832
16226
  fwrite(&size_eval, sizeof(uint64_t), 1, fout);
16833
16227
  }
16834
16228
 
@@ -16916,7 +16310,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16916
16310
  if (idx == -1) {
16917
16311
  for (int k = 0; k < cgraph->n_nodes; ++k) {
16918
16312
  if (args[j] == cgraph->nodes[k]) {
16919
- idx = GGML_MAX_NODES + k;
16313
+ idx = cgraph->n_leafs + k;
16920
16314
  break;
16921
16315
  }
16922
16316
  }
@@ -16943,11 +16337,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16943
16337
  }
16944
16338
  }
16945
16339
 
16946
- struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16340
+ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16947
16341
  assert(*ctx_data == NULL);
16948
16342
  assert(*ctx_eval == NULL);
16949
16343
 
16950
- struct ggml_cgraph result = { 0 };
16344
+ struct ggml_cgraph * result = NULL;
16951
16345
 
16952
16346
  struct ggml_tensor * data = NULL;
16953
16347
 
@@ -17019,13 +16413,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17019
16413
  const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
17020
16414
  const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
17021
16415
  const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
17022
-
17023
- result.n_leafs = n_leafs;
17024
- result.n_nodes = n_nodes;
16416
+ const int graph_size = MAX(n_leafs, n_nodes);
17025
16417
 
17026
16418
  // create the data context
17027
16419
  {
17028
- const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
16420
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
17029
16421
 
17030
16422
  struct ggml_init_params params = {
17031
16423
  .mem_size = size_eval + overhead,
@@ -17041,6 +16433,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17041
16433
  }
17042
16434
  }
17043
16435
 
16436
+ result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
16437
+
16438
+ result->n_leafs = n_leafs;
16439
+ result->n_nodes = n_nodes;
16440
+
16441
+
17044
16442
  // leafs
17045
16443
  {
17046
16444
  uint32_t type;
@@ -17079,7 +16477,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17079
16477
  tensor->nb[j] = nb[j];
17080
16478
  }
17081
16479
 
17082
- result.leafs[i] = tensor;
16480
+ result->leafs[i] = tensor;
17083
16481
 
17084
16482
  ptr += ggml_nbytes(tensor);
17085
16483
 
@@ -17131,10 +16529,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17131
16529
  continue;
17132
16530
  }
17133
16531
 
17134
- if (arg_idx < GGML_MAX_NODES) {
17135
- args[j] = result.leafs[arg_idx];
16532
+ if (arg_idx < result->n_leafs) {
16533
+ args[j] = result->leafs[arg_idx];
17136
16534
  } else {
17137
- args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
16535
+ args[j] = result->nodes[arg_idx - result->n_leafs];
17138
16536
  }
17139
16537
  }
17140
16538
 
@@ -17186,7 +16584,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17186
16584
  tensor->src[j] = args[j];
17187
16585
  }
17188
16586
 
17189
- result.nodes[i] = tensor;
16587
+ result->nodes[i] = tensor;
17190
16588
 
17191
16589
  fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17192
16590
  }
@@ -18091,10 +17489,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18091
17489
  case GGML_OPT_ADAM:
18092
17490
  {
18093
17491
  result = (struct ggml_opt_params) {
18094
- .type = GGML_OPT_ADAM,
18095
- .n_threads = 1,
18096
- .past = 0,
18097
- .delta = 1e-5f,
17492
+ .type = GGML_OPT_ADAM,
17493
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17494
+ .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
17495
+ .past = 0,
17496
+ .delta = 1e-5f,
18098
17497
 
18099
17498
  .max_no_improvement = 100,
18100
17499
 
@@ -18121,10 +17520,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18121
17520
  case GGML_OPT_LBFGS:
18122
17521
  {
18123
17522
  result = (struct ggml_opt_params) {
18124
- .type = GGML_OPT_LBFGS,
18125
- .n_threads = 1,
18126
- .past = 0,
18127
- .delta = 1e-5f,
17523
+ .type = GGML_OPT_LBFGS,
17524
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17525
+ .n_threads = 1,
17526
+ .past = 0,
17527
+ .delta = 1e-5f,
18128
17528
 
18129
17529
  .max_no_improvement = 0,
18130
17530
 
@@ -18266,14 +17666,11 @@ enum ggml_opt_result ggml_opt_resume(
  struct ggml_tensor * f) {

  // build forward + backward compute graphs
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
-
- struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
- struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
+ ggml_build_forward_expand(gf, f);

- *gf = ggml_build_forward (f);
- *gb = ggml_build_backward(ctx, gf, true);
+ struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
+ ggml_build_backward_expand(ctx, gf, gb, true);

  return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
  }
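
The three hunks above go together: ggml_opt_params gains a graph_size field (defaulting to GGML_DEFAULT_GRAPH_SIZE), and ggml_opt_resume() stops smuggling ggml_cgraph storage through I32 tensors, building its graphs through the allocation API instead. For code outside this file that relied on the removed by-value ggml_build_forward()/ggml_build_backward(), the equivalent pattern looks roughly like this (a sketch, assuming ctx is a live ggml_context and f is the tensor to differentiate):

    // old, removed API:
    //   struct ggml_cgraph gf = ggml_build_forward(f);
    //   struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, true);

    // new: graphs are allocated inside the context; grads=true keeps gradient slots
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
    ggml_build_forward_expand(gf, f);

    // the backward graph starts as a copy of the forward graph
    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
    ggml_build_backward_expand(ctx, gf, gb, true);

Raise the size passed to ggml_new_graph_custom() (or params.graph_size on the optimizer) if the model needs more than GGML_DEFAULT_GRAPH_SIZE nodes.
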
@@ -18729,7 +18126,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  {
  ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));

- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
  struct gguf_kv * kv = &ctx->kv[i];

  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18776,7 +18173,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  case GGUF_TYPE_STRING:
  {
  kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
  }
  } break;
@@ -18804,7 +18201,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  {
  ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));

- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  struct gguf_tensor_info * info = &ctx->infos[i];

  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18851,7 +18248,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  // compute the total size of the data section, taking into account the alignment
  {
  ctx->size = 0;
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  struct gguf_tensor_info * info = &ctx->infos[i];

  const int64_t ne =
@@ -18920,7 +18317,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  ggml_set_no_alloc(ctx_data, true);

  // create the tensors
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  const int64_t ne[GGML_MAX_DIMS] = {
  ctx->infos[i].ne[0],
  ctx->infos[i].ne[1],
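
The counter changes in the gguf_init_from_file() hunks above are not cosmetic: the GGUF header stores n_kv, n_tensors and array lengths as 64-bit values, and comparing a uint32_t counter against a uint64_t bound promotes the counter, so a malformed file advertising a count above UINT32_MAX would make the old loops wrap around and never terminate. A standalone illustration of the promotion rule (not ggml code; assumes <stdint.h> is included):

    uint64_t n = (uint64_t) UINT32_MAX + 2;   // a count that does not fit in 32 bits

    // for (uint32_t i = 0; i < n; ++i) {}    // i wraps to 0 before reaching n: never terminates
    for (uint64_t i = 0; i < n; ++i) {}       // terminates (after ~4 billion iterations)
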
@@ -19055,24 +18452,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
  }

  const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  return ctx->kv[key_id].key.data;
  }

  enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  return ctx->kv[key_id].type;
  }

  enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  return ctx->kv[key_id].value.arr.type;
  }

  const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  return ctx->kv[key_id].value.arr.data;
  }

  const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  struct gguf_kv * kv = &ctx->kv[key_id];
  struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
@@ -19080,70 +18482,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
  }

  int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  return ctx->kv[key_id].value.arr.n;
  }

  uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
  return ctx->kv[key_id].value.uint8;
  }

  int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
  return ctx->kv[key_id].value.int8;
  }

  uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
  return ctx->kv[key_id].value.uint16;
  }

  int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
  return ctx->kv[key_id].value.int16;
  }

  uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
  return ctx->kv[key_id].value.uint32;
  }

  int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
  return ctx->kv[key_id].value.int32;
  }

  float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
  return ctx->kv[key_id].value.float32;
  }

  uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
  return ctx->kv[key_id].value.uint64;
  }

  int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
  return ctx->kv[key_id].value.int64;
  }

  double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
  return ctx->kv[key_id].value.float64;
  }

  bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
  return ctx->kv[key_id].value.bool_;
  }

  const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
  return ctx->kv[key_id].value.str.data;
  }

+ const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
+ GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+ GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
+ GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
+ return &ctx->kv[key_id].value;
+ }
+
  int gguf_get_n_tensors(const struct gguf_context * ctx) {
  return ctx->header.n_tensors;
  }
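
All of the key/value accessors above now assert that key_id lies in [0, gguf_get_n_kv(ctx)) before indexing ctx->kv, and the new gguf_get_val_data() returns an untyped pointer to the stored value for any scalar type (arrays and strings are still rejected by assertion). A sketch of the intended calling pattern, assuming ctx is a gguf_context opened elsewhere and that the key name is purely illustrative:

    // look the key up first; gguf_find_key() returns -1 when the key is absent
    const int key_id = gguf_find_key(ctx, "general.file_type");

    if (key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_UINT32) {
        // an out-of-range key_id would now trip GGML_ASSERT instead of reading out of bounds
        const uint32_t ftype = gguf_get_val_u32(ctx, key_id);

        // same value, but as an untyped pointer into the kv storage
        const void * raw = gguf_get_val_data(ctx, key_id);
        (void) ftype; (void) raw;
    }
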