llama_cpp 0.9.1 → 0.9.3

@@ -100,6 +100,49 @@ typedef void * thread_ret_t;
100
100
  #include <hbwmalloc.h>
101
101
  #endif
102
102
 
103
+ #if defined(__APPLE__)
104
+ #include <TargetConditionals.h>
105
+ #endif
106
+
107
+ #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
108
+ (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
109
+
110
+ #include <sys/wait.h>
111
+
112
+ void ggml_print_backtrace(void) {
113
+ /*
114
+ #include <execinfo.h>
115
+ #include <dlfcn.h>
116
+
117
+ void * trace[100];
118
+
119
+ int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
120
+
121
+ backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
122
+ */
123
+
124
+ // backtrace_symbols does not show line numbers, use gdb instead
125
+ char attach[32];
126
+ snprintf(attach, sizeof(attach), "attach %d", getpid());
127
+ int pid = fork();
128
+ if (pid == 0) {
129
+ execlp("gdb", "gdb", "--batch",
130
+ "-ex", "set style enabled on",
131
+ "-ex", attach,
132
+ "-ex", "bt -frame-info source-and-location",
133
+ "-ex", "detach",
134
+ "-ex", "quit",
135
+ NULL);
136
+ } else {
137
+ waitpid(pid, NULL, 0);
138
+ }
139
+ }
140
+ #else
141
+ void ggml_print_backtrace(void) {
142
+ // platform not supported
143
+ }
144
+ #endif
145
+
103
146
  /*#define GGML_PERF*/
104
147
  #define GGML_DEBUG 0
105
148
  #define GGML_GELU_FP16
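The block added above (apparently in the bundled ggml.c) makes ggml_print_backtrace() fork a child that runs gdb --batch, attaches it to the parent PID, and prints a source-annotated backtrace, since backtrace_symbols() alone cannot resolve line numbers. A minimal sketch of how such a helper can be wired into a failing assertion; the MY_ASSERT macro below is illustrative and not part of this diff, and gdb must be installed for the backtrace to appear:

```c
#include <stdio.h>
#include <stdlib.h>

void ggml_print_backtrace(void); // provided by ggml.c as of this change

// Illustrative assert macro: dump a gdb-sourced backtrace before aborting.
#define MY_ASSERT(x)                                            \
    do {                                                        \
        if (!(x)) {                                             \
            fprintf(stderr, "assert failed: %s (%s:%d)\n",      \
                    #x, __FILE__, __LINE__);                    \
            ggml_print_backtrace();                             \
            abort();                                            \
        }                                                       \
    } while (0)

int main(void) {
    int n = 0;
    MY_ASSERT(n == 0); // passes; a failing condition would print a backtrace
    return 0;
}
```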
@@ -228,6 +271,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
228
271
  // floating point type used to accumulate sums
229
272
  typedef double ggml_float;
230
273
 
274
+ #undef MIN
275
+ #undef MAX
276
+
277
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
278
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
279
+
231
280
  //
232
281
  // global data
233
282
  //
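MIN and MAX are re-defined here as plain textual macros; each argument is expanded twice, so arguments with side effects can be evaluated twice and can even produce a wrong result. A small cautionary sketch, not part of the diff:

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    int i = 0;
    int v[] = { 1, 5, 2 };
    // expands to ((v[i++]) < (2) ? (v[i++]) : (2)): i is incremented twice
    // and the "minimum" ends up being read from the next element.
    int m = MIN(v[i++], 2);
    printf("MIN(1, 2) = %d, i = %d\n", m, i); // prints 5 and 2
    return 0;
}
```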
@@ -561,6 +610,18 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
561
610
  // simd mappings
562
611
  //
563
612
 
613
+ #if defined(__ARM_NEON)
614
+ #if !defined(__aarch64__)
615
+
616
+ // 64-bit compatibility
617
+
618
+ inline static float vaddvq_f32(float32x4_t v) {
619
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
620
+ }
621
+
622
+ #endif
623
+ #endif
624
+
564
625
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
565
626
  // we then implement the fundamental computation operations below using only these macros
566
627
  // adding support for new architectures requires defining the corresponding SIMD macros
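vaddvq_f32() is an AArch64-only horizontal-add intrinsic, so the compatibility shim above reduces the four lanes manually on 32-bit ARM. A plain-C sketch of the same reduction for readers without a NEON toolchain; the vec4 type here is illustrative:

```c
#include <stdio.h>

// Illustrative stand-in for float32x4_t on targets without NEON.
typedef struct { float v[4]; } vec4;

// Horizontal add: what vaddvq_f32() does in a single instruction on AArch64.
static float vec4_addv(vec4 x) {
    return x.v[0] + x.v[1] + x.v[2] + x.v[3];
}

int main(void) {
    vec4 x = { { 1.0f, 2.0f, 3.0f, 4.0f } };
    printf("%f\n", vec4_addv(x)); // 10.000000
    return 0;
}
```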
@@ -1352,6 +1413,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
1352
1413
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
1353
1414
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
1354
1415
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
1416
+ inline static void ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
1355
1417
 
1356
1418
  static const float GELU_COEF_A = 0.044715f;
1357
1419
  static const float GELU_QUICK_COEF = -1.702f;
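The new ggml_vec_leaky_f32() is a leaky ReLU with a hard-coded negative slope of 0.1, i.e.

$$
\operatorname{leaky}(x) =
\begin{cases}
x, & x > 0,\\
0.1\,x, & x \le 0
\end{cases}
\;=\; \max(x,\ 0.1\,x).
$$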
@@ -1572,13 +1634,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1572
1634
  "ROPE_BACK",
1573
1635
  "ALIBI",
1574
1636
  "CLAMP",
1575
- "CONV_1D",
1576
- "CONV_1D_STAGE_0",
1577
- "CONV_1D_STAGE_1",
1578
1637
  "CONV_TRANSPOSE_1D",
1579
- "CONV_2D",
1580
- "CONV_2D_STAGE_0",
1581
- "CONV_2D_STAGE_1",
1638
+ "IM2COL",
1582
1639
  "CONV_TRANSPOSE_2D",
1583
1640
  "POOL_1D",
1584
1641
  "POOL_2D",
@@ -1609,7 +1666,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1609
1666
  "CROSS_ENTROPY_LOSS_BACK",
1610
1667
  };
1611
1668
 
1612
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1669
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1613
1670
 
1614
1671
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1615
1672
  "none",
@@ -1659,13 +1716,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1659
1716
  "rope_back(x)",
1660
1717
  "alibi(x)",
1661
1718
  "clamp(x)",
1662
- "conv_1d(x)",
1663
- "conv_1d_stage_0(x)",
1664
- "conv_1d_stage_1(x)",
1665
1719
  "conv_transpose_1d(x)",
1666
- "conv_2d(x)",
1667
- "conv_2d_stage_0(x)",
1668
- "conv_2d_stage_1(x)",
1720
+ "im2col(x)",
1669
1721
  "conv_transpose_2d(x)",
1670
1722
  "pool_1d(x)",
1671
1723
  "pool_2d(x)",
@@ -1696,7 +1748,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1696
1748
  "cross_entropy_loss_back(x,y)",
1697
1749
  };
1698
1750
 
1699
- static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
1751
+ static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
1700
1752
 
1701
1753
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1702
1754
 
@@ -1724,13 +1776,7 @@ static void ggml_setup_op_has_task_pass(void) {
1724
1776
  p[GGML_OP_GET_ROWS_BACK ] = true;
1725
1777
  p[GGML_OP_DIAG_MASK_INF ] = true;
1726
1778
  p[GGML_OP_DIAG_MASK_ZERO ] = true;
1727
- p[GGML_OP_CONV_1D ] = true;
1728
- p[GGML_OP_CONV_1D_STAGE_0 ] = true;
1729
- p[GGML_OP_CONV_1D_STAGE_1 ] = true;
1730
1779
  p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
1731
- p[GGML_OP_CONV_2D ] = true;
1732
- p[GGML_OP_CONV_2D_STAGE_0 ] = true;
1733
- p[GGML_OP_CONV_2D_STAGE_1 ] = true;
1734
1780
  p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
1735
1781
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
1736
1782
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
@@ -3769,6 +3815,14 @@ struct ggml_tensor * ggml_relu_inplace(
3769
3815
  return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
3770
3816
  }
3771
3817
 
3818
+ // ggml_leaky
3819
+
3820
+ struct ggml_tensor * ggml_leaky(
3821
+ struct ggml_context * ctx,
3822
+ struct ggml_tensor * a) {
3823
+ return ggml_unary(ctx, a, GGML_UNARY_OP_LEAKY);
3824
+ }
3825
+
3772
3826
  // ggml_gelu
3773
3827
 
3774
3828
  struct ggml_tensor * ggml_gelu(
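ggml_leaky() is exposed as an ordinary unary op, so it composes like any other graph node. A minimal usage sketch, assuming the context/graph helpers of this ggml version; buffer size and thread count are arbitrary:

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ((float *) x->data)[0] = -2.0f;
    ((float *) x->data)[1] = -0.5f;
    ((float *) x->data)[2] =  0.0f;
    ((float *) x->data)[3] =  3.0f;

    struct ggml_tensor * y = ggml_leaky(ctx, x); // y[i] = x[i] > 0 ? x[i] : 0.1f*x[i]

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);
    // expected output values: -0.2 -0.05 0.0 3.0

    ggml_free(ctx);
    return 0;
}
```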
@@ -4970,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
4970
5024
  int n_dims,
4971
5025
  int mode,
4972
5026
  int n_ctx,
5027
+ int n_orig_ctx,
4973
5028
  float freq_base,
4974
5029
  float freq_scale,
5030
+ float ext_factor,
5031
+ float attn_factor,
5032
+ float beta_fast,
5033
+ float beta_slow,
4975
5034
  float xpos_base,
4976
5035
  bool xpos_down) {
4977
5036
  GGML_ASSERT(ggml_is_vector(b));
@@ -4988,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
4988
5047
 
4989
5048
  struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
4990
5049
 
4991
- int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
4992
- memcpy(params + 4, &freq_base, sizeof(float));
4993
- memcpy(params + 5, &freq_scale, sizeof(float));
4994
- memcpy(params + 6, &xpos_base, sizeof(float));
4995
- memcpy(params + 7, &xpos_down, sizeof(bool));
5050
+ int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
5051
+ memcpy(params + 5, &freq_base, sizeof(float));
5052
+ memcpy(params + 6, &freq_scale, sizeof(float));
5053
+ memcpy(params + 7, &ext_factor, sizeof(float));
5054
+ memcpy(params + 8, &attn_factor, sizeof(float));
5055
+ memcpy(params + 9, &beta_fast, sizeof(float));
5056
+ memcpy(params + 10, &beta_slow, sizeof(float));
5057
+ memcpy(params + 11, &xpos_base, sizeof(float));
5058
+ memcpy(params + 12, &xpos_down, sizeof(bool));
4996
5059
  ggml_set_op_params(result, params, sizeof(params));
4997
5060
 
4998
5061
  result->op = GGML_OP_ROPE_BACK;
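ggml_rope_back() now carries the YaRN-related parameters as well, so the op_params array grows from 8 to 13 int32 slots, with the float values bit-copied into the integer slots via memcpy. The same pack/unpack pattern in isolation, with illustrative values:

```c
#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void) {
    // pack: integer options first, then raw float bits copied into int32 slots
    int32_t params[3] = { /*n_dims*/ 128, /*mode*/ 0 };
    float freq_base = 10000.0f;
    memcpy(params + 2, &freq_base, sizeof(float)); // assumes sizeof(float) == sizeof(int32_t)

    // unpack: copy the bits back out into a float
    float out;
    memcpy(&out, params + 2, sizeof(float));
    printf("freq_base = %.1f\n", out); // 10000.0
    return 0;
}
```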
@@ -5067,82 +5130,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
5067
5130
  return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
5068
5131
  }
5069
5132
 
5070
- // im2col: [N, IC, IL] => [N, OL, IC*K]
5071
- // a: [OC,IC, K]
5072
- // b: [N, IC, IL]
5073
- // result: [N, OL, IC*K]
5074
- static struct ggml_tensor * ggml_conv_1d_stage_0(
5075
- struct ggml_context * ctx,
5076
- struct ggml_tensor * a,
5077
- struct ggml_tensor * b,
5078
- int s0,
5079
- int p0,
5080
- int d0) {
5081
- GGML_ASSERT(a->ne[1] == b->ne[1]);
5082
- bool is_node = false;
5083
-
5084
- if (a->grad || b->grad) {
5085
- GGML_ASSERT(false); // TODO: implement backward
5086
- is_node = true;
5087
- }
5088
-
5089
- const int64_t OL = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5090
-
5091
- const int64_t ne[4] = {
5092
- a->ne[1] * a->ne[0],
5093
- OL,
5094
- b->ne[2],
5095
- 1,
5096
- };
5097
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5098
-
5099
- int32_t params[] = { s0, p0, d0 };
5100
- ggml_set_op_params(result, params, sizeof(params));
5101
-
5102
- result->op = GGML_OP_CONV_1D_STAGE_0;
5103
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5104
- result->src[0] = a;
5105
- result->src[1] = b;
5106
-
5107
- return result;
5108
- }
5109
-
5110
- // ggml_conv_1d_stage_1
5111
-
5112
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
5113
- // a: [OC, IC, K]
5114
- // b: [N, OL, IC * K]
5115
- // result: [N, OC, OL]
5116
- static struct ggml_tensor * ggml_conv_1d_stage_1(
5117
- struct ggml_context * ctx,
5118
- struct ggml_tensor * a,
5119
- struct ggml_tensor * b) {
5120
-
5121
- bool is_node = false;
5122
-
5123
- if (a->grad || b->grad) {
5124
- GGML_ASSERT(false); // TODO: implement backward
5125
- is_node = true;
5126
- }
5127
-
5128
- const int64_t ne[4] = {
5129
- b->ne[1],
5130
- a->ne[2],
5131
- b->ne[2],
5132
- 1,
5133
- };
5134
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5135
-
5136
- result->op = GGML_OP_CONV_1D_STAGE_1;
5137
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5138
- result->src[0] = a;
5139
- result->src[1] = b;
5140
-
5141
- return result;
5142
- }
5143
-
5144
- // ggml_conv_1d
5145
-
5146
5133
  GGML_API struct ggml_tensor * ggml_conv_1d(
5147
5134
  struct ggml_context * ctx,
5148
5135
  struct ggml_tensor * a,
@@ -5150,43 +5137,17 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
5150
5137
  int s0,
5151
5138
  int p0,
5152
5139
  int d0) {
5153
- struct ggml_tensor * result = ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
5154
- result = ggml_conv_1d_stage_1(ctx, a, result);
5155
- return result;
5156
- }
5140
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
5157
5141
 
5158
- // GGML_API struct ggml_tensor * ggml_conv_1d(
5159
- // struct ggml_context * ctx,
5160
- // struct ggml_tensor * a,
5161
- // struct ggml_tensor * b,
5162
- // int s0,
5163
- // int p0,
5164
- // int d0) {
5165
- // GGML_ASSERT(ggml_is_matrix(b));
5166
- // GGML_ASSERT(a->ne[1] == b->ne[1]);
5167
- // bool is_node = false;
5142
+ struct ggml_tensor * result =
5143
+ ggml_mul_mat(ctx,
5144
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
5145
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
5168
5146
 
5169
- // if (a->grad || b->grad) {
5170
- // GGML_ASSERT(false); // TODO: implement backward
5171
- // is_node = true;
5172
- // }
5147
+ result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
5173
5148
 
5174
- // const int64_t ne[4] = {
5175
- // ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
5176
- // a->ne[2], 1, 1,
5177
- // };
5178
- // struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
5179
-
5180
- // int32_t params[] = { s0, p0, d0 };
5181
- // ggml_set_op_params(result, params, sizeof(params));
5182
-
5183
- // result->op = GGML_OP_CONV_1D;
5184
- // result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5185
- // result->src[0] = a;
5186
- // result->src[1] = b;
5187
-
5188
- // return result;
5189
- // }
5149
+ return result;
5150
+ }
5190
5151
 
5191
5152
  // ggml_conv_1d_ph
5192
5153
 
@@ -5249,7 +5210,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
5249
5210
  // a: [OC,IC, KH, KW]
5250
5211
  // b: [N, IC, IH, IW]
5251
5212
  // result: [N, OH, OW, IC*KH*KW]
5252
- static struct ggml_tensor * ggml_conv_2d_stage_0(
5213
+ struct ggml_tensor * ggml_im2col(
5253
5214
  struct ggml_context * ctx,
5254
5215
  struct ggml_tensor * a,
5255
5216
  struct ggml_tensor * b,
@@ -5258,9 +5219,14 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5258
5219
  int p0,
5259
5220
  int p1,
5260
5221
  int d0,
5261
- int d1) {
5222
+ int d1,
5223
+ bool is_2D) {
5262
5224
 
5263
- GGML_ASSERT(a->ne[2] == b->ne[2]);
5225
+ if(is_2D) {
5226
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
5227
+ } else {
5228
+ GGML_ASSERT(a->ne[1] == b->ne[1]);
5229
+ }
5264
5230
  bool is_node = false;
5265
5231
 
5266
5232
  if (a->grad || b->grad) {
@@ -5268,81 +5234,51 @@ static struct ggml_tensor * ggml_conv_2d_stage_0(
5268
5234
  is_node = true;
5269
5235
  }
5270
5236
 
5271
- const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
5272
- const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5237
+ const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
5238
+ const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
5273
5239
 
5274
5240
  const int64_t ne[4] = {
5275
- a->ne[2] * a->ne[1] * a->ne[0],
5241
+ is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
5276
5242
  OW,
5277
- OH,
5278
- b->ne[3],
5243
+ is_2D ? OH : b->ne[2],
5244
+ is_2D ? b->ne[3] : 1,
5279
5245
  };
5280
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5281
5246
 
5282
- int32_t params[] = { s0, s1, p0, p1, d0, d1 };
5247
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
5248
+ int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
5283
5249
  ggml_set_op_params(result, params, sizeof(params));
5284
5250
 
5285
- result->op = GGML_OP_CONV_2D_STAGE_0;
5286
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5287
- result->src[0] = a;
5288
- result->src[1] = b;
5289
-
5290
- return result;
5291
-
5292
- }
5293
-
5294
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
5295
- // a: [OC, IC, KH, KW]
5296
- // b: [N, OH, OW, IC * KH * KW]
5297
- // result: [N, OC, OH, OW]
5298
- static struct ggml_tensor * ggml_conv_2d_stage_1(
5299
- struct ggml_context * ctx,
5300
- struct ggml_tensor * a,
5301
- struct ggml_tensor * b) {
5302
-
5303
- bool is_node = false;
5304
-
5305
- if (a->grad || b->grad) {
5306
- GGML_ASSERT(false); // TODO: implement backward
5307
- is_node = true;
5308
- }
5309
-
5310
- const int64_t ne[4] = {
5311
- b->ne[1],
5312
- b->ne[2],
5313
- a->ne[3],
5314
- b->ne[3],
5315
- };
5316
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5317
-
5318
- result->op = GGML_OP_CONV_2D_STAGE_1;
5251
+ result->op = GGML_OP_IM2COL;
5319
5252
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5320
5253
  result->src[0] = a;
5321
5254
  result->src[1] = b;
5322
5255
 
5323
5256
  return result;
5324
-
5325
5257
  }
5326
5258
 
5327
5259
  // a: [OC,IC, KH, KW]
5328
5260
  // b: [N, IC, IH, IW]
5329
5261
  // result: [N, OC, OH, OW]
5330
5262
  struct ggml_tensor * ggml_conv_2d(
5331
- struct ggml_context * ctx,
5332
- struct ggml_tensor * a,
5333
- struct ggml_tensor * b,
5334
- int s0,
5335
- int s1,
5336
- int p0,
5337
- int p1,
5338
- int d0,
5339
- int d1) {
5263
+ struct ggml_context * ctx,
5264
+ struct ggml_tensor * a,
5265
+ struct ggml_tensor * b,
5266
+ int s0,
5267
+ int s1,
5268
+ int p0,
5269
+ int p1,
5270
+ int d0,
5271
+ int d1) {
5272
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
5340
5273
 
5341
- struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
5342
- result = ggml_conv_2d_stage_1(ctx, a, result);
5274
+ struct ggml_tensor * result =
5275
+ ggml_mul_mat(ctx,
5276
+ ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
5277
+ ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
5343
5278
 
5344
- return result;
5279
+ result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
5345
5280
 
5281
+ return result;
5346
5282
  }
5347
5283
 
5348
5284
  // ggml_conv_2d_sk_p0
@@ -5402,7 +5338,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
5402
5338
 
5403
5339
  // ggml_pool_*
5404
5340
 
5405
- static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
5341
+ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
5406
5342
  return (ins + 2 * p - ks) / s + 1;
5407
5343
  }
5408
5344
 
@@ -5449,8 +5385,8 @@ struct ggml_tensor * ggml_pool_2d(
5449
5385
  int k1,
5450
5386
  int s0,
5451
5387
  int s1,
5452
- int p0,
5453
- int p1) {
5388
+ float p0,
5389
+ float p1) {
5454
5390
 
5455
5391
  bool is_node = false;
5456
5392
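ggml_pool_2d() and ggml_calc_pool_output_size() now take float padding instead of int; since the size formula divides before truncating, fractional padding can change the computed output length. A small sketch of the arithmetic, with illustrative values:

```c
#include <stdio.h>

// Same formula as ggml_calc_pool_output_size, now with float padding.
static long pool_out_size(long ins, int ks, int s, float p) {
    return (long)((ins + 2*p - ks)/s + 1);
}

int main(void) {
    // e.g. a length-7 input, kernel 3, stride 2:
    printf("p=0   -> %ld\n", pool_out_size(7, 3, 2, 0.0f)); // 3
    printf("p=0.5 -> %ld\n", pool_out_size(7, 3, 2, 0.5f)); // 3
    printf("p=1   -> %ld\n", pool_out_size(7, 3, 2, 1.0f)); // 4
    return 0;
}
```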
 
@@ -8912,6 +8848,48 @@ static void ggml_compute_forward_silu(
8912
8848
  }
8913
8849
  }
8914
8850
 
8851
+ // ggml_compute_forward_leaky
8852
+
8853
+ static void ggml_compute_forward_leaky_f32(
8854
+ const struct ggml_compute_params * params,
8855
+ const struct ggml_tensor * src0,
8856
+ struct ggml_tensor * dst) {
8857
+ assert(params->ith == 0);
8858
+ assert(ggml_are_same_shape(src0, dst));
8859
+
8860
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
8861
+ return;
8862
+ }
8863
+
8864
+ const int n = ggml_nrows(src0);
8865
+ const int nc = src0->ne[0];
8866
+
8867
+ assert(dst->nb[0] == sizeof(float));
8868
+ assert(src0->nb[0] == sizeof(float));
8869
+
8870
+ for (int i = 0; i < n; i++) {
8871
+ ggml_vec_leaky_f32(nc,
8872
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
8873
+ (float *) ((char *) src0->data + i*(src0->nb[1])));
8874
+ }
8875
+ }
8876
+
8877
+ static void ggml_compute_forward_leaky(
8878
+ const struct ggml_compute_params * params,
8879
+ const struct ggml_tensor * src0,
8880
+ struct ggml_tensor * dst) {
8881
+ switch (src0->type) {
8882
+ case GGML_TYPE_F32:
8883
+ {
8884
+ ggml_compute_forward_leaky_f32(params, src0, dst);
8885
+ } break;
8886
+ default:
8887
+ {
8888
+ GGML_ASSERT(false);
8889
+ } break;
8890
+ }
8891
+ }
8892
+
8915
8893
  // ggml_compute_forward_silu_back
8916
8894
 
8917
8895
  static void ggml_compute_forward_silu_back_f32(
@@ -9395,6 +9373,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
9395
9373
  // TODO: find the optimal values for these
9396
9374
  if (ggml_is_contiguous(src0) &&
9397
9375
  ggml_is_contiguous(src1) &&
9376
+ src0->type == GGML_TYPE_F32 &&
9377
+ src1->type == GGML_TYPE_F32 &&
9398
9378
  (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
9399
9379
 
9400
9380
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9433,7 +9413,7 @@ static void ggml_compute_forward_mul_mat(
9433
9413
 
9434
9414
  // we don't support permuted src0 or src1
9435
9415
  GGML_ASSERT(nb00 == ggml_type_size(type));
9436
- GGML_ASSERT(nb10 == sizeof(float));
9416
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
9437
9417
 
9438
9418
  // dst cannot be transposed or permuted
9439
9419
  GGML_ASSERT(nb0 == sizeof(float));
@@ -10974,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
10974
10954
  const struct ggml_compute_params * params,
10975
10955
  const struct ggml_tensor * src0,
10976
10956
  const struct ggml_tensor * src1,
10977
- struct ggml_tensor * dst) {
10957
+ struct ggml_tensor * dst,
10958
+ const bool forward) {
10978
10959
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
10979
10960
  return;
10980
10961
  }
@@ -11033,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
11033
11014
  const bool is_neox = mode & 2;
11034
11015
  const bool is_glm = mode & 4;
11035
11016
 
11017
+ // backward process uses inverse rotation by cos and sin.
11018
+ // cos and sin build a rotation matrix, where the inverse is the transpose.
11019
+ // this essentially just switches the sign of sin.
11020
+ const float sin_sign = forward ? 1.0f : -1.0f;
11021
+
11036
11022
  const int32_t * pos = (const int32_t *) src1->data;
11037
11023
 
11038
11024
  for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11049,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
11049
11035
  float block_theta = MAX(p - (n_ctx - 2), 0);
11050
11036
  for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
11051
11037
  const float cos_theta = cosf(theta_base);
11052
- const float sin_theta = sinf(theta_base);
11038
+ const float sin_theta = sinf(theta_base) * sin_sign;
11053
11039
  const float cos_block_theta = cosf(block_theta);
11054
- const float sin_block_theta = sinf(block_theta);
11040
+ const float sin_block_theta = sinf(block_theta) * sin_sign;
11055
11041
 
11056
11042
  theta_base *= theta_scale;
11057
11043
  block_theta *= theta_scale;
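The forward flag threaded through the rope kernels implements the comment above: a 2D rotation by theta is orthogonal, so its inverse is its transpose, which is the same rotation with the sign of sin flipped. In matrix form:

$$
R(\theta) =
\begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix},
\qquad
R(\theta)^{-1} = R(\theta)^{\mathsf T} = R(-\theta) =
\begin{pmatrix} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{pmatrix},
$$

so the backward pass (rope_back) can reuse the forward kernels with sin_sign = -1.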
@@ -11075,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
11075
11061
  rope_yarn(
11076
11062
  theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11077
11063
  );
11064
+ sin_theta *= sin_sign;
11078
11065
 
11079
11066
  // zeta scaling for xPos only:
11080
11067
  float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11105,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
11105
11092
  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
11106
11093
  &cos_theta, &sin_theta
11107
11094
  );
11095
+ sin_theta *= sin_sign;
11108
11096
 
11109
11097
  theta_base *= theta_scale;
11110
11098
 
@@ -11130,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
11130
11118
  const struct ggml_compute_params * params,
11131
11119
  const struct ggml_tensor * src0,
11132
11120
  const struct ggml_tensor * src1,
11133
- struct ggml_tensor * dst) {
11121
+ struct ggml_tensor * dst,
11122
+ const bool forward) {
11134
11123
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11135
11124
  return;
11136
11125
  }
@@ -11182,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
11182
11171
  const bool is_neox = mode & 2;
11183
11172
  const bool is_glm = mode & 4;
11184
11173
 
11174
+ // backward process uses inverse rotation by cos and sin.
11175
+ // cos and sin build a rotation matrix, where the inverse is the transpose.
11176
+ // this essentially just switches the sign of sin.
11177
+ const float sin_sign = forward ? 1.0f : -1.0f;
11178
+
11185
11179
  const int32_t * pos = (const int32_t *) src1->data;
11186
11180
 
11187
11181
  for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11198,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
11198
11192
  float block_theta = MAX(p - (n_ctx - 2), 0);
11199
11193
  for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
11200
11194
  const float cos_theta = cosf(theta_base);
11201
- const float sin_theta = sinf(theta_base);
11195
+ const float sin_theta = sinf(theta_base) * sin_sign;
11202
11196
  const float cos_block_theta = cosf(block_theta);
11203
- const float sin_block_theta = sinf(block_theta);
11197
+ const float sin_block_theta = sinf(block_theta) * sin_sign;
11204
11198
 
11205
11199
  theta_base *= theta_scale;
11206
11200
  block_theta *= theta_scale;
@@ -11224,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
11224
11218
  rope_yarn(
11225
11219
  theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
11226
11220
  );
11221
+ sin_theta *= sin_sign;
11227
11222
 
11228
11223
  theta_base *= theta_scale;
11229
11224
 
@@ -11250,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
11250
11245
  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
11251
11246
  &cos_theta, &sin_theta
11252
11247
  );
11248
+ sin_theta *= sin_sign;
11253
11249
 
11254
11250
  theta_base *= theta_scale;
11255
11251
 
@@ -11279,11 +11275,11 @@ static void ggml_compute_forward_rope(
11279
11275
  switch (src0->type) {
11280
11276
  case GGML_TYPE_F16:
11281
11277
  {
11282
- ggml_compute_forward_rope_f16(params, src0, src1, dst);
11278
+ ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
11283
11279
  } break;
11284
11280
  case GGML_TYPE_F32:
11285
11281
  {
11286
- ggml_compute_forward_rope_f32(params, src0, src1, dst);
11282
+ ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
11287
11283
  } break;
11288
11284
  default:
11289
11285
  {
@@ -11294,693 +11290,73 @@ static void ggml_compute_forward_rope(
11294
11290
 
11295
11291
  // ggml_compute_forward_rope_back
11296
11292
 
11297
- static void ggml_compute_forward_rope_back_f32(
11293
+ static void ggml_compute_forward_rope_back(
11298
11294
  const struct ggml_compute_params * params,
11299
11295
  const struct ggml_tensor * src0,
11300
11296
  const struct ggml_tensor * src1,
11301
11297
  struct ggml_tensor * dst) {
11302
-
11303
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11304
- return;
11298
+ switch (src0->type) {
11299
+ case GGML_TYPE_F16:
11300
+ {
11301
+ ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
11302
+ } break;
11303
+ case GGML_TYPE_F32:
11304
+ {
11305
+ ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
11306
+ } break;
11307
+ default:
11308
+ {
11309
+ GGML_ASSERT(false);
11310
+ } break;
11305
11311
  }
11312
+ }
11306
11313
 
11307
- // y = rope(x, src1)
11308
- // dx = rope_back(dy, src1)
11309
- // src0 is dy, src1 contains options
11310
-
11311
- float freq_base;
11312
- float freq_scale;
11313
-
11314
- // these two only relevant for xPos RoPE:
11315
- float xpos_base;
11316
- bool xpos_down;
11317
-
11318
- //const int n_past = ((int32_t *) dst->op_params)[0];
11319
- const int n_dims = ((int32_t *) dst->op_params)[1];
11320
- const int mode = ((int32_t *) dst->op_params)[2];
11321
- const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
11322
- memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
11323
- memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
11324
- memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
11325
- memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
11314
+ // ggml_compute_forward_conv_transpose_1d
11326
11315
 
11327
- GGML_TENSOR_UNARY_OP_LOCALS
11316
+ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11317
+ const struct ggml_compute_params * params,
11318
+ const struct ggml_tensor * src0,
11319
+ const struct ggml_tensor * src1,
11320
+ struct ggml_tensor * dst) {
11321
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
11322
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
11323
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
11328
11324
 
11329
- //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
11330
- //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
11325
+ int64_t t0 = ggml_perf_time_us();
11326
+ UNUSED(t0);
11331
11327
 
11332
- assert(nb0 == sizeof(float));
11328
+ GGML_TENSOR_BINARY_OP_LOCALS
11333
11329
 
11334
11330
  const int ith = params->ith;
11335
11331
  const int nth = params->nth;
11336
11332
 
11337
- const int nr = ggml_nrows(dst);
11333
+ const int nk = ne00*ne01*ne02;
11338
11334
 
11339
- // rows per thread
11340
- const int dr = (nr + nth - 1)/nth;
11335
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11336
+ GGML_ASSERT(nb10 == sizeof(float));
11341
11337
 
11342
- // row range for this thread
11343
- const int ir0 = dr*ith;
11344
- const int ir1 = MIN(ir0 + dr, nr);
11338
+ if (params->type == GGML_TASK_INIT) {
11339
+ memset(params->wdata, 0, params->wsize);
11345
11340
 
11346
- // row index used to determine which thread to use
11347
- int ir = 0;
11341
+ // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11342
+ {
11343
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11348
11344
 
11349
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
11345
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
11346
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
11347
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11348
+ ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11349
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
11350
+ dst_data[i00*ne02 + i02] = src[i00];
11351
+ }
11352
+ }
11353
+ }
11354
+ }
11350
11355
 
11351
- const bool is_neox = mode & 2;
11352
-
11353
- const int32_t * pos = (const int32_t *) src1->data;
11354
-
11355
- for (int64_t i3 = 0; i3 < ne3; i3++) {
11356
- for (int64_t i2 = 0; i2 < ne2; i2++) {
11357
- const int64_t p = pos[i2];
11358
- for (int64_t i1 = 0; i1 < ne1; i1++) {
11359
- if (ir++ < ir0) continue;
11360
- if (ir > ir1) break;
11361
-
11362
- float theta_base = freq_scale * (float)p;
11363
-
11364
- if (!is_neox) {
11365
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11366
- const float cos_theta = cosf(theta_base);
11367
- const float sin_theta = sinf(theta_base);
11368
-
11369
- // zeta scaling for xPos only:
11370
- float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
11371
- if (xpos_down) zeta = 1.0f / zeta;
11372
-
11373
- theta_base *= theta_scale;
11374
-
11375
- const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11376
- float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11377
-
11378
- const float dy0 = dy[0];
11379
- const float dy1 = dy[1];
11380
-
11381
- dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
11382
- dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
11383
- }
11384
- } else {
11385
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11386
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
11387
- const float cos_theta = cosf(theta_base);
11388
- const float sin_theta = sinf(theta_base);
11389
-
11390
- theta_base *= theta_scale;
11391
-
11392
- const int64_t i0 = ib*n_dims + ic/2;
11393
-
11394
- const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11395
- float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11396
-
11397
- const float dy0 = dy[0];
11398
- const float dy1 = dy[n_dims/2];
11399
-
11400
- dx[0] = dy0*cos_theta + dy1*sin_theta;
11401
- dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
11402
- }
11403
- }
11404
- }
11405
- }
11406
- }
11407
- }
11408
- }
11409
-
11410
- static void ggml_compute_forward_rope_back_f16(
11411
- const struct ggml_compute_params * params,
11412
- const struct ggml_tensor * src0,
11413
- const struct ggml_tensor * src1,
11414
- struct ggml_tensor * dst) {
11415
-
11416
- if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11417
- return;
11418
- }
11419
-
11420
- // y = rope(x, src1)
11421
- // dx = rope_back(dy, src1)
11422
- // src0 is dy, src1 contains options
11423
-
11424
- //const int n_past = ((int32_t *) dst->op_params)[0];
11425
- const int n_dims = ((int32_t *) dst->op_params)[1];
11426
- const int mode = ((int32_t *) dst->op_params)[2];
11427
-
11428
- GGML_TENSOR_UNARY_OP_LOCALS
11429
-
11430
- //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
11431
- //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
11432
-
11433
- assert(nb0 == sizeof(ggml_fp16_t));
11434
-
11435
- const int ith = params->ith;
11436
- const int nth = params->nth;
11437
-
11438
- const int nr = ggml_nrows(dst);
11439
-
11440
- // rows per thread
11441
- const int dr = (nr + nth - 1)/nth;
11442
-
11443
- // row range for this thread
11444
- const int ir0 = dr*ith;
11445
- const int ir1 = MIN(ir0 + dr, nr);
11446
-
11447
- // row index used to determine which thread to use
11448
- int ir = 0;
11449
-
11450
- const float theta_scale = powf(10000.0, -2.0f/n_dims);
11451
-
11452
- const bool is_neox = mode & 2;
11453
-
11454
- const int32_t * pos = (const int32_t *) src1->data;
11455
-
11456
- for (int64_t i3 = 0; i3 < ne3; i3++) {
11457
- for (int64_t i2 = 0; i2 < ne2; i2++) {
11458
- const int64_t p = pos[i2];
11459
- for (int64_t i1 = 0; i1 < ne1; i1++) {
11460
- if (ir++ < ir0) continue;
11461
- if (ir > ir1) break;
11462
-
11463
- float theta_base = (float)p;
11464
-
11465
- if (!is_neox) {
11466
- for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
11467
- const float cos_theta = cosf(theta_base);
11468
- const float sin_theta = sinf(theta_base);
11469
-
11470
- theta_base *= theta_scale;
11471
-
11472
- const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11473
- ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11474
-
11475
- const float dy0 = GGML_FP16_TO_FP32(dy[0]);
11476
- const float dy1 = GGML_FP16_TO_FP32(dy[1]);
11477
-
11478
- dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
11479
- dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
11480
- }
11481
- } else {
11482
- for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
11483
- for (int64_t ic = 0; ic < n_dims; ic += 2) {
11484
- const float cos_theta = cosf(theta_base);
11485
- const float sin_theta = sinf(theta_base);
11486
-
11487
- theta_base *= theta_scale;
11488
-
11489
- const int64_t i0 = ib*n_dims + ic/2;
11490
-
11491
- const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
11492
- ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
11493
-
11494
- const float dy0 = GGML_FP16_TO_FP32(dy[0]);
11495
- const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
11496
-
11497
- dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
11498
- dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
11499
- }
11500
- }
11501
- }
11502
- }
11503
- }
11504
- }
11505
- }
11506
-
11507
- static void ggml_compute_forward_rope_back(
11508
- const struct ggml_compute_params * params,
11509
- const struct ggml_tensor * src0,
11510
- const struct ggml_tensor * src1,
11511
- struct ggml_tensor * dst) {
11512
- switch (src0->type) {
11513
- case GGML_TYPE_F16:
11514
- {
11515
- ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
11516
- } break;
11517
- case GGML_TYPE_F32:
11518
- {
11519
- ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
11520
- } break;
11521
- default:
11522
- {
11523
- GGML_ASSERT(false);
11524
- } break;
11525
- }
11526
- }
11527
-
11528
- // ggml_compute_forward_conv_1d
11529
-
11530
- static void ggml_compute_forward_conv_1d_f16_f32(
11531
- const struct ggml_compute_params * params,
11532
- const struct ggml_tensor * src0,
11533
- const struct ggml_tensor * src1,
11534
- struct ggml_tensor * dst) {
11535
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11536
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11537
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11538
-
11539
- int64_t t0 = ggml_perf_time_us();
11540
- UNUSED(t0);
11541
-
11542
- GGML_TENSOR_BINARY_OP_LOCALS
11543
-
11544
- const int ith = params->ith;
11545
- const int nth = params->nth;
11546
-
11547
- const int nk = ne00;
11548
-
11549
- // size of the convolution row - the kernel size unrolled across all input channels
11550
- const int ew0 = nk*ne01;
11551
-
11552
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11553
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11554
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11555
-
11556
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11557
- GGML_ASSERT(nb10 == sizeof(float));
11558
-
11559
- if (params->type == GGML_TASK_INIT) {
11560
- memset(params->wdata, 0, params->wsize);
11561
-
11562
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11563
-
11564
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11565
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11566
- ggml_fp16_t * dst_data = wdata;
11567
-
11568
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11569
- for (int64_t ik = 0; ik < nk; ik++) {
11570
- const int idx0 = i0*s0 + ik*d0 - p0;
11571
-
11572
- if(!(idx0 < 0 || idx0 >= ne10)) {
11573
- dst_data[i0*ew0 + i11*nk + ik] = GGML_FP32_TO_FP16(src[idx0]);
11574
- }
11575
- }
11576
- }
11577
- }
11578
-
11579
- return;
11580
- }
11581
-
11582
- if (params->type == GGML_TASK_FINALIZE) {
11583
- return;
11584
- }
11585
-
11586
- // total rows in dst
11587
- const int nr = ne2;
11588
-
11589
- // rows per thread
11590
- const int dr = (nr + nth - 1)/nth;
11591
-
11592
- // row range for this thread
11593
- const int ir0 = dr*ith;
11594
- const int ir1 = MIN(ir0 + dr, nr);
11595
-
11596
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11597
-
11598
- for (int i2 = 0; i2 < ne2; i2++) {
11599
- for (int i1 = ir0; i1 < ir1; i1++) {
11600
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11601
-
11602
- for (int i0 = 0; i0 < ne0; i0++) {
11603
- ggml_vec_dot_f16(ew0, dst_data + i0,
11604
- (ggml_fp16_t *) ((char *) src0->data + i1*nb02),
11605
- (ggml_fp16_t *) wdata + i2*nb2 + i0*ew0);
11606
- }
11607
- }
11608
- }
11609
- }
11610
-
11611
- static void ggml_compute_forward_conv_1d_f32(
11612
- const struct ggml_compute_params * params,
11613
- const struct ggml_tensor * src0,
11614
- const struct ggml_tensor * src1,
11615
- struct ggml_tensor * dst) {
11616
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
11617
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11618
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11619
-
11620
- int64_t t0 = ggml_perf_time_us();
11621
- UNUSED(t0);
11622
-
11623
- GGML_TENSOR_BINARY_OP_LOCALS
11624
-
11625
- const int ith = params->ith;
11626
- const int nth = params->nth;
11627
-
11628
- const int nk = ne00;
11629
-
11630
- const int ew0 = nk*ne01;
11631
-
11632
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11633
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11634
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11635
-
11636
- GGML_ASSERT(nb00 == sizeof(float));
11637
- GGML_ASSERT(nb10 == sizeof(float));
11638
-
11639
- if (params->type == GGML_TASK_INIT) {
11640
- memset(params->wdata, 0, params->wsize);
11641
-
11642
- float * const wdata = (float *) params->wdata + 0;
11643
-
11644
- for (int64_t i11 = 0; i11 < ne11; i11++) {
11645
- const float * const src = (float *)((char *) src1->data + i11*nb11);
11646
- float * dst_data = wdata;
11647
-
11648
- for (int64_t i0 = 0; i0 < ne0; i0++) {
11649
- for (int64_t ik = 0; ik < nk; ik++) {
11650
- const int idx0 = i0*s0 + ik*d0 - p0;
11651
-
11652
- if(!(idx0 < 0 || idx0 >= ne10)) {
11653
- dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
11654
- }
11655
- }
11656
- }
11657
- }
11658
-
11659
- return;
11660
- }
11661
-
11662
- if (params->type == GGML_TASK_FINALIZE) {
11663
- return;
11664
- }
11665
-
11666
- // total rows in dst
11667
- const int nr = ne02;
11668
-
11669
- // rows per thread
11670
- const int dr = (nr + nth - 1)/nth;
11671
-
11672
- // row range for this thread
11673
- const int ir0 = dr*ith;
11674
- const int ir1 = MIN(ir0 + dr, nr);
11675
-
11676
- float * const wdata = (float *) params->wdata + 0;
11677
-
11678
- for (int i2 = 0; i2 < ne2; i2++) {
11679
- for (int i1 = ir0; i1 < ir1; i1++) {
11680
- float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
11681
-
11682
- for (int i0 = 0; i0 < ne0; i0++) {
11683
- ggml_vec_dot_f32(ew0, dst_data + i0,
11684
- (float *) ((char *) src0->data + i1*nb02),
11685
- (float *) wdata + i2*nb2 + i0*ew0);
11686
- }
11687
- }
11688
- }
11689
- }
11690
-
11691
- // TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
11692
- static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
11693
- ggml_fp16_t * A,
11694
- ggml_fp16_t * B,
11695
- float * C,
11696
- const int ith, const int nth) {
11697
- // does not seem to make a difference
11698
- int64_t m0, m1, n0, n1;
11699
- // patches per thread
11700
- if (m > n) {
11701
- n0 = 0;
11702
- n1 = n;
11703
-
11704
- // total patches in dst
11705
- const int np = m;
11706
-
11707
- // patches per thread
11708
- const int dp = (np + nth - 1)/nth;
11709
-
11710
- // patch range for this thread
11711
- m0 = dp*ith;
11712
- m1 = MIN(m0 + dp, np);
11713
- } else {
11714
- m0 = 0;
11715
- m1 = m;
11716
-
11717
- // total patches in dst
11718
- const int np = n;
11719
-
11720
- // patches per thread
11721
- const int dp = (np + nth - 1)/nth;
11722
-
11723
- // patch range for this thread
11724
- n0 = dp*ith;
11725
- n1 = MIN(n0 + dp, np);
11726
- }
11727
-
11728
- // block-tiling attempt
11729
- int64_t blck_n = 16;
11730
- int64_t blck_m = 16;
11731
-
11732
- // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
11733
- // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(ggml_fp16_t) * K);
11734
- // if (blck_size > 0) {
11735
- // blck_0 = 4;
11736
- // blck_1 = blck_size / blck_0;
11737
- // if (blck_1 < 0) {
11738
- // blck_1 = 1;
11739
- // }
11740
- // // blck_0 = (int64_t)sqrt(blck_size);
11741
- // // blck_1 = blck_0;
11742
- // }
11743
- // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
11744
-
11745
- for (int j = n0; j < n1; j+=blck_n) {
11746
- for (int i = m0; i < m1; i+=blck_m) {
11747
- // printf("i j k => %d %d %d\n", i, j, K);
11748
- for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
11749
- for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
11750
- ggml_vec_dot_f16(k,
11751
- C + ii*n + jj,
11752
- A + ii * k,
11753
- B + jj * k);
11754
- }
11755
- }
11756
- }
11757
- }
11758
- }
11759
-
11760
- // src0: kernel [OC, IC, K]
11761
- // src1: signal [N, IC, IL]
11762
- // dst: result [N, OL, IC*K]
11763
- static void ggml_compute_forward_conv_1d_stage_0_f32(
11764
- const struct ggml_compute_params * params,
11765
- const struct ggml_tensor * src0,
11766
- const struct ggml_tensor * src1,
11767
- struct ggml_tensor * dst) {
11768
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11769
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11770
- GGML_ASSERT( dst->type == GGML_TYPE_F16);
11771
-
11772
- int64_t t0 = ggml_perf_time_us();
11773
- UNUSED(t0);
11774
-
11775
- GGML_TENSOR_BINARY_OP_LOCALS;
11776
-
11777
- const int64_t N = ne12;
11778
- const int64_t IC = ne11;
11779
- const int64_t IL = ne10;
11780
-
11781
- const int64_t K = ne00;
11782
-
11783
- const int64_t OL = ne1;
11784
-
11785
- const int ith = params->ith;
11786
- const int nth = params->nth;
11787
-
11788
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
11789
- const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
11790
- const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
11791
-
11792
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11793
- GGML_ASSERT(nb10 == sizeof(float));
11794
-
11795
- if (params->type == GGML_TASK_INIT) {
11796
- memset(dst->data, 0, ggml_nbytes(dst));
11797
- return;
11798
- }
11799
-
11800
- if (params->type == GGML_TASK_FINALIZE) {
11801
- return;
11802
- }
11803
-
11804
- // im2col: [N, IC, IL] => [N, OL, IC*K]
11805
- {
11806
- ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
11807
-
11808
- for (int64_t in = 0; in < N; in++) {
11809
- for (int64_t iol = 0; iol < OL; iol++) {
11810
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11811
-
11812
- // micro kernel
11813
- ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
11814
- const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
11815
-
11816
- for (int64_t ik = 0; ik < K; ik++) {
11817
- const int64_t iil = iol*s0 + ik*d0 - p0;
11818
-
11819
- if (!(iil < 0 || iil >= IL)) {
11820
- dst_data[iic*K + ik] = GGML_FP32_TO_FP16(src_data[iil]);
11821
- }
11822
- }
11823
- }
11824
- }
11825
- }
11826
- }
11827
- }
11828
-
11829
- // gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11830
- // src0: [OC, IC, K]
11831
- // src1: [N, OL, IC * K]
11832
- // result: [N, OC, OL]
11833
- static void ggml_compute_forward_conv_1d_stage_1_f16(
11834
- const struct ggml_compute_params * params,
11835
- const struct ggml_tensor * src0,
11836
- const struct ggml_tensor * src1,
11837
- struct ggml_tensor * dst) {
11838
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11839
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
11840
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11841
-
11842
- int64_t t0 = ggml_perf_time_us();
11843
- UNUSED(t0);
11844
-
11845
- if (params->type == GGML_TASK_INIT) {
11846
- return;
11847
- }
11848
-
11849
- if (params->type == GGML_TASK_FINALIZE) {
11850
- return;
11851
- }
11852
-
11853
- GGML_TENSOR_BINARY_OP_LOCALS;
11854
-
11855
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11856
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
11857
- GGML_ASSERT(nb0 == sizeof(float));
11858
-
11859
- const int N = ne12;
11860
- const int OL = ne11;
11861
-
11862
- const int OC = ne02;
11863
- const int IC = ne01;
11864
- const int K = ne00;
11865
-
11866
- const int ith = params->ith;
11867
- const int nth = params->nth;
11868
-
11869
- int64_t m = OC;
11870
- int64_t n = OL;
11871
- int64_t k = IC * K;
11872
-
11873
- // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
11874
- for (int i = 0; i < N; i++) {
11875
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
11876
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
11877
- float * C = (float *)dst->data + i * m * n; // [m, n]
11878
-
11879
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
11880
- }
11881
- }
11882
-
11883
- static void ggml_compute_forward_conv_1d(
11884
- const struct ggml_compute_params * params,
11885
- const struct ggml_tensor * src0,
11886
- const struct ggml_tensor * src1,
11887
- struct ggml_tensor * dst) {
11888
- switch(src0->type) {
11889
- case GGML_TYPE_F16:
11890
- {
11891
- ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
11892
- } break;
11893
- case GGML_TYPE_F32:
11894
- {
11895
- ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
11896
- } break;
11897
- default:
11898
- {
11899
- GGML_ASSERT(false);
11900
- } break;
11901
- }
11902
- }
11903
-
11904
- static void ggml_compute_forward_conv_1d_stage_0(
11905
- const struct ggml_compute_params * params,
11906
- const struct ggml_tensor * src0,
11907
- const struct ggml_tensor * src1,
11908
- struct ggml_tensor * dst) {
11909
- switch(src0->type) {
11910
- case GGML_TYPE_F16:
11911
- {
11912
- ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
11913
- } break;
11914
- default:
11915
- {
11916
- GGML_ASSERT(false);
11917
- } break;
11918
- }
11919
- }
11920
-
11921
- static void ggml_compute_forward_conv_1d_stage_1(
11922
- const struct ggml_compute_params * params,
11923
- const struct ggml_tensor * src0,
11924
- const struct ggml_tensor * src1,
11925
- struct ggml_tensor * dst) {
11926
- switch(src0->type) {
11927
- case GGML_TYPE_F16:
11928
- {
11929
- ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
11930
- } break;
11931
- default:
11932
- {
11933
- GGML_ASSERT(false);
11934
- } break;
11935
- }
11936
- }
11937
-
11938
- // ggml_compute_forward_conv_transpose_1d
11939
-
11940
- static void ggml_compute_forward_conv_transpose_1d_f16_f32(
11941
- const struct ggml_compute_params * params,
11942
- const struct ggml_tensor * src0,
11943
- const struct ggml_tensor * src1,
11944
- struct ggml_tensor * dst) {
11945
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
11946
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
11947
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
11948
-
11949
- int64_t t0 = ggml_perf_time_us();
11950
- UNUSED(t0);
11951
-
11952
- GGML_TENSOR_BINARY_OP_LOCALS
11953
-
11954
- const int ith = params->ith;
11955
- const int nth = params->nth;
11956
-
11957
- const int nk = ne00*ne01*ne02;
11958
-
11959
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
11960
- GGML_ASSERT(nb10 == sizeof(float));
11961
-
11962
- if (params->type == GGML_TASK_INIT) {
11963
- memset(params->wdata, 0, params->wsize);
11964
-
11965
- // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
11966
- {
11967
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
11968
-
11969
- for (int64_t i02 = 0; i02 < ne02; i02++) {
11970
- for (int64_t i01 = 0; i01 < ne01; i01++) {
11971
- const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
11972
- ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
11973
- for (int64_t i00 = 0; i00 < ne00; i00++) {
11974
- dst_data[i00*ne02 + i02] = src[i00];
11975
- }
11976
- }
11977
- }
11978
- }
11979
-
11980
- // permute source data (src1) from (L x Cin) to (Cin x L)
11981
- {
11982
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11983
- ggml_fp16_t * dst_data = wdata;
11356
+ // permute source data (src1) from (L x Cin) to (Cin x L)
11357
+ {
11358
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
11359
+ ggml_fp16_t * dst_data = wdata;
11984
11360
 
11985
11361
  for (int64_t i11 = 0; i11 < ne11; i11++) {
11986
11362
  const float * const src = (float *)((char *) src1->data + i11*nb11);
@@ -12146,12 +11522,10 @@ static void ggml_compute_forward_conv_transpose_1d(
12146
11522
  }
12147
11523
  }
12148
11524
 
12149
- // ggml_compute_forward_conv_2d
12150
-
12151
11525
  // src0: kernel [OC, IC, KH, KW]
12152
11526
  // src1: image [N, IC, IH, IW]
12153
11527
  // dst: result [N, OH, OW, IC*KH*KW]
12154
- static void ggml_compute_forward_conv_2d_stage_0_f32(
11528
+ static void ggml_compute_forward_im2col_f16(
12155
11529
  const struct ggml_compute_params * params,
12156
11530
  const struct ggml_tensor * src0,
12157
11531
  const struct ggml_tensor * src1,
@@ -12165,34 +11539,35 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12165
11539
 
12166
11540
  GGML_TENSOR_BINARY_OP_LOCALS;
12167
11541
 
12168
- const int64_t N = ne13;
12169
- const int64_t IC = ne12;
12170
- const int64_t IH = ne11;
11542
+ const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
11543
+ const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
11544
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
11545
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
11546
+ const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
11547
+ const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
11548
+ const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
11549
+
11550
+ const int ith = params->ith;
11551
+ const int nth = params->nth;
11552
+
11553
+ const int64_t N = is_2D ? ne13 : ne12;
11554
+ const int64_t IC = is_2D ? ne12 : ne11;
11555
+ const int64_t IH = is_2D ? ne11 : 1;
12171
11556
  const int64_t IW = ne10;
12172
11557
 
12173
- // const int64_t OC = ne03;
12174
- // const int64_t IC = ne02;
12175
- const int64_t KH = ne01;
11558
+ const int64_t KH = is_2D ? ne01 : 1;
12176
11559
  const int64_t KW = ne00;
12177
11560
 
12178
- const int64_t OH = ne2;
11561
+ const int64_t OH = is_2D ? ne2 : 1;
12179
11562
  const int64_t OW = ne1;
12180
11563
 
12181
- const int ith = params->ith;
12182
- const int nth = params->nth;
12183
-
12184
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12185
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12186
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12187
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12188
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12189
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
11564
+ int ofs0 = is_2D ? nb13 : nb12;
11565
+ int ofs1 = is_2D ? nb12 : nb11;
12190
11566
 
12191
11567
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12192
11568
  GGML_ASSERT(nb10 == sizeof(float));
12193
11569
 
12194
11570
  if (params->type == GGML_TASK_INIT) {
12195
- memset(dst->data, 0, ggml_nbytes(dst));
12196
11571
  return;
12197
11572
  }
12198
11573
 
@@ -12205,20 +11580,22 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12205
11580
  ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
12206
11581
 
12207
11582
  for (int64_t in = 0; in < N; in++) {
12208
- for (int64_t ioh = 0; ioh < OH; ioh++) {
11583
+ for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
12209
11584
  for (int64_t iow = 0; iow < OW; iow++) {
12210
- for (int64_t iic = ith; iic < IC; iic+=nth) {
11585
+ for (int64_t iic = ith; iic < IC; iic += nth) {
12211
11586
 
12212
11587
  // micro kernel
12213
11588
  ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12214
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
11589
+ const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
12215
11590
 
12216
- for (int64_t ikh = 0; ikh < KH; ikh++) {
11591
+ for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
12217
11592
  for (int64_t ikw = 0; ikw < KW; ikw++) {
12218
11593
  const int64_t iiw = iow*s0 + ikw*d0 - p0;
12219
11594
  const int64_t iih = ioh*s1 + ikh*d1 - p1;
12220
11595
 
12221
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
11596
+ if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
11597
+ dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
11598
+ } else {
12222
11599
  dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12223
11600
  }
12224
11601
  }
@@ -12230,223 +11607,7 @@ static void ggml_compute_forward_conv_2d_stage_0_f32(
12230
11607
  }
12231
11608
  }
12232
11609
 
12233
- // gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12234
- // src0: [OC, IC, KH, KW]
12235
- // src1: [N, OH, OW, IC * KH * KW]
12236
- // result: [N, OC, OH, OW]
12237
- static void ggml_compute_forward_conv_2d_stage_1_f16(
12238
- const struct ggml_compute_params * params,
12239
- const struct ggml_tensor * src0,
12240
- const struct ggml_tensor * src1,
12241
- struct ggml_tensor * dst) {
12242
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12243
- GGML_ASSERT(src1->type == GGML_TYPE_F16);
12244
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12245
-
12246
- int64_t t0 = ggml_perf_time_us();
12247
- UNUSED(t0);
12248
-
12249
- if (params->type == GGML_TASK_INIT) {
12250
- return;
12251
- }
12252
-
12253
- if (params->type == GGML_TASK_FINALIZE) {
12254
- return;
12255
- }
12256
-
12257
- GGML_TENSOR_BINARY_OP_LOCALS;
12258
-
12259
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12260
- GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
12261
- GGML_ASSERT(nb0 == sizeof(float));
12262
-
12263
- const int N = ne13;
12264
- const int OH = ne12;
12265
- const int OW = ne11;
12266
-
12267
- const int OC = ne03;
12268
- const int IC = ne02;
12269
- const int KH = ne01;
12270
- const int KW = ne00;
12271
-
12272
- const int ith = params->ith;
12273
- const int nth = params->nth;
12274
-
12275
- int64_t m = OC;
12276
- int64_t n = OH * OW;
12277
- int64_t k = IC * KH * KW;
12278
-
12279
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12280
- for (int i = 0; i < N; i++) {
12281
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12282
- ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
12283
- float * C = (float *)dst->data + i * m * n; // [m, n]
12284
-
12285
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12286
- }
12287
- }
12288
-
12289
- static void ggml_compute_forward_conv_2d_f16_f32(
12290
- const struct ggml_compute_params * params,
12291
- const struct ggml_tensor * src0,
12292
- const struct ggml_tensor * src1,
12293
- struct ggml_tensor * dst) {
12294
- GGML_ASSERT(src0->type == GGML_TYPE_F16);
12295
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
12296
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
12297
-
12298
- int64_t t0 = ggml_perf_time_us();
12299
- UNUSED(t0);
12300
-
12301
- GGML_TENSOR_BINARY_OP_LOCALS
12302
-
12303
- // src1: image [N, IC, IH, IW]
12304
- // src0: kernel [OC, IC, KH, KW]
12305
- // dst: result [N, OC, OH, OW]
12306
- // ne12: IC
12307
- // ne0: OW
12308
- // ne1: OH
12309
- // nk0: KW
12310
- // nk1: KH
12311
- // ne13: N
12312
-
12313
- const int N = ne13;
12314
- const int IC = ne12;
12315
- const int IH = ne11;
12316
- const int IW = ne10;
12317
-
12318
- const int OC = ne03;
12319
- // const int IC = ne02;
12320
- const int KH = ne01;
12321
- const int KW = ne00;
12322
-
12323
- const int OH = ne1;
12324
- const int OW = ne0;
12325
-
12326
- const int ith = params->ith;
12327
- const int nth = params->nth;
12328
-
12329
- // const int nk0 = ne00;
12330
- // const int nk1 = ne01;
12331
-
12332
- // size of the convolution row - the kernel size unrolled across all channels
12333
- // const int ew0 = nk0*nk1*ne02;
12334
- // ew0: IC*KH*KW
12335
-
12336
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
12337
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
12338
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
12339
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
12340
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
12341
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
12342
-
12343
- GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
12344
- GGML_ASSERT(nb10 == sizeof(float));
12345
-
12346
- if (params->type == GGML_TASK_INIT) {
12347
- memset(params->wdata, 0, params->wsize);
12348
-
12349
- // prepare source data (src1)
12350
- // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
12351
-
12352
- {
12353
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12354
-
12355
- for (int in = 0; in < N; in++) {
12356
- for (int iic = 0; iic < IC; iic++) {
12357
- for (int ioh = 0; ioh < OH; ioh++) {
12358
- for (int iow = 0; iow < OW; iow++) {
12359
-
12360
- // micro kernel
12361
- ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
12362
- const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
12363
-
12364
- for (int ikh = 0; ikh < KH; ikh++) {
12365
- for (int ikw = 0; ikw < KW; ikw++) {
12366
- const int iiw = iow*s0 + ikw*d0 - p0;
12367
- const int iih = ioh*s1 + ikh*d1 - p1;
12368
-
12369
- if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
12370
- dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
12371
- }
12372
- }
12373
- }
12374
- }
12375
- }
12376
- }
12377
- }
12378
- }
12379
-
12380
- return;
12381
- }
12382
-
12383
- if (params->type == GGML_TASK_FINALIZE) {
12384
- return;
12385
- }
12386
-
12387
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
12388
- // wdata: [N*OH*OW, IC*KH*KW]
12389
- // dst: result [N, OC, OH, OW]
12390
- // src0: kernel [OC, IC, KH, KW]
12391
-
12392
- int64_t m = OC;
12393
- int64_t n = OH * OW;
12394
- int64_t k = IC * KH * KW;
12395
-
12396
- // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
12397
- for (int i = 0; i < N; i++) {
12398
- ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
12399
- ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
12400
- float * C = (float *)dst->data + i * m * n; // [m * k]
12401
-
12402
- gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
12403
- }
12404
- }
12405
-
12406
- static void ggml_compute_forward_conv_2d(
12407
- const struct ggml_compute_params * params,
12408
- const struct ggml_tensor * src0,
12409
- const struct ggml_tensor * src1,
12410
- struct ggml_tensor * dst) {
12411
- switch (src0->type) {
12412
- case GGML_TYPE_F16:
12413
- {
12414
- ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
12415
- } break;
12416
- case GGML_TYPE_F32:
12417
- {
12418
- //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
12419
- GGML_ASSERT(false);
12420
- } break;
12421
- default:
12422
- {
12423
- GGML_ASSERT(false);
12424
- } break;
12425
- }
12426
- }
12427
-
12428
- static void ggml_compute_forward_conv_2d_stage_0(
12429
- const struct ggml_compute_params * params,
12430
- const struct ggml_tensor * src0,
12431
- const struct ggml_tensor * src1,
12432
- struct ggml_tensor * dst) {
12433
- switch (src0->type) {
12434
- case GGML_TYPE_F16:
12435
- {
12436
- ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
12437
- } break;
12438
- case GGML_TYPE_F32:
12439
- {
12440
- GGML_ASSERT(false);
12441
- } break;
12442
- default:
12443
- {
12444
- GGML_ASSERT(false);
12445
- } break;
12446
- }
12447
- }
12448
-
12449
- static void ggml_compute_forward_conv_2d_stage_1(
11610
+ static void ggml_compute_forward_im2col(
12450
11611
  const struct ggml_compute_params * params,
12451
11612
  const struct ggml_tensor * src0,
12452
11613
  const struct ggml_tensor * src1,
@@ -12454,7 +11615,7 @@ static void ggml_compute_forward_conv_2d_stage_1(
12454
11615
  switch (src0->type) {
12455
11616
  case GGML_TYPE_F16:
12456
11617
  {
12457
- ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
11618
+ ggml_compute_forward_im2col_f16(params, src0, src1, dst);
12458
11619
  } break;
12459
11620
  case GGML_TYPE_F32:
12460
11621
  {
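
The hunks above collapse the old conv_2d / conv_2d_stage_0 / conv_2d_stage_1 kernels into a single GGML_OP_IM2COL forward: the data rearrangement produces [N*OH*OW, IC*KH*KW] rows, and the convolution itself becomes the GEMM [N, OC, OH, OW] = [OC, IC*KH*KW] x [N*OH*OW, IC*KH*KW] described in the removed comments. A minimal, self-contained sketch of that indexing (plain C, hypothetical helper name, float data instead of the F16 wdata used above):

    #include <string.h>

    // Gather one [IC*KH*KW] patch per output position (oh, ow); taps that fall
    // outside the input stay zero, like the memset + bounds check above.
    static void im2col_sketch_f32(
            const float * src, float * dst,   // src: [IC, IH, IW], dst: [OH*OW, IC*KH*KW]
            int IC, int IH, int IW, int KH, int KW, int OH, int OW,
            int s0, int s1, int p0, int p1, int d0, int d1) {
        memset(dst, 0, (size_t)OH*OW*IC*KH*KW*sizeof(float));
        for (int oh = 0; oh < OH; oh++) {
            for (int ow = 0; ow < OW; ow++) {
                float * row = dst + ((size_t)oh*OW + ow)*IC*KH*KW; // one GEMM row
                for (int ic = 0; ic < IC; ic++) {
                    for (int kh = 0; kh < KH; kh++) {
                        for (int kw = 0; kw < KW; kw++) {
                            const int ih = oh*s1 + kh*d1 - p1;
                            const int iw = ow*s0 + kw*d0 - p0;
                            if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
                                row[ic*KH*KW + kh*KW + kw] = src[((size_t)ic*IH + ih)*IW + iw];
                            }
                        }
                    }
                }
            }
        }
    }

Each of the N*OH*OW rows is then multiplied against the [OC, IC*KH*KW] kernel matrix, which is what gemm_f16_out_f32 did per batch element in the removed code.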
@@ -12639,14 +11800,11 @@ static void ggml_compute_forward_pool_1d(
12639
11800
  ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
12640
11801
  }
12641
11802
 
12642
- // ggml_compute_forward_pool_2d_sk_p0
11803
+ // ggml_compute_forward_pool_2d
12643
11804
 
12644
- static void ggml_compute_forward_pool_2d_sk_p0(
11805
+ static void ggml_compute_forward_pool_2d(
12645
11806
  const struct ggml_compute_params * params,
12646
- const enum ggml_op_pool op,
12647
11807
  const struct ggml_tensor * src,
12648
- const int k0,
12649
- const int k1,
12650
11808
  struct ggml_tensor * dst) {
12651
11809
  assert(src->type == GGML_TYPE_F32);
12652
11810
  assert(params->ith == 0);
@@ -12655,6 +11813,14 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12655
11813
  return;
12656
11814
  }
12657
11815
 
11816
+ const int32_t * opts = (const int32_t *)dst->op_params;
11817
+ enum ggml_op_pool op = opts[0];
11818
+ const int k0 = opts[1];
11819
+ const int k1 = opts[2];
11820
+ const int s0 = opts[3];
11821
+ const int s1 = opts[4];
11822
+ const int p0 = opts[5];
11823
+ const int p1 = opts[6];
12658
11824
  const char * cdata = (const char*)src->data;
12659
11825
  const char * const data_end = cdata + ggml_nbytes(src);
12660
11826
 
@@ -12665,6 +11831,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12665
11831
  float * dplane = (float *)dst->data;
12666
11832
 
12667
11833
  const int ka = k0 * k1;
11834
+ const int offset0 = -p0;
11835
+ const int offset1 = -p1;
12668
11836
 
12669
11837
  while (cdata < data_end) {
12670
11838
  for (int oy = 0; oy < py; ++oy) {
@@ -12677,13 +11845,15 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12677
11845
  case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
12678
11846
  }
12679
11847
 
12680
- const int ix = ox * k0;
12681
- const int iy = oy * k1;
11848
+ const int ix = offset0 + ox * s0;
11849
+ const int iy = offset1 + oy * s1;
12682
11850
 
12683
11851
  for (int ky = 0; ky < k1; ++ky) {
11852
+ if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
12684
11853
  const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
12685
11854
  for (int kx = 0; kx < k0; ++kx) {
12686
11855
  int j = ix + kx;
11856
+ if (j < 0 || j >= src->ne[0]) continue;
12687
11857
  switch (op) {
12688
11858
  case GGML_OP_POOL_AVG: *out += srow[j]; break;
12689
11859
  case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
@@ -12700,31 +11870,8 @@ static void ggml_compute_forward_pool_2d_sk_p0(
12700
11870
  }
12701
11871
 
12702
11872
  cdata += src->nb[2];
12703
- dplane += pa;
12704
- }
12705
- }
12706
-
12707
- // ggml_compute_forward_pool_2d
12708
-
12709
- static void ggml_compute_forward_pool_2d(
12710
- const struct ggml_compute_params * params,
12711
- const struct ggml_tensor * src0,
12712
- struct ggml_tensor * dst) {
12713
-
12714
- const int32_t * opts = (const int32_t *)dst->op_params;
12715
- enum ggml_op_pool op = opts[0];
12716
- const int k0 = opts[1];
12717
- const int k1 = opts[2];
12718
- const int s0 = opts[3];
12719
- const int s1 = opts[4];
12720
- const int p0 = opts[5];
12721
- const int p1 = opts[6];
12722
- GGML_ASSERT(p0 == 0);
12723
- GGML_ASSERT(p1 == 0); // padding not supported
12724
- GGML_ASSERT(k0 == s0);
12725
- GGML_ASSERT(k1 == s1); // only s = k supported
12726
-
12727
- ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
11873
+ dplane += pa;
11874
+ }
12728
11875
  }
12729
11876
 
12730
11877
  // ggml_compute_forward_upscale
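
With the hunks above, ggml_compute_forward_pool_2d reads the operator, kernel, stride and padding directly from dst->op_params and simply skips taps that fall outside the input, so the old k == s / p == 0 restrictions (and the _sk_p0 wrapper) are gone. A hedged 1-D illustration of the same loop shape, not the ggml API:

    // Window start is offset by -p; out-of-range taps are skipped. For AVG,
    // ggml keeps dividing by the full kernel size (ka = k0*k1 above), so the
    // implicit zero padding is included in the average.
    static void avg_pool_1d_sketch(const float * src, int IW,
                                   float * dst, int OW,
                                   int k, int s, int p) {
        for (int o = 0; o < OW; o++) {
            const int start = o*s - p;
            float sum = 0.0f;
            for (int j = 0; j < k; j++) {
                const int i = start + j;
                if (i < 0 || i >= IW) {
                    continue; // padding tap contributes nothing
                }
                sum += src[i];
            }
            dst[o] = sum / k;
        }
    }

For example, with IW = 5, k = 3, s = 2 and p = 1 the output has OW = (5 + 2*1 - 3)/2 + 1 = 3 positions, and the first window covers indices -1..1 with the -1 tap skipped.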
@@ -13928,6 +13075,10 @@ static void ggml_compute_forward_unary(
13928
13075
  {
13929
13076
  ggml_compute_forward_silu(params, src0, dst);
13930
13077
  } break;
13078
+ case GGML_UNARY_OP_LEAKY:
13079
+ {
13080
+ ggml_compute_forward_leaky(params, src0, dst);
13081
+ } break;
13931
13082
  default:
13932
13083
  {
13933
13084
  GGML_ASSERT(false);
@@ -14681,33 +13832,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14681
13832
  {
14682
13833
  ggml_compute_forward_clamp(params, tensor->src[0], tensor);
14683
13834
  } break;
14684
- case GGML_OP_CONV_1D:
14685
- {
14686
- ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
14687
- } break;
14688
- case GGML_OP_CONV_1D_STAGE_0:
14689
- {
14690
- ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14691
- } break;
14692
- case GGML_OP_CONV_1D_STAGE_1:
14693
- {
14694
- ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
14695
- } break;
14696
13835
  case GGML_OP_CONV_TRANSPOSE_1D:
14697
13836
  {
14698
13837
  ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
14699
13838
  } break;
14700
- case GGML_OP_CONV_2D:
14701
- {
14702
- ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
14703
- } break;
14704
- case GGML_OP_CONV_2D_STAGE_0:
13839
+ case GGML_OP_IM2COL:
14705
13840
  {
14706
- ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
14707
- } break;
14708
- case GGML_OP_CONV_2D_STAGE_1:
14709
- {
14710
- ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
13841
+ ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
14711
13842
  } break;
14712
13843
  case GGML_OP_CONV_TRANSPOSE_2D:
14713
13844
  {
@@ -14836,62 +13967,109 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14836
13967
 
14837
13968
  ////////////////////////////////////////////////////////////////////////////////
14838
13969
 
14839
- static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
13970
+ static size_t ggml_hash_size(size_t min_sz) {
13971
+ // next primes after powers of two
13972
+ static const size_t primes[] = {
13973
+ 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
13974
+ 2053, 4099, 8209, 16411, 32771, 65537, 131101,
13975
+ 262147, 524309, 1048583, 2097169, 4194319, 8388617,
13976
+ 16777259, 33554467, 67108879, 134217757, 268435459,
13977
+ 536870923, 1073741827, 2147483659
13978
+ };
13979
+ static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
13980
+
13981
+ // find the smallest prime that is larger or equal to min_sz
13982
+ size_t l = 0;
13983
+ size_t r = n_primes;
13984
+ while (l < r) {
13985
+ size_t m = (l + r)/2;
13986
+ if (primes[m] < min_sz) {
13987
+ l = m + 1;
13988
+ } else {
13989
+ r = m;
13990
+ }
13991
+ }
13992
+ size_t sz = l < n_primes ? primes[l] : min_sz | 1;
13993
+ return sz;
13994
+ }
14840
13995
 
14841
- static size_t hash(void * p) {
14842
- return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
13996
+ static size_t ggml_hash(const void * p) {
13997
+ return (size_t)p;
14843
13998
  }
14844
13999
 
14845
- static size_t hash_find(void * hash_table[], void * p) {
14846
- size_t h = hash(p);
14000
+ size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14001
+ size_t h = ggml_hash(key) % hash_set.size;
14847
14002
 
14848
14003
  // linear probing
14849
14004
  size_t i = h;
14850
- while (hash_table[i] != NULL && hash_table[i] != p) {
14851
- i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
14005
+ while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
14006
+ i = (i + 1) % hash_set.size;
14852
14007
  if (i == h) {
14853
14008
  // visited all hash table entries -> not found
14854
- return GGML_GRAPH_HASHTABLE_SIZE;
14009
+ return GGML_HASHTABLE_FULL;
14855
14010
  }
14856
14011
  }
14857
14012
  return i;
14858
14013
  }
14859
14014
 
14860
- static bool hash_insert(void * hash_table[], void * p) {
14861
- size_t i = hash_find(hash_table, p);
14015
+ bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14016
+ size_t i = ggml_hash_find(hash_set, key);
14017
+ return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
14018
+ }
14019
+
14020
+ size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14021
+ size_t i = ggml_hash_find(hash_set, key);
14862
14022
 
14863
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14023
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14864
14024
 
14865
- if (hash_table[i] == p) {
14866
- return true;
14025
+ if (hash_set.keys[i] == key) {
14026
+ return GGML_HASHTABLE_ALREADY_EXISTS;
14867
14027
  }
14868
14028
 
14869
14029
  // insert
14870
- GGML_ASSERT(hash_table[i] == NULL);
14871
- hash_table[i] = p;
14872
- return false;
14030
+ GGML_ASSERT(hash_set.keys[i] == NULL);
14031
+ hash_set.keys[i] = key;
14032
+ return i;
14033
+ }
14034
+
14035
+ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
14036
+ size_t i = ggml_hash_find(hash_set, key);
14037
+
14038
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL);
14039
+
14040
+ hash_set.keys[i] = key;
14041
+ return i;
14042
+ }
14043
+
14044
+ static struct ggml_hash_set ggml_hash_set_new(size_t size) {
14045
+ size = ggml_hash_size(size);
14046
+ struct ggml_hash_set result;
14047
+ result.size = size;
14048
+ result.keys = malloc(sizeof(struct ggml_tensor *) * size);
14049
+ memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
14050
+ return result;
14873
14051
  }
14874
14052
 
14875
- static bool hash_contains(void * hash_table[], void * p) {
14876
- size_t i = hash_find(hash_table, p);
14877
- return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
14053
+ static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
14054
+ free(hash_set.keys);
14878
14055
  }
14879
14056
 
14880
14057
  struct hash_map {
14881
- void * keys[GGML_GRAPH_HASHTABLE_SIZE];
14882
- void * vals[GGML_GRAPH_HASHTABLE_SIZE];
14058
+ struct ggml_hash_set set;
14059
+ struct ggml_tensor ** vals;
14883
14060
  };
14884
14061
 
14885
- static struct hash_map * new_hash_map(void) {
14062
+ static struct hash_map * ggml_new_hash_map(size_t size) {
14886
14063
  struct hash_map * result = malloc(sizeof(struct hash_map));
14887
- for (int i=0; i<GGML_GRAPH_HASHTABLE_SIZE; ++i) {
14888
- result->keys[i] = NULL;
14889
- result->vals[i] = NULL;
14890
- }
14064
+ result->set = ggml_hash_set_new(size);
14065
+ result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
14066
+ memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
14891
14067
  return result;
14892
14068
  }
14893
14069
 
14894
- static void free_hash_map(struct hash_map * map) {
14070
+ static void ggml_hash_map_free(struct hash_map * map) {
14071
+ ggml_hash_set_free(map->set);
14072
+ free(map->vals);
14895
14073
  free(map);
14896
14074
  }
14897
14075
 
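
The fixed GGML_GRAPH_HASHTABLE_SIZE table is replaced above by ggml_hash_set, an open-addressing set whose capacity is rounded up to a prime by ggml_hash_size (a request for 2*2048 = 4096 slots yields 4099, the next entry in the prime table). A hedged usage sketch, assuming the GGML_HASHTABLE_FULL / GGML_HASHTABLE_ALREADY_EXISTS sentinels are defined next to the struct declaration and that `tensors`/`n` are the caller's:

    struct ggml_hash_set set = ggml_hash_set_new(2*n); // size rounded up to a prime
    for (int i = 0; i < n; ++i) {
        size_t slot = ggml_hash_insert(set, tensors[i]);
        if (slot == GGML_HASHTABLE_ALREADY_EXISTS) {
            // duplicate key: ggml_hash_find(set, tensors[i]) returns its existing slot
        }
    }
    if (ggml_hash_contains(set, tensors[0])) {
        // linear probing located the key before reaching an empty slot
    }
    ggml_hash_set_free(set);

Because probing wraps around modulo set.size, a completely full table is reported as GGML_HASHTABLE_FULL by ggml_hash_find instead of looping forever.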
@@ -14911,7 +14089,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14911
14089
  return node;
14912
14090
  }
14913
14091
 
14914
- if (!hash_contains(graph->visited_hash_table, node)) {
14092
+ if (!ggml_hash_contains(graph->visited_hash_table, node)) {
14915
14093
  return node;
14916
14094
  }
14917
14095
 
@@ -14926,17 +14104,17 @@ static struct ggml_tensor * ggml_recompute_graph_node(
14926
14104
  return node;
14927
14105
  }
14928
14106
 
14929
- size_t i = hash_find(replacements->keys, node);
14930
- GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14931
- if (replacements->keys[i] == node) {
14932
- return (struct ggml_tensor *) replacements->vals[i];
14107
+ size_t i = ggml_hash_find(replacements->set, node);
14108
+ GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
14109
+ if (replacements->set.keys[i] == node) {
14110
+ return replacements->vals[i];
14933
14111
  }
14934
14112
 
14935
14113
  struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
14936
14114
 
14937
14115
  // insert clone into replacements
14938
- GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite
14939
- replacements->keys[i] = node;
14116
+ GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
14117
+ replacements->set.keys[i] = node;
14940
14118
  replacements->vals[i] = clone;
14941
14119
 
14942
14120
  clone->op = node->op;
@@ -14973,26 +14151,26 @@ void ggml_build_backward_gradient_checkpointing(
14973
14151
  struct ggml_cgraph * gb_tmp,
14974
14152
  struct ggml_tensor * * checkpoints,
14975
14153
  int n_checkpoints) {
14976
- *gb_tmp = *gf;
14154
+ ggml_graph_cpy(gf, gb_tmp);
14977
14155
  ggml_build_backward_expand(ctx, gf, gb_tmp, true);
14978
14156
 
14979
14157
  if (n_checkpoints <= 0) {
14980
- *gb = *gb_tmp;
14158
+ ggml_graph_cpy(gb_tmp, gb);
14981
14159
  return;
14982
14160
  }
14983
14161
 
14984
- struct hash_map * replacements = new_hash_map();
14162
+ struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
14985
14163
 
14986
14164
  // insert checkpoints in replacements
14987
14165
  for (int i = 0; i < n_checkpoints; ++i) {
14988
- size_t k = hash_find(replacements->keys, checkpoints[i]);
14989
- GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full
14990
- GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite
14991
- replacements->keys[k] = checkpoints[i];
14992
- replacements->vals[k] = checkpoints[i];
14166
+ size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
14167
+ GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
14168
+ GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
14169
+ replacements->set.keys[k] = checkpoints[i];
14170
+ replacements->vals[k] = checkpoints[i];
14993
14171
  }
14994
14172
 
14995
- *gb = *gf;
14173
+ ggml_graph_cpy(gf, gb);
14996
14174
  // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
14997
14175
  // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
14998
14176
  // by recomputing them from checkpoints
@@ -15009,21 +14187,21 @@ void ggml_build_backward_gradient_checkpointing(
15009
14187
  ggml_build_forward_expand(gb, node);
15010
14188
  }
15011
14189
 
15012
- free_hash_map(replacements);
14190
+ ggml_hash_map_free(replacements);
15013
14191
  }
15014
14192
 
15015
14193
  // functions to change gradients considering the case that input a might be initial gradient with zero value
15016
14194
 
15017
- static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
15018
- if (hash_contains(zero_table, a)) {
14195
+ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14196
+ if (ggml_hash_contains(zero_table, a)) {
15019
14197
  return b;
15020
14198
  } else {
15021
14199
  return ggml_add_impl(ctx, a, b, false);
15022
14200
  }
15023
14201
  }
15024
14202
 
15025
- static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, void * zero_table[]) {
15026
- if (hash_contains(zero_table, a)) {
14203
+ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
14204
+ if (ggml_hash_contains(zero_table, a)) {
15027
14205
  struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
15028
14206
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
15029
14207
  } else {
@@ -15031,23 +14209,23 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
15031
14209
  }
15032
14210
  }
15033
14211
 
15034
- static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
15035
- if (hash_contains(zero_table, a)) {
14212
+ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14213
+ if (ggml_hash_contains(zero_table, a)) {
15036
14214
  return ggml_repeat(ctx, b, a);
15037
14215
  } else {
15038
14216
  return ggml_add1_impl(ctx, a, b, false);
15039
14217
  }
15040
14218
  }
15041
14219
 
15042
- static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, void * zero_table[]) {
15043
- if (hash_contains(zero_table, a)) {
14220
+ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
14221
+ if (ggml_hash_contains(zero_table, a)) {
15044
14222
  return ggml_neg(ctx, b);
15045
14223
  } else {
15046
14224
  return ggml_sub_impl(ctx, a, b, false);
15047
14225
  }
15048
14226
  }
15049
14227
 
15050
- static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, void * zero_table[]) {
14228
+ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
15051
14229
  struct ggml_tensor * src0 = tensor->src[0];
15052
14230
  struct ggml_tensor * src1 = tensor->src[1];
15053
14231
 
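
The *_or_set helpers above now take the ggml_hash_set zero_table instead of a raw pointer array. The table records gradient tensors that are still identically zero, so accumulating into them collapses to a plain assignment instead of emitting a useless add node. Hedged illustration (`delta` is a hypothetical incoming gradient term):

    struct ggml_tensor * g = src0->grad;                   // may still be a zero placeholder
    src0->grad = ggml_add_or_set(ctx, g, delta, zero_table);
    // == delta                                  when ggml_hash_contains(zero_table, g)
    // == ggml_add_impl(ctx, g, delta, false)    otherwise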
@@ -15559,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15559
14737
  // necessary for llama
15560
14738
  if (src0->grad) {
15561
14739
  //const int n_past = ((int32_t *) tensor->op_params)[0];
15562
- const int n_dims = ((int32_t *) tensor->op_params)[1];
15563
- const int mode = ((int32_t *) tensor->op_params)[2];
15564
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
15565
- float freq_base;
15566
- float freq_scale;
15567
- float xpos_base;
15568
- bool xpos_down;
15569
- memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
15570
- memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
15571
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
15572
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
14740
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
14741
+ const int mode = ((int32_t *) tensor->op_params)[2];
14742
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
14743
+ const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
14744
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
14745
+
14746
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
14747
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
14748
+ memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
14749
+ memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
14750
+ memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
14751
+ memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
14752
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
14753
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
15573
14754
 
15574
14755
  src0->grad = ggml_add_or_set(ctx,
15575
14756
  src0->grad,
@@ -15579,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15579
14760
  n_dims,
15580
14761
  mode,
15581
14762
  n_ctx,
14763
+ n_orig_ctx,
15582
14764
  freq_base,
15583
14765
  freq_scale,
14766
+ ext_factor,
14767
+ attn_factor,
14768
+ beta_fast,
14769
+ beta_slow,
15584
14770
  xpos_base,
15585
14771
  xpos_down),
15586
14772
  zero_table);
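
The rope backward pass above now recovers the full parameter set, including the context-extension fields. The layout implied by the reads and memcpy offsets is one 32-bit op_params slot per field, with the floats bit-copied into their slots:

    // op_params slot -> field (as read above)
    //  [0] n_past (unused here)  [1] n_dims       [2] mode         [3] n_ctx
    //  [4] n_orig_ctx            [5] freq_base    [6] freq_scale   [7] ext_factor
    //  [8] attn_factor           [9] beta_fast    [10] beta_slow   [11] xpos_base
    //  [12] xpos_down (a bool copied into the low bytes of the slot)

The forward ggml_rope_* call that fills these slots is outside this excerpt, so treat the table as a reading of the memcpys above rather than an API guarantee.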
@@ -15590,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15590
14776
  {
15591
14777
  if (src0->grad) {
15592
14778
  //const int n_past = ((int32_t *) tensor->op_params)[0];
15593
- const int n_dims = ((int32_t *) tensor->op_params)[1];
15594
- const int mode = ((int32_t *) tensor->op_params)[2];
15595
- const int n_ctx = ((int32_t *) tensor->op_params)[3];
15596
- float freq_base;
15597
- float freq_scale;
15598
- float xpos_base;
15599
- bool xpos_down;
15600
- memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
15601
- memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
15602
- memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
15603
- memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
14779
+ const int n_dims = ((int32_t *) tensor->op_params)[1];
14780
+ const int mode = ((int32_t *) tensor->op_params)[2];
14781
+ const int n_ctx = ((int32_t *) tensor->op_params)[3];
14782
+ const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
14783
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
14784
+
14785
+ memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
14786
+ memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
14787
+ memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
14788
+ memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
14789
+ memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
14790
+ memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
14791
+ memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
14792
+ memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
15604
14793
 
15605
14794
  src0->grad = ggml_add_or_set(ctx,
15606
14795
  src0->grad,
@@ -15609,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15609
14798
  src1,
15610
14799
  n_dims,
15611
14800
  mode,
15612
- 0,
15613
14801
  n_ctx,
14802
+ n_orig_ctx,
15614
14803
  freq_base,
15615
14804
  freq_scale,
15616
- 0.0f,
15617
- 1.0f,
15618
- 0.0f,
15619
- 0.0f,
14805
+ ext_factor,
14806
+ attn_factor,
14807
+ beta_fast,
14808
+ beta_slow,
15620
14809
  xpos_base,
15621
14810
  xpos_down,
15622
14811
  false),
@@ -15631,31 +14820,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15631
14820
  {
15632
14821
  GGML_ASSERT(false); // TODO: not implemented
15633
14822
  } break;
15634
- case GGML_OP_CONV_1D:
15635
- {
15636
- GGML_ASSERT(false); // TODO: not implemented
15637
- } break;
15638
- case GGML_OP_CONV_1D_STAGE_0:
15639
- {
15640
- GGML_ASSERT(false); // TODO: not implemented
15641
- } break;
15642
- case GGML_OP_CONV_1D_STAGE_1:
15643
- {
15644
- GGML_ASSERT(false); // TODO: not implemented
15645
- } break;
15646
14823
  case GGML_OP_CONV_TRANSPOSE_1D:
15647
14824
  {
15648
14825
  GGML_ASSERT(false); // TODO: not implemented
15649
14826
  } break;
15650
- case GGML_OP_CONV_2D:
15651
- {
15652
- GGML_ASSERT(false); // TODO: not implemented
15653
- } break;
15654
- case GGML_OP_CONV_2D_STAGE_0:
15655
- {
15656
- GGML_ASSERT(false); // TODO: not implemented
15657
- } break;
15658
- case GGML_OP_CONV_2D_STAGE_1:
14827
+ case GGML_OP_IM2COL:
15659
14828
  {
15660
14829
  GGML_ASSERT(false); // TODO: not implemented
15661
14830
  } break;
@@ -15869,7 +15038,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15869
15038
  }
15870
15039
 
15871
15040
  // check if already visited
15872
- if (hash_insert(cgraph->visited_hash_table, node)) {
15041
+ if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
15873
15042
  return;
15874
15043
  }
15875
15044
 
@@ -15885,7 +15054,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15885
15054
 
15886
15055
  if (node->op == GGML_OP_NONE && node->grad == NULL) {
15887
15056
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
15888
- GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15057
+ GGML_ASSERT(cgraph->n_leafs < cgraph->size);
15889
15058
 
15890
15059
  if (strlen(node->name) == 0) {
15891
15060
  ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
@@ -15894,22 +15063,24 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15894
15063
  cgraph->leafs[cgraph->n_leafs] = node;
15895
15064
  cgraph->n_leafs++;
15896
15065
  } else {
15897
- GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15066
+ GGML_ASSERT(cgraph->n_nodes < cgraph->size);
15898
15067
 
15899
15068
  if (strlen(node->name) == 0) {
15900
15069
  ggml_format_name(node, "node_%d", cgraph->n_nodes);
15901
15070
  }
15902
15071
 
15903
15072
  cgraph->nodes[cgraph->n_nodes] = node;
15904
- cgraph->grads[cgraph->n_nodes] = node->grad;
15073
+ if (cgraph->grads) {
15074
+ cgraph->grads[cgraph->n_nodes] = node->grad;
15075
+ }
15905
15076
  cgraph->n_nodes++;
15906
15077
  }
15907
15078
  }
15908
15079
 
15909
15080
  static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
15910
15081
  if (!expand) {
15911
- cgraph->n_nodes = 0;
15912
- cgraph->n_leafs = 0;
15082
+ // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
15083
+ ggml_graph_clear(cgraph);
15913
15084
  }
15914
15085
 
15915
15086
  const int n0 = cgraph->n_nodes;
@@ -15930,25 +15101,6 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
15930
15101
  ggml_build_forward_impl(cgraph, tensor, true);
15931
15102
  }
15932
15103
 
15933
- struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
15934
- struct ggml_cgraph result = {
15935
- /*.n_nodes =*/ 0,
15936
- /*.n_leafs =*/ 0,
15937
- /*.nodes =*/ { NULL },
15938
- /*.grads =*/ { NULL },
15939
- /*.leafs =*/ { NULL },
15940
- /*.hash_table =*/ { NULL },
15941
- /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
15942
- /*.perf_runs =*/ 0,
15943
- /*.perf_cycles =*/ 0,
15944
- /*.perf_time_us =*/ 0,
15945
- };
15946
-
15947
- ggml_build_forward_impl(&result, tensor, false);
15948
-
15949
- return result;
15950
- }
15951
-
15952
15104
  void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
15953
15105
  GGML_ASSERT(gf->n_nodes > 0);
15954
15106
 
@@ -15965,11 +15117,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15965
15117
  }
15966
15118
 
15967
15119
  // remember original gradients which start with zero values
15968
- void ** zero_table = malloc(sizeof(void *) * GGML_GRAPH_HASHTABLE_SIZE);
15969
- memset(zero_table, 0, sizeof(void*) * GGML_GRAPH_HASHTABLE_SIZE);
15120
+ struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
15970
15121
  for (int i = 0; i < gf->n_nodes; i++) {
15971
15122
  if (gf->grads[i]) {
15972
- hash_insert(zero_table, gf->grads[i]);
15123
+ ggml_hash_insert(zero_table, gf->grads[i]);
15973
15124
  }
15974
15125
  }
15975
15126
 
@@ -15992,26 +15143,54 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
15992
15143
  }
15993
15144
  }
15994
15145
 
15995
- free(zero_table);
15146
+ ggml_hash_set_free(zero_table);
15996
15147
  }
15997
15148
 
15998
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
15999
- struct ggml_cgraph result = *gf;
16000
- ggml_build_backward_expand(ctx, gf, &result, keep);
16001
- return result;
15149
+ static size_t ggml_graph_nbytes(size_t size, bool grads) {
15150
+ size_t nbytes = sizeof(struct ggml_cgraph);
15151
+ nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
15152
+ if (grads) {
15153
+ nbytes += size * sizeof(struct ggml_tensor *); // grads
15154
+ }
15155
+ nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
15156
+ return nbytes;
16002
15157
  }
16003
15158
 
16004
- struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
16005
- struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
15159
+ size_t ggml_graph_overhead_custom(size_t size, bool grads) {
15160
+ return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
15161
+ }
15162
+
15163
+ size_t ggml_graph_overhead(void) {
15164
+ return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
15165
+ }
15166
+
15167
+ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
15168
+ const size_t obj_size = ggml_graph_nbytes(size, grads);
15169
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
16006
15170
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
16007
15171
 
15172
+ struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
15173
+
15174
+ size_t hash_size = ggml_hash_size(size * 2);
15175
+ struct ggml_tensor ** nodes_ptr = data_start;
15176
+ struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
15177
+ struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
15178
+ struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
15179
+
15180
+ // check that we allocated the correct amount of memory
15181
+ assert(obj_size == (size_t) (
15182
+ (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
15183
+
15184
+ memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
15185
+
16008
15186
  *cgraph = (struct ggml_cgraph) {
15187
+ /*.size =*/ size,
16009
15188
  /*.n_nodes =*/ 0,
16010
15189
  /*.n_leafs =*/ 0,
16011
- /*.nodes =*/ { NULL },
16012
- /*.grads =*/ { NULL },
16013
- /*.leafs =*/ { NULL },
16014
- /*.hash_table =*/ { NULL },
15190
+ /*.nodes =*/ nodes_ptr,
15191
+ /*.grads =*/ grads_ptr,
15192
+ /*.leafs =*/ leafs_ptr,
15193
+ /*.hash_table =*/ { hash_size, hash_keys_ptr },
16015
15194
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
16016
15195
  /*.perf_runs =*/ 0,
16017
15196
  /*.perf_cycles =*/ 0,
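
ggml_new_graph_custom above carves the node, leaf and visited-hash arrays (plus grads when requested) out of a single GGML_OBJECT_GRAPH allocation whose size comes from ggml_graph_nbytes. A hedged back-of-the-envelope, assuming 64-bit pointers and GGML_DEFAULT_GRAPH_SIZE == 2048:

    // size = 2048, grads = true (illustration only)
    //   nodes : 2048 * 8 = 16384 bytes
    //   leafs : 2048 * 8 = 16384 bytes
    //   grads : 2048 * 8 = 16384 bytes
    //   hash  : ggml_hash_size(2*2048) = 4099 slots * 8 = 32792 bytes
    size_t bytes = sizeof(struct ggml_cgraph)
                 + 3*2048*sizeof(struct ggml_tensor *)
                 + 4099*sizeof(struct ggml_tensor *);
    // ggml_graph_overhead_custom(2048, true) pads this to GGML_MEM_ALIGN and
    // adds GGML_OBJECT_SIZE for the wrapping ggml_object.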
@@ -16021,14 +15200,85 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
16021
15200
  return cgraph;
16022
15201
  }
16023
15202
 
16024
- struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
16025
- struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
16026
- ggml_build_forward_impl(cgraph, tensor, false);
15203
+ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
15204
+ return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
15205
+ }
15206
+
15207
+ struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
15208
+ const size_t obj_size = sizeof(struct ggml_cgraph);
15209
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
15210
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
15211
+
15212
+ *cgraph = (struct ggml_cgraph) {
15213
+ /*.size =*/ 0,
15214
+ /*.n_nodes =*/ i1 - i0,
15215
+ /*.n_leafs =*/ 0,
15216
+ /*.nodes =*/ cgraph0->nodes + i0,
15217
+ /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
15218
+ /*.leafs =*/ NULL,
15219
+ /*.hash_table =*/ { 0, NULL },
15220
+ /*.order =*/ cgraph0->order,
15221
+ /*.perf_runs =*/ 0,
15222
+ /*.perf_cycles =*/ 0,
15223
+ /*.perf_time_us =*/ 0,
15224
+ };
15225
+
16027
15226
  return cgraph;
16028
15227
  }
16029
15228
 
16030
- size_t ggml_graph_overhead(void) {
16031
- return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
15229
+ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
15230
+ GGML_ASSERT(dst->size >= src->n_leafs);
15231
+ GGML_ASSERT(dst->size >= src->n_nodes);
15232
+ GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
15233
+
15234
+ dst->n_leafs = src->n_leafs;
15235
+ dst->n_nodes = src->n_nodes;
15236
+ dst->order = src->order;
15237
+
15238
+ for (int i = 0; i < src->n_leafs; ++i) {
15239
+ dst->leafs[i] = src->leafs[i];
15240
+ }
15241
+
15242
+ for (int i = 0; i < src->n_nodes; ++i) {
15243
+ dst->nodes[i] = src->nodes[i];
15244
+ }
15245
+
15246
+ if (src->grads) {
15247
+ GGML_ASSERT(dst->grads != NULL);
15248
+ for (int i = 0; i < src->n_nodes; ++i) {
15249
+ dst->grads[i] = src->grads[i];
15250
+ }
15251
+ }
15252
+
15253
+ for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
15254
+ if (src->visited_hash_table.keys[i]) {
15255
+ ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
15256
+ }
15257
+ }
15258
+ }
15259
+
15260
+ struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
15261
+ struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
15262
+ ggml_graph_cpy(cgraph, result);
15263
+ return result;
15264
+ }
15265
+
15266
+ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
15267
+ GGML_ASSERT(cgraph->grads != NULL);
15268
+
15269
+ for (int i = 0; i < cgraph->n_nodes; i++) {
15270
+ struct ggml_tensor * grad = cgraph->grads[i];
15271
+
15272
+ if (grad) {
15273
+ ggml_set_zero(grad);
15274
+ }
15275
+ }
15276
+ }
15277
+
15278
+ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
15279
+ cgraph->n_leafs = 0;
15280
+ cgraph->n_nodes = 0;
15281
+ memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
16032
15282
  }
16033
15283
 
16034
15284
  //
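
With ggml_graph_view, ggml_graph_cpy, ggml_graph_dup, ggml_graph_reset and ggml_graph_clear added above, graphs become ordinary context objects. A hedged usage sketch of the new entry points; `out` stands for a tensor already built in the same context:

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 4096, /*grads =*/ false);
    ggml_build_forward_expand(gf, out);               // fills nodes/leafs via ggml_visit_parents

    struct ggml_cgraph * gview = ggml_graph_view(ctx, gf, 0, gf->n_nodes/2); // first half, no hash set
    struct ggml_cgraph * gcopy = ggml_graph_dup(ctx, gf);                    // nodes, leafs, grads, hash copied

    ggml_graph_clear(gf);                             // n_nodes = n_leafs = 0, visited hash zeroed

A view has size 0 and no hash table, so it is meant for executing an existing range of nodes, not for building new ones onto it.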
@@ -16140,45 +15390,266 @@ static void clear_numa_thread_affinity(void) {
16140
15390
  strerror(rv));
16141
15391
  }
16142
15392
 
16143
- CPU_FREE(cpus);
16144
- }
16145
- #else
16146
- // TODO: Windows etc.
16147
- // (the linux implementation may also work on BSD, someone should test)
16148
- static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16149
- static void clear_numa_thread_affinity(void) {}
16150
- #endif
16151
-
16152
- struct ggml_compute_state_shared {
16153
- const struct ggml_cgraph * cgraph;
16154
- const struct ggml_cplan * cplan;
16155
-
16156
- int64_t perf_node_start_cycles;
16157
- int64_t perf_node_start_time_us;
16158
-
16159
- const int n_threads;
16160
-
16161
- // synchronization primitives
16162
- atomic_int n_active; // num active threads
16163
- atomic_int node_n; // active graph node
16164
-
16165
- bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
16166
- void * abort_callback_data;
16167
- };
16168
-
16169
- struct ggml_compute_state {
16170
- ggml_thread_t thrd;
16171
- int ith;
16172
- struct ggml_compute_state_shared * shared;
16173
- };
16174
-
16175
- static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16176
- int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16177
- int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15393
+ CPU_FREE(cpus);
15394
+ }
15395
+ #else
15396
+ // TODO: Windows etc.
15397
+ // (the linux implementation may also work on BSD, someone should test)
15398
+ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
15399
+ static void clear_numa_thread_affinity(void) {}
15400
+ #endif
15401
+
15402
+ struct ggml_compute_state_shared {
15403
+ const struct ggml_cgraph * cgraph;
15404
+ const struct ggml_cplan * cplan;
15405
+
15406
+ int64_t perf_node_start_cycles;
15407
+ int64_t perf_node_start_time_us;
15408
+
15409
+ const int n_threads;
15410
+
15411
+ // synchronization primitives
15412
+ atomic_int n_active; // num active threads
15413
+ atomic_int node_n; // active graph node
15414
+
15415
+ bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
15416
+ void * abort_callback_data;
15417
+ };
15418
+
15419
+ struct ggml_compute_state {
15420
+ ggml_thread_t thrd;
15421
+ int ith;
15422
+ struct ggml_compute_state_shared * shared;
15423
+ };
15424
+
15425
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
15426
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
15427
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
15428
+
15429
+ node->perf_runs++;
15430
+ node->perf_cycles += cycles_cur;
15431
+ node->perf_time_us += time_us_cur;
15432
+ }
15433
+
15434
+ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
15435
+ int n_tasks = 0;
15436
+
15437
+ switch (node->op) {
15438
+ case GGML_OP_CPY:
15439
+ case GGML_OP_DUP:
15440
+ case GGML_OP_ADD:
15441
+ case GGML_OP_ADD1:
15442
+ case GGML_OP_ACC:
15443
+ {
15444
+ n_tasks = n_threads;
15445
+ } break;
15446
+ case GGML_OP_SUB:
15447
+ case GGML_OP_DIV:
15448
+ case GGML_OP_SQR:
15449
+ case GGML_OP_SQRT:
15450
+ case GGML_OP_LOG:
15451
+ case GGML_OP_SUM:
15452
+ case GGML_OP_SUM_ROWS:
15453
+ case GGML_OP_MEAN:
15454
+ case GGML_OP_ARGMAX:
15455
+ case GGML_OP_REPEAT:
15456
+ case GGML_OP_REPEAT_BACK:
15457
+ {
15458
+ n_tasks = 1;
15459
+ } break;
15460
+ case GGML_OP_UNARY:
15461
+ switch (ggml_get_unary_op(node)) {
15462
+ case GGML_UNARY_OP_ABS:
15463
+ case GGML_UNARY_OP_SGN:
15464
+ case GGML_UNARY_OP_NEG:
15465
+ case GGML_UNARY_OP_STEP:
15466
+ case GGML_UNARY_OP_TANH:
15467
+ case GGML_UNARY_OP_ELU:
15468
+ case GGML_UNARY_OP_RELU:
15469
+ case GGML_UNARY_OP_LEAKY:
15470
+ {
15471
+ n_tasks = 1;
15472
+ } break;
15473
+
15474
+ case GGML_UNARY_OP_GELU:
15475
+ case GGML_UNARY_OP_GELU_QUICK:
15476
+ case GGML_UNARY_OP_SILU:
15477
+ {
15478
+ n_tasks = n_threads;
15479
+ } break;
15480
+ }
15481
+ break;
15482
+ case GGML_OP_SILU_BACK:
15483
+ case GGML_OP_MUL:
15484
+ case GGML_OP_NORM:
15485
+ case GGML_OP_RMS_NORM:
15486
+ case GGML_OP_RMS_NORM_BACK:
15487
+ case GGML_OP_GROUP_NORM:
15488
+ case GGML_OP_CONCAT:
15489
+ {
15490
+ n_tasks = n_threads;
15491
+ } break;
15492
+ case GGML_OP_MUL_MAT:
15493
+ {
15494
+ n_tasks = n_threads;
15495
+
15496
+ // TODO: use different scheduling for different matrix sizes
15497
+ //const int nr0 = ggml_nrows(node->src[0]);
15498
+ //const int nr1 = ggml_nrows(node->src[1]);
15499
+
15500
+ //n_tasks = MIN(n_threads, MAX(1, nr0/128));
15501
+ //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
15502
+
15503
+ #if defined(GGML_USE_CUBLAS)
15504
+ if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
15505
+ n_tasks = 1; // TODO: this actually is doing nothing
15506
+ // the threads are still spinning
15507
+ }
15508
+ #elif defined(GGML_USE_CLBLAST)
15509
+ if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
15510
+ n_tasks = 1; // TODO: this actually is doing nothing
15511
+ // the threads are still spinning
15512
+ }
15513
+ #endif
15514
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
15515
+ if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
15516
+ n_tasks = 1; // TODO: this actually is doing nothing
15517
+ // the threads are still spinning
15518
+ }
15519
+ #endif
15520
+ } break;
15521
+ case GGML_OP_OUT_PROD:
15522
+ {
15523
+ n_tasks = n_threads;
15524
+ } break;
15525
+ case GGML_OP_SCALE:
15526
+ case GGML_OP_SET:
15527
+ case GGML_OP_CONT:
15528
+ case GGML_OP_RESHAPE:
15529
+ case GGML_OP_VIEW:
15530
+ case GGML_OP_PERMUTE:
15531
+ case GGML_OP_TRANSPOSE:
15532
+ case GGML_OP_GET_ROWS:
15533
+ case GGML_OP_GET_ROWS_BACK:
15534
+ case GGML_OP_DIAG:
15535
+ {
15536
+ n_tasks = 1;
15537
+ } break;
15538
+ case GGML_OP_DIAG_MASK_ZERO:
15539
+ case GGML_OP_DIAG_MASK_INF:
15540
+ case GGML_OP_SOFT_MAX:
15541
+ case GGML_OP_SOFT_MAX_BACK:
15542
+ case GGML_OP_ROPE:
15543
+ case GGML_OP_ROPE_BACK:
15544
+ case GGML_OP_ADD_REL_POS:
15545
+ {
15546
+ n_tasks = n_threads;
15547
+ } break;
15548
+ case GGML_OP_ALIBI:
15549
+ {
15550
+ n_tasks = 1; //TODO
15551
+ } break;
15552
+ case GGML_OP_CLAMP:
15553
+ {
15554
+ n_tasks = 1; //TODO
15555
+ } break;
15556
+ case GGML_OP_CONV_TRANSPOSE_1D:
15557
+ {
15558
+ n_tasks = n_threads;
15559
+ } break;
15560
+ case GGML_OP_IM2COL:
15561
+ {
15562
+ n_tasks = n_threads;
15563
+ } break;
15564
+ case GGML_OP_CONV_TRANSPOSE_2D:
15565
+ {
15566
+ n_tasks = n_threads;
15567
+ } break;
15568
+ case GGML_OP_POOL_1D:
15569
+ case GGML_OP_POOL_2D:
15570
+ {
15571
+ n_tasks = 1;
15572
+ } break;
15573
+ case GGML_OP_UPSCALE:
15574
+ {
15575
+ n_tasks = n_threads;
15576
+ } break;
15577
+ case GGML_OP_FLASH_ATTN:
15578
+ {
15579
+ n_tasks = n_threads;
15580
+ } break;
15581
+ case GGML_OP_FLASH_FF:
15582
+ {
15583
+ n_tasks = n_threads;
15584
+ } break;
15585
+ case GGML_OP_FLASH_ATTN_BACK:
15586
+ {
15587
+ n_tasks = n_threads;
15588
+ } break;
15589
+ case GGML_OP_WIN_PART:
15590
+ case GGML_OP_WIN_UNPART:
15591
+ case GGML_OP_GET_REL_POS:
15592
+ case GGML_OP_MAP_UNARY:
15593
+ case GGML_OP_MAP_BINARY:
15594
+ case GGML_OP_MAP_CUSTOM1_F32:
15595
+ case GGML_OP_MAP_CUSTOM2_F32:
15596
+ case GGML_OP_MAP_CUSTOM3_F32:
15597
+ {
15598
+ n_tasks = 1;
15599
+ } break;
15600
+ case GGML_OP_MAP_CUSTOM1:
15601
+ {
15602
+ struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
15603
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15604
+ n_tasks = n_threads;
15605
+ } else {
15606
+ n_tasks = MIN(p->n_tasks, n_threads);
15607
+ }
15608
+ } break;
15609
+ case GGML_OP_MAP_CUSTOM2:
15610
+ {
15611
+ struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
15612
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15613
+ n_tasks = n_threads;
15614
+ } else {
15615
+ n_tasks = MIN(p->n_tasks, n_threads);
15616
+ }
15617
+ } break;
15618
+ case GGML_OP_MAP_CUSTOM3:
15619
+ {
15620
+ struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
15621
+ if (p->n_tasks == GGML_N_TASKS_MAX) {
15622
+ n_tasks = n_threads;
15623
+ } else {
15624
+ n_tasks = MIN(p->n_tasks, n_threads);
15625
+ }
15626
+ } break;
15627
+ case GGML_OP_CROSS_ENTROPY_LOSS:
15628
+ {
15629
+ n_tasks = n_threads;
15630
+ } break;
15631
+ case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
15632
+ {
15633
+ n_tasks = n_threads;
15634
+ } break;
15635
+ case GGML_OP_NONE:
15636
+ {
15637
+ n_tasks = 1;
15638
+ } break;
15639
+ case GGML_OP_COUNT:
15640
+ {
15641
+ GGML_ASSERT(false);
15642
+ } break;
15643
+ default:
15644
+ {
15645
+ printf("%s: op %s not implemented\n", __func__, ggml_op_name(node->op));
15646
+ GGML_ASSERT(false);
15647
+ } break;
15648
+ }
15649
+
15650
+ assert(n_tasks > 0);
16178
15651
 
16179
- node->perf_runs++;
16180
- node->perf_cycles += cycles_cur;
16181
- node->perf_time_us += time_us_cur;
15652
+ return n_tasks;
16182
15653
  }
16183
15654
 
16184
15655
  static thread_ret_t ggml_graph_compute_thread(void * data) {
@@ -16187,7 +15658,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16187
15658
  const struct ggml_cgraph * cgraph = state->shared->cgraph;
16188
15659
  const struct ggml_cplan * cplan = state->shared->cplan;
16189
15660
 
16190
- const int * n_tasks_arr = cplan->n_tasks;
16191
15661
  const int n_threads = state->shared->n_threads;
16192
15662
 
16193
15663
  set_numa_thread_affinity(state->ith, n_threads);
@@ -16212,9 +15682,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16212
15682
 
16213
15683
  if (node_n != -1) {
16214
15684
  /* FINALIZE */
16215
- struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
15685
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16216
15686
  if (GGML_OP_HAS_FINALIZE[node->op]) {
16217
- params.nth = n_tasks_arr[node_n];
15687
+ params.nth = ggml_get_n_tasks(node, n_threads);
16218
15688
  ggml_compute_forward(&params, node);
16219
15689
  }
16220
15690
  ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -16225,7 +15695,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16225
15695
  GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16226
15696
 
16227
15697
  struct ggml_tensor * node = cgraph->nodes[node_n];
16228
- const int n_tasks = n_tasks_arr[node_n];
15698
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16229
15699
 
16230
15700
  state->shared->perf_node_start_cycles = ggml_perf_cycles();
16231
15701
  state->shared->perf_node_start_time_us = ggml_perf_time_us();
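
Task counts are no longer carried in the cplan: both the FINALIZE and COMPUTE paths above now ask ggml_get_n_tasks(node, n_threads) on the fly. A hedged sketch of a driver loop using the helper (hypothetical, without the real thread synchronization shown here):

    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads); // e.g. n_threads for MUL_MAT, 1 for POOL_2D
        params.nth = n_tasks;
        // ... run INIT single-threaded, then COMPUTE on n_tasks workers, then FINALIZE ...
    }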
@@ -16283,7 +15753,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16283
15753
 
16284
15754
  /* COMPUTE */
16285
15755
  struct ggml_tensor * node = cgraph->nodes[node_n];
16286
- const int n_tasks = n_tasks_arr[node_n];
15756
+ const int n_tasks = ggml_get_n_tasks(node, n_threads);
16287
15757
 
16288
15758
  struct ggml_compute_params params = {
16289
15759
  /*.type =*/ GGML_TASK_COMPUTE,
@@ -16317,121 +15787,46 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16317
15787
 
16318
15788
  struct ggml_tensor * node = cgraph->nodes[i];
16319
15789
 
15790
+ size_t cur = 0;
15791
+
16320
15792
  switch (node->op) {
16321
15793
  case GGML_OP_CPY:
16322
15794
  case GGML_OP_DUP:
16323
15795
  {
16324
15796
  n_tasks = n_threads;
16325
15797
 
16326
- size_t cur = 0;
16327
15798
  if (ggml_is_quantized(node->type)) {
16328
15799
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
16329
15800
  }
16330
-
16331
- work_size = MAX(work_size, cur);
16332
15801
  } break;
16333
15802
  case GGML_OP_ADD:
16334
15803
  case GGML_OP_ADD1:
16335
15804
  {
16336
15805
  n_tasks = n_threads;
16337
15806
 
16338
- size_t cur = 0;
16339
-
16340
15807
  if (ggml_is_quantized(node->src[0]->type)) {
16341
15808
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16342
15809
  }
16343
-
16344
- work_size = MAX(work_size, cur);
16345
15810
  } break;
16346
15811
  case GGML_OP_ACC:
16347
15812
  {
16348
15813
  n_tasks = n_threads;
16349
15814
 
16350
- size_t cur = 0;
16351
-
16352
15815
  if (ggml_is_quantized(node->src[0]->type)) {
16353
15816
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
16354
15817
  }
16355
-
16356
- work_size = MAX(work_size, cur);
16357
- } break;
16358
- case GGML_OP_SUB:
16359
- case GGML_OP_DIV:
16360
- case GGML_OP_SQR:
16361
- case GGML_OP_SQRT:
16362
- case GGML_OP_LOG:
16363
- case GGML_OP_SUM:
16364
- case GGML_OP_SUM_ROWS:
16365
- case GGML_OP_MEAN:
16366
- case GGML_OP_ARGMAX:
16367
- case GGML_OP_REPEAT:
16368
- case GGML_OP_REPEAT_BACK:
16369
- {
16370
- n_tasks = 1;
16371
- } break;
16372
-
16373
- case GGML_OP_UNARY:
16374
- {
16375
- switch (ggml_get_unary_op(node)) {
16376
- case GGML_UNARY_OP_ABS:
16377
- case GGML_UNARY_OP_SGN:
16378
- case GGML_UNARY_OP_NEG:
16379
- case GGML_UNARY_OP_STEP:
16380
- case GGML_UNARY_OP_TANH:
16381
- case GGML_UNARY_OP_ELU:
16382
- case GGML_UNARY_OP_RELU:
16383
- {
16384
- n_tasks = 1;
16385
- } break;
16386
-
16387
- case GGML_UNARY_OP_GELU:
16388
- case GGML_UNARY_OP_GELU_QUICK:
16389
- case GGML_UNARY_OP_SILU:
16390
- {
16391
- n_tasks = n_threads;
16392
- } break;
16393
- }
16394
15818
  } break;
16395
- case GGML_OP_SILU_BACK:
16396
- case GGML_OP_MUL:
16397
- case GGML_OP_NORM:
16398
- case GGML_OP_RMS_NORM:
16399
- case GGML_OP_RMS_NORM_BACK:
16400
- case GGML_OP_GROUP_NORM:
16401
- {
16402
- n_tasks = n_threads;
16403
- } break;
16404
- case GGML_OP_CONCAT:
16405
15819
  case GGML_OP_MUL_MAT:
16406
15820
  {
16407
- n_tasks = n_threads;
16408
-
16409
- // TODO: use different scheduling for different matrix sizes
16410
- //const int nr0 = ggml_nrows(node->src[0]);
16411
- //const int nr1 = ggml_nrows(node->src[1]);
16412
-
16413
- //n_tasks = MIN(n_threads, MAX(1, nr0/128));
16414
- //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
16415
-
16416
- size_t cur = 0;
16417
15821
  const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
16418
15822
 
16419
- #if defined(GGML_USE_CUBLAS)
16420
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
16421
- n_tasks = 1; // TODO: this actually is doing nothing
16422
- // the threads are still spinning
16423
- } else
16424
- #elif defined(GGML_USE_CLBLAST)
15823
+ #if defined(GGML_USE_CLBLAST)
16425
15824
  if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
16426
- n_tasks = 1; // TODO: this actually is doing nothing
16427
- // the threads are still spinning
16428
15825
  cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
16429
15826
  } else
16430
15827
  #endif
16431
15828
  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16432
15829
  if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16433
- n_tasks = 1; // TODO: this actually is doing nothing
16434
- // the threads are still spinning
16435
15830
  if (node->src[0]->type != GGML_TYPE_F32) {
16436
15831
  // here we need memory just for single 2D matrix from src0
16437
15832
  cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
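
In ggml_graph_plan the per-op scratch estimate now flows through a single `cur` declared once per node (the `size_t cur = 0;` added above); the repeated `work_size = MAX(work_size, cur);` lines are removed from the individual cases, presumably folded into one update after the switch, which falls outside this excerpt. Sketch of the resulting shape, under that assumption:

    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];

        size_t cur = 0;            // per-node scratch requirement
        switch (node->op) {
            // per-op estimates as in the cases above, e.g. for a quantized ADD:
            // cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
            default: break;
        }
        work_size = MAX(work_size, cur); // single fold replacing the per-case MAX calls
    }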
@@ -16440,108 +15835,18 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16440
15835
  #endif
16441
15836
  if (node->src[1]->type != vec_dot_type) {
16442
15837
  cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
16443
- } else {
16444
- cur = 0;
16445
15838
  }
16446
-
16447
- work_size = MAX(work_size, cur);
16448
15839
  } break;
16449
15840
  case GGML_OP_OUT_PROD:
16450
15841
  {
16451
15842
  n_tasks = n_threads;
16452
15843
 
16453
- size_t cur = 0;
16454
-
16455
15844
  if (ggml_is_quantized(node->src[0]->type)) {
16456
15845
  cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
16457
15846
  }
16458
-
16459
- work_size = MAX(work_size, cur);
16460
- } break;
16461
- case GGML_OP_SCALE:
16462
- {
16463
- n_tasks = 1;
16464
- } break;
16465
- case GGML_OP_SET:
16466
- case GGML_OP_CONT:
16467
- case GGML_OP_RESHAPE:
16468
- case GGML_OP_VIEW:
16469
- case GGML_OP_PERMUTE:
16470
- case GGML_OP_TRANSPOSE:
16471
- case GGML_OP_GET_ROWS:
16472
- case GGML_OP_GET_ROWS_BACK:
16473
- case GGML_OP_DIAG:
16474
- {
16475
- n_tasks = 1;
16476
- } break;
16477
- case GGML_OP_DIAG_MASK_ZERO:
16478
- case GGML_OP_DIAG_MASK_INF:
16479
- case GGML_OP_SOFT_MAX:
16480
- case GGML_OP_SOFT_MAX_BACK:
16481
- case GGML_OP_ROPE:
16482
- case GGML_OP_ROPE_BACK:
16483
- case GGML_OP_ADD_REL_POS:
16484
- {
16485
- n_tasks = n_threads;
16486
- } break;
16487
- case GGML_OP_ALIBI:
16488
- {
16489
- n_tasks = 1; //TODO
16490
- } break;
16491
- case GGML_OP_CLAMP:
16492
- {
16493
- n_tasks = 1; //TODO
16494
- } break;
16495
- case GGML_OP_CONV_1D:
16496
- {
16497
- n_tasks = n_threads;
16498
-
16499
- GGML_ASSERT(node->src[0]->ne[3] == 1);
16500
- GGML_ASSERT(node->src[1]->ne[2] == 1);
16501
- GGML_ASSERT(node->src[1]->ne[3] == 1);
16502
-
16503
- const int64_t ne00 = node->src[0]->ne[0];
16504
- const int64_t ne01 = node->src[0]->ne[1];
16505
- const int64_t ne02 = node->src[0]->ne[2];
16506
-
16507
- const int64_t ne10 = node->src[1]->ne[0];
16508
- const int64_t ne11 = node->src[1]->ne[1];
16509
-
16510
- const int64_t ne0 = node->ne[0];
16511
- const int64_t ne1 = node->ne[1];
16512
- const int64_t nk = ne00;
16513
- const int64_t ew0 = nk * ne01;
16514
-
16515
- UNUSED(ne02);
16516
- UNUSED(ne10);
16517
- UNUSED(ne11);
16518
-
16519
- size_t cur = 0;
16520
-
16521
- if (node->src[0]->type == GGML_TYPE_F16 &&
16522
- node->src[1]->type == GGML_TYPE_F32) {
16523
- cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
16524
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16525
- node->src[1]->type == GGML_TYPE_F32) {
16526
- cur = sizeof(float)*(ne0*ne1*ew0);
16527
- } else {
16528
- GGML_ASSERT(false);
16529
- }
16530
-
16531
- work_size = MAX(work_size, cur);
16532
- } break;
16533
- case GGML_OP_CONV_1D_STAGE_0:
16534
- {
16535
- n_tasks = n_threads;
16536
- } break;
16537
- case GGML_OP_CONV_1D_STAGE_1:
16538
- {
16539
- n_tasks = n_threads;
16540
15847
  } break;
16541
15848
  case GGML_OP_CONV_TRANSPOSE_1D:
16542
15849
  {
16543
- n_tasks = n_threads;
16544
-
16545
15850
  GGML_ASSERT(node->src[0]->ne[3] == 1);
16546
15851
  GGML_ASSERT(node->src[1]->ne[2] == 1);
16547
15852
  GGML_ASSERT(node->src[1]->ne[3] == 1);
@@ -16553,7 +15858,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16553
15858
  const int64_t ne10 = node->src[1]->ne[0]; // L
16554
15859
  const int64_t ne11 = node->src[1]->ne[1]; // Cin
16555
15860
 
16556
- size_t cur = 0;
16557
15861
  if (node->src[0]->type == GGML_TYPE_F16 &&
16558
15862
  node->src[1]->type == GGML_TYPE_F32) {
16559
15863
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
@@ -16565,59 +15869,13 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16565
15869
  } else {
16566
15870
  GGML_ASSERT(false);
16567
15871
  }
16568
-
16569
- work_size = MAX(work_size, cur);
16570
- } break;
16571
- case GGML_OP_CONV_2D:
16572
- {
16573
- n_tasks = n_threads;
16574
-
16575
- const int64_t ne00 = node->src[0]->ne[0]; // W
16576
- const int64_t ne01 = node->src[0]->ne[1]; // H
16577
- const int64_t ne02 = node->src[0]->ne[2]; // C
16578
- const int64_t ne03 = node->src[0]->ne[3]; // N
16579
-
16580
- const int64_t ne10 = node->src[1]->ne[0]; // W
16581
- const int64_t ne11 = node->src[1]->ne[1]; // H
16582
- const int64_t ne12 = node->src[1]->ne[2]; // C
16583
-
16584
- const int64_t ne0 = node->ne[0];
16585
- const int64_t ne1 = node->ne[1];
16586
- const int64_t ne2 = node->ne[2];
16587
- const int64_t ne3 = node->ne[3];
16588
- const int64_t nk = ne00*ne01;
16589
- const int64_t ew0 = nk * ne02;
16590
-
16591
- UNUSED(ne03);
16592
- UNUSED(ne2);
16593
-
16594
- size_t cur = 0;
16595
-
16596
- if (node->src[0]->type == GGML_TYPE_F16 &&
16597
- node->src[1]->type == GGML_TYPE_F32) {
16598
- // im2col: [N*OH*OW, IC*KH*KW]
16599
- cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
16600
- } else if (node->src[0]->type == GGML_TYPE_F32 &&
16601
- node->src[1]->type == GGML_TYPE_F32) {
16602
- cur = sizeof(float)* (ne10*ne11*ne12);
16603
- } else {
16604
- GGML_ASSERT(false);
16605
- }
16606
-
16607
- work_size = MAX(work_size, cur);
16608
- } break;
16609
- case GGML_OP_CONV_2D_STAGE_0:
16610
- {
16611
- n_tasks = n_threads;
16612
15872
  } break;
16613
- case GGML_OP_CONV_2D_STAGE_1:
15873
+ case GGML_OP_IM2COL:
16614
15874
  {
16615
15875
  n_tasks = n_threads;
16616
15876
  } break;
16617
15877
  case GGML_OP_CONV_TRANSPOSE_2D:
16618
15878
  {
16619
- n_tasks = n_threads;
16620
-
16621
15879
  const int64_t ne00 = node->src[0]->ne[0]; // W
16622
15880
  const int64_t ne01 = node->src[0]->ne[1]; // H
16623
15881
  const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
@@ -16627,141 +15885,66 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16627
15885
  const int64_t ne11 = node->src[1]->ne[1]; // H
16628
15886
  const int64_t ne12 = node->src[1]->ne[2]; // Channels In
16629
15887
 
16630
- size_t cur = 0;
16631
15888
  cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
16632
15889
  cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
16633
-
16634
- work_size = MAX(work_size, cur);
16635
- } break;
16636
- case GGML_OP_POOL_1D:
16637
- case GGML_OP_POOL_2D:
16638
- {
16639
- n_tasks = 1;
16640
- } break;
16641
- case GGML_OP_UPSCALE:
16642
- {
16643
- n_tasks = n_threads;
16644
15890
  } break;
16645
15891
  case GGML_OP_FLASH_ATTN:
16646
15892
  {
16647
15893
  n_tasks = n_threads;
16648
15894
 
16649
- size_t cur = 0;
16650
-
16651
15895
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16652
15896
 
16653
15897
  if (node->src[1]->type == GGML_TYPE_F32) {
16654
15898
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16655
15899
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16656
- }
16657
-
16658
- if (node->src[1]->type == GGML_TYPE_F16) {
15900
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16659
15901
  cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
16660
15902
  cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
16661
15903
  }
16662
-
16663
- work_size = MAX(work_size, cur);
16664
15904
  } break;
16665
15905
  case GGML_OP_FLASH_FF:
16666
15906
  {
16667
15907
  n_tasks = n_threads;
16668
15908
 
16669
- size_t cur = 0;
16670
-
16671
15909
  if (node->src[1]->type == GGML_TYPE_F32) {
16672
15910
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16673
15911
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16674
- }
16675
-
16676
- if (node->src[1]->type == GGML_TYPE_F16) {
15912
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16677
15913
  cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
16678
15914
  cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
16679
15915
  }
16680
-
16681
- work_size = MAX(work_size, cur);
16682
15916
  } break;
16683
15917
  case GGML_OP_FLASH_ATTN_BACK:
16684
15918
  {
16685
15919
  n_tasks = n_threads;
16686
15920
 
16687
- size_t cur = 0;
16688
-
16689
15921
  const int64_t D = node->src[0]->ne[0];
16690
15922
  const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
16691
15923
  const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
16692
15924
  if (node->src[1]->type == GGML_TYPE_F32) {
16693
15925
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16694
15926
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16695
- }
16696
-
16697
- if (node->src[1]->type == GGML_TYPE_F16) {
15927
+ } else if (node->src[1]->type == GGML_TYPE_F16) {
16698
15928
  cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
16699
15929
  cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
16700
15930
  }
16701
-
16702
- work_size = MAX(work_size, cur);
16703
- } break;
16704
- case GGML_OP_WIN_PART:
16705
- case GGML_OP_WIN_UNPART:
16706
- case GGML_OP_GET_REL_POS:
16707
- case GGML_OP_MAP_UNARY:
16708
- case GGML_OP_MAP_BINARY:
16709
- case GGML_OP_MAP_CUSTOM1_F32:
16710
- case GGML_OP_MAP_CUSTOM2_F32:
16711
- case GGML_OP_MAP_CUSTOM3_F32:
16712
- {
16713
- n_tasks = 1;
16714
- } break;
16715
- case GGML_OP_MAP_CUSTOM1:
16716
- {
16717
- struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
16718
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16719
- n_tasks = n_threads;
16720
- } else {
16721
- n_tasks = MIN(p->n_tasks, n_threads);
16722
- }
16723
- } break;
16724
- case GGML_OP_MAP_CUSTOM2:
16725
- {
16726
- struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
16727
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16728
- n_tasks = n_threads;
16729
- } else {
16730
- n_tasks = MIN(p->n_tasks, n_threads);
16731
- }
16732
- } break;
16733
- case GGML_OP_MAP_CUSTOM3:
16734
- {
16735
- struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
16736
- if (p->n_tasks == GGML_N_TASKS_MAX) {
16737
- n_tasks = n_threads;
16738
- } else {
16739
- n_tasks = MIN(p->n_tasks, n_threads);
16740
- }
16741
15931
  } break;
15932
+
16742
15933
  case GGML_OP_CROSS_ENTROPY_LOSS:
16743
15934
  {
16744
15935
  n_tasks = n_threads;
16745
15936
 
16746
- size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16747
-
16748
- work_size = MAX(work_size, cur);
16749
- } break;
16750
- case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
16751
- {
16752
- n_tasks = n_threads;
16753
- } break;
16754
- case GGML_OP_NONE:
16755
- {
16756
- n_tasks = 1;
15937
+ cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
16757
15938
  } break;
16758
15939
  case GGML_OP_COUNT:
16759
15940
  {
16760
15941
  GGML_ASSERT(false);
16761
15942
  } break;
15943
+ default:
15944
+ break;
16762
15945
  }
16763
15946
 
16764
- cplan.n_tasks[i] = n_tasks;
15947
+ work_size = MAX(work_size, cur);
16765
15948
  }
16766
15949
 
16767
15950
  if (work_size > 0) {
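For reference, a minimal caller-side sketch of the plan/compute flow this refactor feeds into: every case now only fills `cur`, and a single `work_size = MAX(work_size, cur)` after the switch determines the size reported in the returned plan. The `ggml_graph_plan`/`ggml_graph_compute` calls and the `work_size`/`work_data` fields appear in this file; the malloc-based scratch allocation is an illustrative assumption, not something this diff prescribes.

    #include "ggml.h"
    #include <stdint.h>
    #include <stdlib.h>

    // sketch: plan the graph, hand the plan a caller-owned work buffer, then compute
    static void compute_graph(struct ggml_cgraph * graph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

        uint8_t * work = NULL;
        if (plan.work_size > 0) {
            work = malloc(plan.work_size); // sized by the MAX(work_size, cur) accumulation above
            plan.work_data = work;
        }

        ggml_graph_compute(graph, &plan);
        free(work);
    }

The `ggml_graph_compute_with_ctx` wrapper that appears further down builds the plan the same way but takes its work buffer from a `ggml_context` instead of `malloc`.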
@@ -16783,12 +15966,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16783
15966
  if (cplan->work_size > 0) {
16784
15967
  GGML_ASSERT(cplan->work_data);
16785
15968
  }
16786
-
16787
- for (int i = 0; i < cgraph->n_nodes; ++i) {
16788
- if (cgraph->nodes[i]->op != GGML_OP_NONE) {
16789
- GGML_ASSERT(cplan->n_tasks[i] > 0);
16790
- }
16791
- }
16792
15969
  }
16793
15970
 
16794
15971
  const int n_threads = cplan->n_threads;
@@ -16861,16 +16038,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
16861
16038
  return compute_status;
16862
16039
  }
16863
16040
 
16864
- void ggml_graph_reset(struct ggml_cgraph * cgraph) {
16865
- for (int i = 0; i < cgraph->n_nodes; i++) {
16866
- struct ggml_tensor * grad = cgraph->grads[i];
16867
-
16868
- if (grad) {
16869
- ggml_set_zero(grad);
16870
- }
16871
- }
16872
- }
16873
-
16874
16041
  void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
16875
16042
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
16876
16043
 
@@ -16997,12 +16164,12 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
16997
16164
  const uint32_t magic = GGML_FILE_MAGIC;
16998
16165
  const uint32_t version = GGML_FILE_VERSION;
16999
16166
  const uint32_t n_leafs = cgraph->n_leafs;
17000
- const uint32_t nodes = cgraph->n_nodes;
16167
+ const uint32_t n_nodes = cgraph->n_nodes;
17001
16168
 
17002
16169
  fwrite(&magic, sizeof(uint32_t), 1, fout);
17003
16170
  fwrite(&version, sizeof(uint32_t), 1, fout);
17004
16171
  fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
17005
- fwrite(&nodes, sizeof(uint32_t), 1, fout);
16172
+ fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
17006
16173
  fwrite(&size_eval, sizeof(uint64_t), 1, fout);
17007
16174
  }
17008
16175
 
@@ -17090,7 +16257,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17090
16257
  if (idx == -1) {
17091
16258
  for (int k = 0; k < cgraph->n_nodes; ++k) {
17092
16259
  if (args[j] == cgraph->nodes[k]) {
17093
- idx = GGML_MAX_NODES + k;
16260
+ idx = cgraph->n_leafs + k;
17094
16261
  break;
17095
16262
  }
17096
16263
  }
@@ -17117,11 +16284,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
17117
16284
  }
17118
16285
  }
17119
16286
 
17120
- struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
16287
+ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
17121
16288
  assert(*ctx_data == NULL);
17122
16289
  assert(*ctx_eval == NULL);
17123
16290
 
17124
- struct ggml_cgraph result = { 0 };
16291
+ struct ggml_cgraph * result = NULL;
17125
16292
 
17126
16293
  struct ggml_tensor * data = NULL;
17127
16294
 
@@ -17193,13 +16360,11 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17193
16360
  const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
17194
16361
  const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
17195
16362
  const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
17196
-
17197
- result.n_leafs = n_leafs;
17198
- result.n_nodes = n_nodes;
16363
+ const int graph_size = MAX(n_leafs, n_nodes);
17199
16364
 
17200
16365
  // create the data context
17201
16366
  {
17202
- const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
16367
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
17203
16368
 
17204
16369
  struct ggml_init_params params = {
17205
16370
  .mem_size = size_eval + overhead,
@@ -17215,6 +16380,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17215
16380
  }
17216
16381
  }
17217
16382
 
16383
+ result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
16384
+
16385
+ result->n_leafs = n_leafs;
16386
+ result->n_nodes = n_nodes;
16387
+
16388
+
17218
16389
  // leafs
17219
16390
  {
17220
16391
  uint32_t type;
@@ -17253,7 +16424,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17253
16424
  tensor->nb[j] = nb[j];
17254
16425
  }
17255
16426
 
17256
- result.leafs[i] = tensor;
16427
+ result->leafs[i] = tensor;
17257
16428
 
17258
16429
  ptr += ggml_nbytes(tensor);
17259
16430
 
@@ -17305,10 +16476,10 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17305
16476
  continue;
17306
16477
  }
17307
16478
 
17308
- if (arg_idx < GGML_MAX_NODES) {
17309
- args[j] = result.leafs[arg_idx];
16479
+ if (arg_idx < result->n_leafs) {
16480
+ args[j] = result->leafs[arg_idx];
17310
16481
  } else {
17311
- args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
16482
+ args[j] = result->nodes[arg_idx - result->n_leafs];
17312
16483
  }
17313
16484
  }
17314
16485
 
@@ -17360,7 +16531,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
17360
16531
  tensor->src[j] = args[j];
17361
16532
  }
17362
16533
 
17363
- result.nodes[i] = tensor;
16534
+ result->nodes[i] = tensor;
17364
16535
 
17365
16536
  fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
17366
16537
  }
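Since `ggml_graph_import` now returns a `struct ggml_cgraph *` allocated inside `ctx_eval` rather than a by-value graph, caller code changes shape accordingly. A short sketch; the file-name parameter, the NULL check, and the cleanup order are illustrative assumptions rather than behaviour this diff specifies.

    #include "ggml.h"
    #include <stdio.h>

    static void load_graph(const char * fname) {
        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;

        // the graph object itself is now allocated inside ctx_eval
        struct ggml_cgraph * gf = ggml_graph_import(fname, &ctx_data, &ctx_eval);
        if (gf) { // defensive check; failure behaviour is not shown in this hunk
            printf("imported %d leafs and %d nodes\n", gf->n_leafs, gf->n_nodes);
        }

        // both contexts must stay alive for as long as the graph is in use
        ggml_free(ctx_eval);
        ggml_free(ctx_data);
    }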
@@ -18265,10 +17436,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18265
17436
  case GGML_OPT_ADAM:
18266
17437
  {
18267
17438
  result = (struct ggml_opt_params) {
18268
- .type = GGML_OPT_ADAM,
18269
- .n_threads = 1,
18270
- .past = 0,
18271
- .delta = 1e-5f,
17439
+ .type = GGML_OPT_ADAM,
17440
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17441
+ .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
17442
+ .past = 0,
17443
+ .delta = 1e-5f,
18272
17444
 
18273
17445
  .max_no_improvement = 100,
18274
17446
 
@@ -18295,10 +17467,11 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
18295
17467
  case GGML_OPT_LBFGS:
18296
17468
  {
18297
17469
  result = (struct ggml_opt_params) {
18298
- .type = GGML_OPT_LBFGS,
18299
- .n_threads = 1,
18300
- .past = 0,
18301
- .delta = 1e-5f,
17470
+ .type = GGML_OPT_LBFGS,
17471
+ .graph_size = GGML_DEFAULT_GRAPH_SIZE,
17472
+ .n_threads = 1,
17473
+ .past = 0,
17474
+ .delta = 1e-5f,
18302
17475
 
18303
17476
  .max_no_improvement = 0,
18304
17477
 
@@ -18440,14 +17613,11 @@ enum ggml_opt_result ggml_opt_resume(
18440
17613
  struct ggml_tensor * f) {
18441
17614
 
18442
17615
  // build forward + backward compute graphs
18443
- struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18444
- struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0));
18445
-
18446
- struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
18447
- struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
17616
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
17617
+ ggml_build_forward_expand(gf, f);
18448
17618
 
18449
- *gf = ggml_build_forward (f);
18450
- *gb = ggml_build_backward(ctx, gf, true);
17619
+ struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
17620
+ ggml_build_backward_expand(ctx, gf, gb, true);
18451
17621
 
18452
17622
  return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
18453
17623
  }
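The optimizer defaults above gain a `graph_size` field, which `ggml_opt_resume` passes to `ggml_new_graph_custom` when building the forward and backward graphs. A caller-side sketch; the thread count and the suggestion to raise `graph_size` for larger graphs are illustrative assumptions.

    #include "ggml.h"

    // sketch: configure optimizer params with the new graph size hint
    static struct ggml_opt_params make_opt_params(void) {
        struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
        params.graph_size = GGML_DEFAULT_GRAPH_SIZE; // raise this if the training graph needs more nodes
        params.n_threads  = 4;
        return params;      // then passed to the ggml optimizer entry points as before
    }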
@@ -18903,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18903
18073
  {
18904
18074
  ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
18905
18075
 
18906
- for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
18076
+ for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
18907
18077
  struct gguf_kv * kv = &ctx->kv[i];
18908
18078
 
18909
18079
  //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18950,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18950
18120
  case GGUF_TYPE_STRING:
18951
18121
  {
18952
18122
  kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
18953
- for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
18123
+ for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
18954
18124
  ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
18955
18125
  }
18956
18126
  } break;
@@ -18978,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
18978
18148
  {
18979
18149
  ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
18980
18150
 
18981
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18151
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
18982
18152
  struct gguf_tensor_info * info = &ctx->infos[i];
18983
18153
 
18984
18154
  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -19025,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19025
18195
  // compute the total size of the data section, taking into account the alignment
19026
18196
  {
19027
18197
  ctx->size = 0;
19028
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18198
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
19029
18199
  struct gguf_tensor_info * info = &ctx->infos[i];
19030
18200
 
19031
18201
  const int64_t ne =
@@ -19094,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
19094
18264
  ggml_set_no_alloc(ctx_data, true);
19095
18265
 
19096
18266
  // create the tensors
19097
- for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
18267
+ for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
19098
18268
  const int64_t ne[GGML_MAX_DIMS] = {
19099
18269
  ctx->infos[i].ne[0],
19100
18270
  ctx->infos[i].ne[1],