llama_cpp 0.2.2 → 0.3.1: changes to the bundled ggml.c

@@ -1,5 +1,5 @@
1
- // Defines CLOCK_MONOTONIC on Linux
2
- #define _GNU_SOURCE
1
+ #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
2
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
3
3
 
4
4
  #include "ggml.h"
5
5
 
@@ -24,6 +24,7 @@
24
24
  #include <stdio.h>
25
25
  #include <float.h>
26
26
  #include <limits.h>
27
+ #include <stdarg.h>
27
28
 
28
29
  #ifdef GGML_USE_METAL
29
30
  #include <unistd.h>
@@ -90,6 +91,11 @@ static int sched_yield (void) {
90
91
  #include <stdatomic.h>
91
92
 
92
93
  typedef void* thread_ret_t;
94
+
95
+ #include <sys/types.h>
96
+ #include <sys/stat.h>
97
+ #include <unistd.h>
98
+
93
99
  #endif
94
100
 
95
101
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -118,6 +124,30 @@ typedef void* thread_ret_t;
118
124
  #define GGML_SOFT_MAX_UNROLL 4
119
125
  #define GGML_VEC_DOT_UNROLL 2
120
126
 
127
+ //
128
+ // logging
129
+ //
130
+
131
+ #if (GGML_DEBUG >= 1)
132
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
133
+ #else
134
+ #define GGML_PRINT_DEBUG(...)
135
+ #endif
136
+
137
+ #if (GGML_DEBUG >= 5)
138
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
139
+ #else
140
+ #define GGML_PRINT_DEBUG_5(...)
141
+ #endif
142
+
143
+ #if (GGML_DEBUG >= 10)
144
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
145
+ #else
146
+ #define GGML_PRINT_DEBUG_10(...)
147
+ #endif
148
+
149
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
150
+
121
151
  #ifdef GGML_USE_ACCELERATE
122
152
  // uncomment to use vDSP for soft max computation
123
153
  // note: not sure if it is actually faster
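
The GGML_PRINT_DEBUG* macros added above expand to printf only when GGML_DEBUG is defined at or above the corresponding level, and to nothing otherwise, so debug logging costs nothing in normal builds. A minimal standalone sketch of the same pattern, assuming the level is supplied on the compiler command line (the MY_* names are illustrative, not ggml identifiers):

    // Compile with e.g. `cc -DMY_DEBUG=5 demo.c` to enable levels 1..5.
    #include <stdio.h>

    #ifndef MY_DEBUG
    #define MY_DEBUG 0
    #endif

    #if (MY_DEBUG >= 1)
    #define MY_PRINT_DEBUG(...) printf(__VA_ARGS__)
    #else
    #define MY_PRINT_DEBUG(...)
    #endif

    #if (MY_DEBUG >= 5)
    #define MY_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
    #else
    #define MY_PRINT_DEBUG_5(...)
    #endif

    int main(void) {
        MY_PRINT_DEBUG("level 1 message: printed when MY_DEBUG >= 1\n");
        MY_PRINT_DEBUG_5("level 5 message: printed only when MY_DEBUG >= 5\n");
        return 0;
    }
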
@@ -130,6 +160,34 @@ typedef void* thread_ret_t;
130
160
  #define GGML_MEM_ALIGN 16
131
161
  #endif
132
162
 
163
+ //
164
+ // logging
165
+ //
166
+
167
+ #if (GGML_DEBUG >= 1)
168
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
169
+ #else
170
+ #define GGML_PRINT_DEBUG(...)
171
+ #endif
172
+
173
+ #if (GGML_DEBUG >= 5)
174
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
175
+ #else
176
+ #define GGML_PRINT_DEBUG_5(...)
177
+ #endif
178
+
179
+ #if (GGML_DEBUG >= 10)
180
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
181
+ #else
182
+ #define GGML_PRINT_DEBUG_10(...)
183
+ #endif
184
+
185
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
186
+
187
+ //
188
+ // end of logging block
189
+ //
190
+
133
191
  #if defined(_MSC_VER) || defined(__MINGW32__)
134
192
  #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
135
193
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
@@ -143,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
143
201
  #endif
144
202
  if (result != 0) {
145
203
  // Handle allocation failure
204
+ const char *error_desc = "unknown allocation error";
205
+ switch (result) {
206
+ case EINVAL:
207
+ error_desc = "invalid alignment value";
208
+ break;
209
+ case ENOMEM:
210
+ error_desc = "insufficient memory";
211
+ break;
212
+ }
213
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
214
+ __func__, error_desc, size/(1024.0*1024.0));
146
215
  return NULL;
147
216
  }
148
217
  return aligned_memory;
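
The new error path relies on posix_memalign reporting failure through its return value (EINVAL for a bad alignment, ENOMEM when memory runs out) rather than through errno, which is why the code switches on `result` directly. A self-contained sketch of the same reporting pattern, with illustrative names rather than ggml's:

    #define _POSIX_C_SOURCE 200112L  // for posix_memalign
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void * aligned_alloc_or_report(size_t align, size_t size) {
        void * ptr = NULL;
        const int rc = posix_memalign(&ptr, align, size);
        if (rc != 0) {
            const char * why = "unknown allocation error";
            switch (rc) {
                case EINVAL: why = "invalid alignment value"; break;
                case ENOMEM: why = "insufficient memory";     break;
            }
            fprintf(stderr, "%s: %s (attempted to allocate %.2f MB)\n",
                    __func__, why, size/(1024.0*1024.0));
            return NULL;
        }
        return ptr;
    }

    int main(void) {
        void * p = aligned_alloc_or_report(16, 1024);  // alignment must be a power of two multiple of sizeof(void*)
        free(p);
        return 0;
    }
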
@@ -419,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
419
488
  }
420
489
  }
421
490
 
422
-
423
491
  //
424
492
  // timing
425
493
  //
@@ -482,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
482
550
  #define ggml_perf_cycles_per_ms() 0
483
551
  #endif
484
552
 
553
+
485
554
  //
486
555
  // cache line
487
556
  //
@@ -3529,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
3529
3598
  *s = 1.f/(*s);
3530
3599
  }
3531
3600
 
3532
- //
3533
- // logging
3534
- //
3535
-
3536
- #if (GGML_DEBUG >= 1)
3537
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
3538
- #else
3539
- #define GGML_PRINT_DEBUG(...)
3540
- #endif
3541
-
3542
- #if (GGML_DEBUG >= 5)
3543
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
3544
- #else
3545
- #define GGML_PRINT_DEBUG_5(...)
3546
- #endif
3547
-
3548
- #if (GGML_DEBUG >= 10)
3549
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
3550
- #else
3551
- #define GGML_PRINT_DEBUG_10(...)
3552
- #endif
3553
-
3554
- #define GGML_PRINT(...) printf(__VA_ARGS__)
3555
-
3556
3601
  //
3557
3602
  // data types
3558
3603
  //
@@ -3712,11 +3757,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3712
3757
  "MAP_UNARY",
3713
3758
  "MAP_BINARY",
3714
3759
 
3760
+ "MAP_CUSTOM1",
3761
+ "MAP_CUSTOM2",
3762
+ "MAP_CUSTOM3",
3763
+
3715
3764
  "CROSS_ENTROPY_LOSS",
3716
3765
  "CROSS_ENTROPY_LOSS_BACK",
3717
3766
  };
3718
3767
 
3719
- static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3768
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3720
3769
 
3721
3770
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3722
3771
  "none",
@@ -3784,11 +3833,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3784
3833
  "f(x)",
3785
3834
  "f(x,y)",
3786
3835
 
3836
+ "custom(x)",
3837
+ "custom(x,y)",
3838
+ "custom(x,y,z)",
3839
+
3787
3840
  "cross_entropy_loss(x,y)",
3788
3841
  "cross_entropy_loss_back(x,y)",
3789
3842
  };
3790
3843
 
3791
- static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3844
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3792
3845
 
3793
3846
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3794
3847
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3819,12 +3872,31 @@ struct ggml_context_container {
3819
3872
  struct ggml_context context;
3820
3873
  };
3821
3874
 
3875
+ //
3876
+ // NUMA support
3877
+ //
3878
+
3879
+ #define GGML_NUMA_MAX_NODES 8
3880
+ #define GGML_NUMA_MAX_CPUS 512
3881
+
3882
+ struct ggml_numa_node {
3883
+ uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
3884
+ uint32_t n_cpus;
3885
+ };
3886
+
3887
+ struct ggml_numa_nodes {
3888
+ struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
3889
+ uint32_t n_nodes;
3890
+ uint32_t total_cpus; // hardware threads on system
3891
+ };
3892
+
3822
3893
  //
3823
3894
  // ggml state
3824
3895
  //
3825
3896
 
3826
3897
  struct ggml_state {
3827
3898
  struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
3899
+ struct ggml_numa_nodes numa;
3828
3900
  };
3829
3901
 
3830
3902
  // global state
@@ -3849,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
3849
3921
  atomic_fetch_sub(&g_state_barrier, 1);
3850
3922
  }
3851
3923
 
3924
+ void ggml_numa_init(void) {
3925
+ if (g_state.numa.n_nodes > 0) {
3926
+ fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
3927
+
3928
+ return;
3929
+ }
3930
+
3931
+ #ifdef __linux__
3932
+ struct stat st;
3933
+ char path[256];
3934
+ int rv;
3935
+
3936
+ // enumerate nodes
3937
+ while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
3938
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
3939
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3940
+ if (stat(path, &st) != 0) { break; }
3941
+ ++g_state.numa.n_nodes;
3942
+ }
3943
+
3944
+ // enumerate CPUs
3945
+ while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
3946
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
3947
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3948
+ if (stat(path, &st) != 0) { break; }
3949
+ ++g_state.numa.total_cpus;
3950
+ }
3951
+
3952
+ GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
3953
+
3954
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
3955
+ g_state.numa.n_nodes = 0;
3956
+ return;
3957
+ }
3958
+
3959
+ for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
3960
+ struct ggml_numa_node * node = &g_state.numa.nodes[n];
3961
+ GGML_PRINT_DEBUG("CPUs on node %u:", n);
3962
+ node->n_cpus = 0;
3963
+ for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
3964
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
3965
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3966
+ if (stat(path, &st) == 0) {
3967
+ node->cpus[node->n_cpus++] = c;
3968
+ GGML_PRINT_DEBUG(" %u", c);
3969
+ }
3970
+ }
3971
+ GGML_PRINT_DEBUG("\n");
3972
+ }
3973
+
3974
+ if (ggml_is_numa()) {
3975
+ FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
3976
+ if (fptr != NULL) {
3977
+ char buf[42];
3978
+ if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
3979
+ GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
3980
+ }
3981
+ fclose(fptr);
3982
+ }
3983
+ }
3984
+ #else
3985
+ // TODO
3986
+ #endif
3987
+ }
3988
+
3989
+ bool ggml_is_numa(void) {
3990
+ return g_state.numa.n_nodes > 1;
3991
+ }
3992
+
3852
3993
  ////////////////////////////////////////////////////////////////////////////////
3853
3994
 
3854
3995
  void ggml_print_object(const struct ggml_object * obj) {
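
ggml_numa_init() walks /sys/devices/system/node and /sys/devices/system/cpu once to count nodes and hardware threads, and ggml_is_numa() then reports whether more than one node was found. A hedged sketch of how a caller might use the new pair, assuming both functions are exported through ggml.h as the rest of this diff suggests:

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // Typically only called when the user explicitly asks for NUMA-aware scheduling.
        ggml_numa_init();

        if (ggml_is_numa()) {
            printf("multiple NUMA nodes detected; worker threads will be pinned per node\n");
        } else {
            printf("single NUMA node, or probing not supported on this OS\n");
        }
        return 0;
    }
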
@@ -4105,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4105
4246
 
4106
4247
  g_state = (struct ggml_state) {
4107
4248
  /*.contexts =*/ { { 0 } },
4249
+ /*.numa =*/ {
4250
+ .n_nodes = 0,
4251
+ .total_cpus = 0,
4252
+ },
4108
4253
  };
4109
4254
 
4110
4255
  for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -4734,10 +4879,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
4734
4879
  return tensor;
4735
4880
  }
4736
4881
 
4882
+ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
4883
+ va_list args;
4884
+ va_start(args, fmt);
4885
+ vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
4886
+ va_end(args);
4887
+ return tensor;
4888
+ }
4889
+
4737
4890
  struct ggml_tensor * ggml_view_tensor(
4738
4891
  struct ggml_context * ctx,
4739
4892
  const struct ggml_tensor * src) {
4740
4893
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
4894
+ ggml_format_name(result, "%s (view)", src->name);
4741
4895
 
4742
4896
  result->nb[0] = src->nb[0];
4743
4897
  result->nb[1] = src->nb[1];
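
ggml_format_name() is a printf-style companion to ggml_set_name(); vsnprintf truncates the result to the fixed size of tensor->name. A short usage sketch, assuming the function is exported through ggml.h (the naming scheme and shapes here are only an example):

    #include "ggml.h"

    // Give every per-layer weight a readable name for graph dumps and debugging.
    static void name_layer_weights(struct ggml_context * ctx, int n_layers) {
        for (int il = 0; il < n_layers; ++il) {
            struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
            ggml_format_name(w, "layers.%d.attention.wq", il);
        }
    }
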
@@ -5899,6 +6053,11 @@ struct ggml_tensor * ggml_cpy_impl(
5899
6053
 
5900
6054
  // make a view of the destination
5901
6055
  struct ggml_tensor * result = ggml_view_tensor(ctx, b);
6056
+ if (strlen(b->name) > 0) {
6057
+ ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
6058
+ } else {
6059
+ ggml_format_name(result, "%s (copy)", a->name);
6060
+ }
5902
6061
 
5903
6062
  result->op = GGML_OP_CPY;
5904
6063
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5935,6 +6094,7 @@ struct ggml_tensor * ggml_cont_impl(
5935
6094
  }
5936
6095
 
5937
6096
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6097
+ ggml_format_name(result, "%s (cont)", a->name);
5938
6098
 
5939
6099
  result->op = GGML_OP_CONT;
5940
6100
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5978,6 +6138,7 @@ struct ggml_tensor * ggml_reshape(
5978
6138
  }
5979
6139
 
5980
6140
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6141
+ ggml_format_name(result, "%s (reshaped)", a->name);
5981
6142
 
5982
6143
  result->op = GGML_OP_RESHAPE;
5983
6144
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6002,6 +6163,7 @@ struct ggml_tensor * ggml_reshape_1d(
6002
6163
 
6003
6164
  const int64_t ne[1] = { ne0 };
6004
6165
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6166
+ ggml_format_name(result, "%s (reshaped)", a->name);
6005
6167
 
6006
6168
  result->op = GGML_OP_RESHAPE;
6007
6169
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6027,6 +6189,7 @@ struct ggml_tensor * ggml_reshape_2d(
6027
6189
 
6028
6190
  const int64_t ne[2] = { ne0, ne1 };
6029
6191
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6192
+ ggml_format_name(result, "%s (reshaped)", a->name);
6030
6193
 
6031
6194
  result->op = GGML_OP_RESHAPE;
6032
6195
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6053,6 +6216,7 @@ struct ggml_tensor * ggml_reshape_3d(
6053
6216
 
6054
6217
  const int64_t ne[3] = { ne0, ne1, ne2 };
6055
6218
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6219
+ ggml_format_name(result, "%s (reshaped)", a->name);
6056
6220
 
6057
6221
  result->op = GGML_OP_RESHAPE;
6058
6222
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6081,6 +6245,7 @@ struct ggml_tensor * ggml_reshape_4d(
6081
6245
 
6082
6246
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6083
6247
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6248
+ ggml_format_name(result, "%s (reshaped)", a->name);
6084
6249
 
6085
6250
  result->op = GGML_OP_RESHAPE;
6086
6251
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6105,10 +6270,12 @@ struct ggml_tensor * ggml_view_1d(
6105
6270
  }
6106
6271
 
6107
6272
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6273
+ ggml_format_name(result, "%s (view)", a->name);
6108
6274
 
6109
6275
  ggml_scratch_save(ctx);
6110
6276
 
6111
6277
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6278
+ ggml_set_name(offs, "offset");
6112
6279
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6113
6280
 
6114
6281
  ggml_scratch_load(ctx);
@@ -6141,10 +6308,12 @@ struct ggml_tensor * ggml_view_2d(
6141
6308
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6142
6309
 
6143
6310
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6311
+ ggml_format_name(result, "%s (view)", a->name);
6144
6312
 
6145
6313
  ggml_scratch_save(ctx);
6146
6314
 
6147
6315
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6316
+ ggml_set_name(offs, "offset");
6148
6317
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6149
6318
 
6150
6319
  ggml_scratch_load(ctx);
@@ -6183,10 +6352,12 @@ struct ggml_tensor * ggml_view_3d(
6183
6352
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6184
6353
 
6185
6354
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6355
+ ggml_format_name(result, "%s (view)", a->name);
6186
6356
 
6187
6357
  ggml_scratch_save(ctx);
6188
6358
 
6189
6359
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6360
+ ggml_set_name(offs, "offset");
6190
6361
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6191
6362
 
6192
6363
  ggml_scratch_load(ctx);
@@ -6227,10 +6398,12 @@ struct ggml_tensor * ggml_view_4d(
6227
6398
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6228
6399
 
6229
6400
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6401
+ ggml_format_name(result, "%s (view)", a->name);
6230
6402
 
6231
6403
  ggml_scratch_save(ctx);
6232
6404
 
6233
6405
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6406
+ ggml_set_name(offs, "offset");
6234
6407
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6235
6408
 
6236
6409
  ggml_scratch_load(ctx);
@@ -6276,6 +6449,7 @@ struct ggml_tensor * ggml_permute(
6276
6449
  }
6277
6450
 
6278
6451
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6452
+ ggml_format_name(result, "%s (permuted)", a->name);
6279
6453
 
6280
6454
  int ne[GGML_MAX_DIMS];
6281
6455
  int nb[GGML_MAX_DIMS];
@@ -6335,6 +6509,7 @@ struct ggml_tensor * ggml_transpose(
6335
6509
  }
6336
6510
 
6337
6511
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6512
+ ggml_format_name(result, "%s (transposed)", a->name);
6338
6513
 
6339
6514
  result->ne[0] = a->ne[1];
6340
6515
  result->ne[1] = a->ne[0];
@@ -6603,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
6603
6778
  int n_past,
6604
6779
  int n_dims,
6605
6780
  int mode,
6781
+ int n_ctx,
6606
6782
  bool inplace) {
6607
6783
  GGML_ASSERT(n_past >= 0);
6608
6784
  bool is_node = false;
@@ -6615,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
6615
6791
 
6616
6792
  ggml_scratch_save(ctx);
6617
6793
 
6618
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6794
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6619
6795
 
6620
6796
  ((int32_t *) b->data)[0] = n_past;
6621
6797
  ((int32_t *) b->data)[1] = n_dims;
6622
6798
  ((int32_t *) b->data)[2] = mode;
6799
+ ((int32_t *) b->data)[3] = n_ctx;
6623
6800
 
6624
6801
  ggml_scratch_load(ctx);
6625
6802
 
@@ -6636,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
6636
6813
  struct ggml_tensor * a,
6637
6814
  int n_past,
6638
6815
  int n_dims,
6639
- int mode) {
6640
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
6816
+ int mode,
6817
+ int n_ctx) {
6818
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
6641
6819
  }
6642
6820
 
6643
6821
  struct ggml_tensor * ggml_rope_inplace(
@@ -6645,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
6645
6823
  struct ggml_tensor * a,
6646
6824
  int n_past,
6647
6825
  int n_dims,
6648
- int mode) {
6649
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
6826
+ int mode,
6827
+ int n_ctx) {
6828
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
6650
6829
  }
6651
6830
 
6652
6831
  // ggml_rope_back
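
Because ggml_rope() and ggml_rope_inplace() now take a trailing n_ctx argument (stored as a fourth int32 in the parameter tensor), every caller has to be updated. A hedged sketch of an updated call site, with argument names borrowed from typical llama.cpp usage:

    #include "ggml.h"

    // Apply RoPE to an activation tensor.
    // 0.2.x signature was ggml_rope_inplace(ctx, cur, n_past, n_rot, mode);
    // 0.3.x adds n_ctx, which is only consulted by the GLM mode bit (mode & 4).
    static struct ggml_tensor * apply_rope(struct ggml_context * ctx,
                                           struct ggml_tensor * cur,
                                           int n_past, int n_rot, int n_ctx) {
        return ggml_rope_inplace(ctx, cur, n_past, n_rot, /*mode =*/ 0, n_ctx);
    }
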
@@ -7063,9 +7242,14 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7063
7242
  is_node = true;
7064
7243
  }
7065
7244
 
7245
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7246
+
7247
+ ggml_scratch_save(ctx);
7248
+
7066
7249
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7067
7250
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7068
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7251
+
7252
+ ggml_scratch_load(ctx);
7069
7253
 
7070
7254
  result->op = GGML_OP_MAP_UNARY;
7071
7255
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7105,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7105
7289
  is_node = true;
7106
7290
  }
7107
7291
 
7292
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7293
+
7294
+ ggml_scratch_save(ctx);
7295
+
7108
7296
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7109
7297
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7110
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7298
+
7299
+ ggml_scratch_load(ctx);
7111
7300
 
7112
7301
  result->op = GGML_OP_MAP_BINARY;
7113
7302
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7134,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7134
7323
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
7135
7324
  }
7136
7325
 
7326
+ // ggml_map_custom1
7327
+
7328
+ struct ggml_tensor * ggml_map_custom1_impl_f32(
7329
+ struct ggml_context * ctx,
7330
+ struct ggml_tensor * a,
7331
+ const ggml_custom1_op_f32_t fun,
7332
+ bool inplace) {
7333
+ bool is_node = false;
7334
+
7335
+ if (!inplace && a->grad) {
7336
+ is_node = true;
7337
+ }
7338
+
7339
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7340
+
7341
+ ggml_scratch_save(ctx);
7342
+
7343
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7344
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7345
+
7346
+ ggml_scratch_load(ctx);
7347
+
7348
+ result->op = GGML_OP_MAP_CUSTOM1;
7349
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7350
+ result->src0 = a;
7351
+ result->opt[0] = addr_tensor;
7352
+
7353
+ return result;
7354
+ }
7355
+
7356
+ struct ggml_tensor * ggml_map_custom1_f32(
7357
+ struct ggml_context * ctx,
7358
+ struct ggml_tensor * a,
7359
+ const ggml_custom1_op_f32_t fun) {
7360
+ return ggml_map_custom1_impl_f32(ctx, a, fun, false);
7361
+ }
7362
+
7363
+ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7364
+ struct ggml_context * ctx,
7365
+ struct ggml_tensor * a,
7366
+ const ggml_custom1_op_f32_t fun) {
7367
+ return ggml_map_custom1_impl_f32(ctx, a, fun, true);
7368
+ }
7369
+
7370
+ // ggml_map_custom2
7371
+
7372
+ struct ggml_tensor * ggml_map_custom2_impl_f32(
7373
+ struct ggml_context * ctx,
7374
+ struct ggml_tensor * a,
7375
+ struct ggml_tensor * b,
7376
+ const ggml_custom2_op_f32_t fun,
7377
+ bool inplace) {
7378
+ bool is_node = false;
7379
+
7380
+ if (!inplace && (a->grad || b->grad)) {
7381
+ is_node = true;
7382
+ }
7383
+
7384
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7385
+
7386
+ ggml_scratch_save(ctx);
7387
+
7388
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7389
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7390
+
7391
+ ggml_scratch_load(ctx);
7392
+
7393
+ result->op = GGML_OP_MAP_CUSTOM2;
7394
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7395
+ result->src0 = a;
7396
+ result->src1 = b;
7397
+ result->opt[0] = addr_tensor;
7398
+
7399
+ return result;
7400
+ }
7401
+
7402
+ struct ggml_tensor * ggml_map_custom2_f32(
7403
+ struct ggml_context * ctx,
7404
+ struct ggml_tensor * a,
7405
+ struct ggml_tensor * b,
7406
+ const ggml_custom2_op_f32_t fun) {
7407
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
7408
+ }
7409
+
7410
+ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7411
+ struct ggml_context * ctx,
7412
+ struct ggml_tensor * a,
7413
+ struct ggml_tensor * b,
7414
+ const ggml_custom2_op_f32_t fun) {
7415
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
7416
+ }
7417
+
7418
+ // ggml_map_custom3
7419
+
7420
+ struct ggml_tensor * ggml_map_custom3_impl_f32(
7421
+ struct ggml_context * ctx,
7422
+ struct ggml_tensor * a,
7423
+ struct ggml_tensor * b,
7424
+ struct ggml_tensor * c,
7425
+ const ggml_custom3_op_f32_t fun,
7426
+ bool inplace) {
7427
+ bool is_node = false;
7428
+
7429
+ if (!inplace && (a->grad || b->grad || c->grad)) {
7430
+ is_node = true;
7431
+ }
7432
+
7433
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7434
+
7435
+ ggml_scratch_save(ctx);
7436
+
7437
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7438
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7439
+
7440
+ ggml_scratch_load(ctx);
7441
+
7442
+ result->op = GGML_OP_MAP_CUSTOM3;
7443
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7444
+ result->src0 = a;
7445
+ result->src1 = b;
7446
+ result->opt[0] = addr_tensor;
7447
+ result->opt[1] = c;
7448
+
7449
+ return result;
7450
+ }
7451
+
7452
+ struct ggml_tensor * ggml_map_custom3_f32(
7453
+ struct ggml_context * ctx,
7454
+ struct ggml_tensor * a,
7455
+ struct ggml_tensor * b,
7456
+ struct ggml_tensor * c,
7457
+ const ggml_custom3_op_f32_t fun) {
7458
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
7459
+ }
7460
+
7461
+ struct ggml_tensor * ggml_map_custom3_inplace_f32(
7462
+ struct ggml_context * ctx,
7463
+ struct ggml_tensor * a,
7464
+ struct ggml_tensor * b,
7465
+ struct ggml_tensor * c,
7466
+ const ggml_custom3_op_f32_t fun) {
7467
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
7468
+ }
7469
+
7137
7470
  // ggml_cross_entropy_loss
7138
7471
 
7139
7472
  struct ggml_tensor * ggml_cross_entropy_loss(
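
The new ggml_map_custom1/2/3_f32 builders let user code splice an arbitrary float-tensor function into the graph; the function pointer is stashed in a small I32 tensor and invoked single-threaded during the compute pass. A hedged sketch of a custom unary op, assuming the ggml_custom1_op_f32_t callback takes the destination first and the source second, as the forward code later in this diff implies:

    #include "ggml.h"

    // custom op: dst = x * x, element-wise over flat float data
    // (assumes contiguous F32 tensors, as enforced by the F32-only dispatch).
    static void square_f32(struct ggml_tensor * dst, const struct ggml_tensor * src) {
        const float * x = (const float *) src->data;
        float       * y = (float *) dst->data;
        const int64_t n = ggml_nelements(src);
        for (int64_t i = 0; i < n; ++i) {
            y[i] = x[i] * x[i];
        }
    }

    // usage sketch: build the node, then evaluate the graph as usual
    static struct ggml_tensor * build_square(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_map_custom1_f32(ctx, a, square_f32);
    }
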
@@ -12111,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
12111
12444
  const struct ggml_tensor * src1,
12112
12445
  struct ggml_tensor * dst) {
12113
12446
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
12114
- GGML_ASSERT(ggml_nelements(src1) == 3);
12447
+ GGML_ASSERT(ggml_nelements(src1) == 4);
12115
12448
 
12116
12449
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12117
12450
  return;
@@ -12120,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
12120
12453
  const int n_past = ((int32_t *) src1->data)[0];
12121
12454
  const int n_dims = ((int32_t *) src1->data)[1];
12122
12455
  const int mode = ((int32_t *) src1->data)[2];
12456
+ const int n_ctx = ((int32_t *) src1->data)[3];
12123
12457
 
12124
12458
  assert(n_past >= 0);
12125
12459
 
@@ -12164,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
12164
12498
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
12165
12499
 
12166
12500
  const bool is_neox = mode & 2;
12501
+ const bool is_glm = mode & 4;
12167
12502
 
12168
12503
  for (int64_t i3 = 0; i3 < ne3; i3++) {
12169
12504
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12174,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
12174
12509
 
12175
12510
  float theta = (float)p;
12176
12511
 
12177
- if (!is_neox) {
12512
+ if (is_glm) {
12513
+ theta = MIN(p, n_ctx - 2);
12514
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12515
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12516
+ const float cos_theta = cosf(theta);
12517
+ const float sin_theta = sinf(theta);
12518
+ const float cos_block_theta = cosf(block_theta);
12519
+ const float sin_block_theta = sinf(block_theta);
12520
+
12521
+ theta *= theta_scale;
12522
+ block_theta *= theta_scale;
12523
+
12524
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12525
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12526
+
12527
+ const float x0 = src[0];
12528
+ const float x1 = src[n_dims/2];
12529
+ const float x2 = src[n_dims];
12530
+ const float x3 = src[n_dims/2*3];
12531
+
12532
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
12533
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
12534
+ dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
12535
+ dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
12536
+ }
12537
+ } else if (!is_neox) {
12178
12538
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12179
12539
  const float cos_theta = cosf(theta);
12180
12540
  const float sin_theta = sinf(theta);
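
The mode argument acts as a bit field: bit 0 starts the row loop at n_past, bit 1 selects the neox rotation layout, and the new bit 2 selects the GLM/ChatGLM variant that also consults n_ctx. A small sketch of that dispatch order, using made-up constant names (ggml itself just tests the bits inline):

    #include <stdio.h>

    enum {
        ROPE_MODE_SKIP_PAST = 1 << 0,  // row loop starts at n_past instead of 0
        ROPE_MODE_NEOX      = 1 << 1,  // "neox" style rotation layout
        ROPE_MODE_GLM       = 1 << 2,  // new: ChatGLM blocked rotation, uses n_ctx
    };

    static const char * rope_variant(int mode) {
        // mirrors the dispatch order in ggml_compute_forward_rope_*:
        // GLM first, then neox, then the default interleaved rotation
        if (mode & ROPE_MODE_GLM)  return "glm";
        if (mode & ROPE_MODE_NEOX) return "neox";
        return "default";
    }

    int main(void) {
        printf("mode 0 -> %s, mode 2 -> %s, mode 4 -> %s\n",
               rope_variant(0), rope_variant(2), rope_variant(4));
        return 0;
    }
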
@@ -12224,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
12224
12584
  const struct ggml_tensor * src1,
12225
12585
  struct ggml_tensor * dst) {
12226
12586
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
12227
- GGML_ASSERT(ggml_nelements(src1) == 3);
12587
+ GGML_ASSERT(ggml_nelements(src1) == 4);
12228
12588
 
12229
12589
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12230
12590
  return;
@@ -12233,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
12233
12593
  const int n_past = ((int32_t *) src1->data)[0];
12234
12594
  const int n_dims = ((int32_t *) src1->data)[1];
12235
12595
  const int mode = ((int32_t *) src1->data)[2];
12596
+ const int n_ctx = ((int32_t *) src1->data)[3];
12236
12597
 
12237
12598
  assert(n_past >= 0);
12238
12599
 
@@ -12277,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
12277
12638
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
12278
12639
 
12279
12640
  const bool is_neox = mode & 2;
12641
+ const bool is_glm = mode & 4;
12280
12642
 
12281
12643
  for (int64_t i3 = 0; i3 < ne3; i3++) {
12282
12644
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12287,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
12287
12649
 
12288
12650
  float theta = (float)p;
12289
12651
 
12290
- if (!is_neox) {
12652
+ if (is_glm) {
12653
+ theta = MIN(p, n_ctx - 2);
12654
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12655
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12656
+ const float cos_theta = cosf(theta);
12657
+ const float sin_theta = sinf(theta);
12658
+ const float cos_block_theta = cosf(block_theta);
12659
+ const float sin_block_theta = sinf(block_theta);
12660
+
12661
+ theta *= theta_scale;
12662
+ block_theta *= theta_scale;
12663
+
12664
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12665
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12666
+
12667
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
12668
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
12669
+ const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
12670
+ const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
12671
+
12672
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12673
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12674
+ dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
12675
+ dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
12676
+ }
12677
+ } else if (!is_neox) {
12291
12678
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12292
12679
  const float cos_theta = cosf(theta);
12293
12680
  const float sin_theta = sinf(theta);
@@ -13179,8 +13566,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13179
13566
  const int nk1 = ne01;
13180
13567
 
13181
13568
  // size of the convolution row - the kernel size unrolled across all channels
13182
- // round-up so it is more suitable for SIMD
13183
- const int ew0 = ggml_up32(nk0*nk1*ne02);
13569
+ const int ew0 = nk0*nk1*ne02;
13184
13570
 
13185
13571
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13186
13572
  GGML_ASSERT(nb10 == sizeof(float));
@@ -14590,6 +14976,114 @@ static void ggml_compute_forward_map_binary(
14590
14976
  }
14591
14977
  }
14592
14978
 
14979
+ // ggml_compute_forward_map_custom1
14980
+
14981
+ static void ggml_compute_forward_map_custom1_f32(
14982
+ const struct ggml_compute_params * params,
14983
+ const struct ggml_tensor * a,
14984
+ struct ggml_tensor * dst,
14985
+ const ggml_custom1_op_f32_t fun) {
14986
+ assert(params->ith == 0);
14987
+
14988
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14989
+ return;
14990
+ }
14991
+
14992
+ fun(dst, a);
14993
+ }
14994
+
14995
+
14996
+ static void ggml_compute_forward_map_custom1(
14997
+ const struct ggml_compute_params * params,
14998
+ const struct ggml_tensor * a,
14999
+ struct ggml_tensor * dst,
15000
+ const ggml_custom1_op_f32_t fun) {
15001
+ switch (a->type) {
15002
+ case GGML_TYPE_F32:
15003
+ {
15004
+ ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
15005
+ } break;
15006
+ default:
15007
+ {
15008
+ GGML_ASSERT(false);
15009
+ } break;
15010
+ }
15011
+ }
15012
+
15013
+ // ggml_compute_forward_map_custom2
15014
+
15015
+ static void ggml_compute_forward_map_custom2_f32(
15016
+ const struct ggml_compute_params * params,
15017
+ const struct ggml_tensor * a,
15018
+ const struct ggml_tensor * b,
15019
+ struct ggml_tensor * dst,
15020
+ const ggml_custom2_op_f32_t fun) {
15021
+ assert(params->ith == 0);
15022
+
15023
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15024
+ return;
15025
+ }
15026
+
15027
+ fun(dst, a, b);
15028
+ }
15029
+
15030
+
15031
+ static void ggml_compute_forward_map_custom2(
15032
+ const struct ggml_compute_params * params,
15033
+ const struct ggml_tensor * a,
15034
+ const struct ggml_tensor * b,
15035
+ struct ggml_tensor * dst,
15036
+ const ggml_custom2_op_f32_t fun) {
15037
+ switch (a->type) {
15038
+ case GGML_TYPE_F32:
15039
+ {
15040
+ ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
15041
+ } break;
15042
+ default:
15043
+ {
15044
+ GGML_ASSERT(false);
15045
+ } break;
15046
+ }
15047
+ }
15048
+
15049
+ // ggml_compute_forward_map_custom3
15050
+
15051
+ static void ggml_compute_forward_map_custom3_f32(
15052
+ const struct ggml_compute_params * params,
15053
+ const struct ggml_tensor * a,
15054
+ const struct ggml_tensor * b,
15055
+ const struct ggml_tensor * c,
15056
+ struct ggml_tensor * dst,
15057
+ const ggml_custom3_op_f32_t fun) {
15058
+ assert(params->ith == 0);
15059
+
15060
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15061
+ return;
15062
+ }
15063
+
15064
+ fun(dst, a, b, c);
15065
+ }
15066
+
15067
+
15068
+ static void ggml_compute_forward_map_custom3(
15069
+ const struct ggml_compute_params * params,
15070
+ const struct ggml_tensor * a,
15071
+ const struct ggml_tensor * b,
15072
+ const struct ggml_tensor * c,
15073
+ struct ggml_tensor * dst,
15074
+ const ggml_custom3_op_f32_t fun) {
15075
+ switch (a->type) {
15076
+ case GGML_TYPE_F32:
15077
+ {
15078
+ ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
15079
+ } break;
15080
+ default:
15081
+ {
15082
+ GGML_ASSERT(false);
15083
+ } break;
15084
+ }
15085
+ }
15086
+
14593
15087
  // ggml_compute_forward_cross_entropy_loss
14594
15088
 
14595
15089
  static void ggml_compute_forward_cross_entropy_loss_f32(
@@ -14880,7 +15374,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14880
15374
  if (skip_cpu) {
14881
15375
  return;
14882
15376
  }
14883
- GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
15377
+ GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
14884
15378
  GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
14885
15379
  #endif // GGML_USE_CUBLAS
14886
15380
 
@@ -15127,6 +15621,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15127
15621
  ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
15128
15622
  }
15129
15623
  break;
15624
+ case GGML_OP_MAP_CUSTOM1:
15625
+ {
15626
+ const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
15627
+ ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
15628
+ }
15629
+ break;
15630
+ case GGML_OP_MAP_CUSTOM2:
15631
+ {
15632
+ const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
15633
+ ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
15634
+ }
15635
+ break;
15636
+ case GGML_OP_MAP_CUSTOM3:
15637
+ {
15638
+ const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
15639
+ ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
15640
+ }
15641
+ break;
15130
15642
  case GGML_OP_CROSS_ENTROPY_LOSS:
15131
15643
  {
15132
15644
  ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -15735,17 +16247,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15735
16247
  {
15736
16248
  if (src0->grad) {
15737
16249
  assert(src1->type == GGML_TYPE_I32);
15738
- assert(ggml_nelements(src1) == 3);
16250
+ assert(ggml_nelements(src1) == 4);
15739
16251
  const int n_past = ((int32_t *) src1->data)[0];
15740
16252
  const int n_dims = ((int32_t *) src1->data)[1];
15741
16253
  const int mode = ((int32_t *) src1->data)[2];
16254
+ const int n_ctx = ((int32_t *) src1->data)[3];
15742
16255
  src0->grad = ggml_add_impl(ctx,
15743
16256
  src0->grad,
15744
16257
  ggml_rope(ctx,
15745
16258
  tensor->grad,
15746
16259
  n_past,
15747
16260
  n_dims,
15748
- mode),
16261
+ mode,
16262
+ n_ctx),
15749
16263
  inplace);
15750
16264
  }
15751
16265
  if (src1->grad) {
@@ -15933,6 +16447,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15933
16447
  case GGML_OP_WIN_UNPART:
15934
16448
  case GGML_OP_MAP_UNARY:
15935
16449
  case GGML_OP_MAP_BINARY:
16450
+ case GGML_OP_MAP_CUSTOM1:
16451
+ case GGML_OP_MAP_CUSTOM2:
16452
+ case GGML_OP_MAP_CUSTOM3:
15936
16453
  {
15937
16454
  GGML_ASSERT(false); // not supported
15938
16455
  } break;
@@ -16004,7 +16521,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
16004
16521
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
16005
16522
 
16006
16523
  if (strlen(node->name) == 0) {
16007
- snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
16524
+ ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
16008
16525
  }
16009
16526
 
16010
16527
  cgraph->leafs[cgraph->n_leafs] = node;
@@ -16013,7 +16530,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
16013
16530
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
16014
16531
 
16015
16532
  if (strlen(node->name) == 0) {
16016
- snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
16533
+ ggml_format_name(node, "node_%d", cgraph->n_nodes);
16017
16534
  }
16018
16535
 
16019
16536
  cgraph->nodes[cgraph->n_nodes] = node;
@@ -16167,68 +16684,173 @@ typedef pthread_t ggml_thread_t;
16167
16684
 
16168
16685
  #endif
16169
16686
 
16687
+ // Android's libc implementation "bionic" does not support setting affinity
16688
+ #if defined(__linux__) && !defined(__BIONIC__)
16689
+ void set_numa_thread_affinity(int thread_n, int n_threads) {
16690
+ if (!ggml_is_numa()) {
16691
+ return;
16692
+ }
16693
+
16694
+ // run thread on node_num thread_n / (threads per node)
16695
+ const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
16696
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
16697
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16698
+
16699
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16700
+ CPU_ZERO_S(setsize, cpus);
16701
+ for (size_t i = 0; i < node->n_cpus; ++i) {
16702
+ CPU_SET_S(node->cpus[i], setsize, cpus);
16703
+ }
16704
+
16705
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16706
+ if (rv) {
16707
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16708
+ strerror(rv));
16709
+ }
16710
+
16711
+ CPU_FREE(cpus);
16712
+ }
16713
+
16714
+ void clear_numa_thread_affinity(void) {
16715
+ if (!ggml_is_numa()) {
16716
+ return;
16717
+ }
16718
+
16719
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16720
+
16721
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16722
+ CPU_ZERO_S(setsize, cpus);
16723
+ for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
16724
+ CPU_SET_S(i, setsize, cpus);
16725
+ }
16726
+
16727
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16728
+ if (rv) {
16729
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16730
+ strerror(rv));
16731
+ }
16732
+
16733
+ CPU_FREE(cpus);
16734
+ }
16735
+ #else
16736
+ // TODO: Windows etc.
16737
+ // (the linux implementation may also work on BSD, someone should test)
16738
+ void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16739
+ void clear_numa_thread_affinity(void) {}
16740
+ #endif
16741
+
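
set_numa_thread_affinity() pins consecutive worker threads to NUMA nodes in equal blocks: node = thread / ceil(n_threads / n_nodes). With 8 threads on 2 nodes, for example, threads 0-3 land on node 0 and threads 4-7 on node 1. A standalone check of that arithmetic:

    #include <stdio.h>

    int main(void) {
        const int n_threads = 8;
        const int n_nodes   = 2;
        for (int thread_n = 0; thread_n < n_threads; ++thread_n) {
            // same integer arithmetic as set_numa_thread_affinity()
            const int node_num = thread_n / ((n_threads + n_nodes - 1) / n_nodes);
            printf("thread %d -> node %d\n", thread_n, node_num);
        }
        return 0;
    }
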
16170
16742
  struct ggml_compute_state_shared {
16171
- ggml_lock_t spin;
16743
+ struct ggml_cgraph * cgraph;
16744
+
16745
+ int64_t perf_node_start_cycles;
16746
+ int64_t perf_node_start_time_us;
16172
16747
 
16173
16748
  int n_threads;
16174
16749
 
16175
16750
  // synchronization primitives
16176
- atomic_int n_ready;
16177
- atomic_bool has_work;
16178
- atomic_bool stop; // stop all threads
16751
+ atomic_int n_active; // num active threads
16752
+ atomic_int node_n; // active graph node
16179
16753
  };
16180
16754
 
16181
16755
  struct ggml_compute_state {
16182
16756
  ggml_thread_t thrd;
16183
-
16184
- struct ggml_compute_params params;
16185
- struct ggml_tensor * node;
16186
-
16757
+ int ith;
16187
16758
  struct ggml_compute_state_shared * shared;
16188
16759
  };
16189
16760
 
16761
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16762
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16763
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
16764
+
16765
+ node->perf_runs++;
16766
+ node->perf_cycles += cycles_cur;
16767
+ node->perf_time_us += time_us_cur;
16768
+ }
16769
+
16190
16770
  static thread_ret_t ggml_graph_compute_thread(void * data) {
16191
16771
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
16772
+ struct ggml_cgraph * cgraph = state->shared->cgraph;
16192
16773
 
16193
16774
  const int n_threads = state->shared->n_threads;
16775
+ set_numa_thread_affinity(state->ith, n_threads);
16776
+
16777
+ int node_n = -1;
16194
16778
 
16195
16779
  while (true) {
16196
- if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
16197
- atomic_store(&state->shared->has_work, false);
16198
- } else {
16199
- while (atomic_load(&state->shared->has_work)) {
16200
- if (atomic_load(&state->shared->stop)) {
16201
- return 0;
16202
- }
16203
- ggml_lock_lock (&state->shared->spin);
16204
- ggml_lock_unlock(&state->shared->spin);
16780
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
16781
+ // all other threads are finished and spinning
16782
+ // do finalize and init here so we don't have synchronize again
16783
+ struct ggml_compute_params params = {
16784
+ /*.type =*/ GGML_TASK_FINALIZE,
16785
+ /*.ith =*/ 0,
16786
+ /*.nth =*/ 0,
16787
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16788
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16789
+ };
16790
+
16791
+ if (node_n != -1) {
16792
+ /* FINALIZE */
16793
+ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16794
+ params.nth = node->n_tasks;
16795
+ ggml_compute_forward(&params, node);
16796
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16205
16797
  }
16206
- }
16207
16798
 
16208
- atomic_fetch_sub(&state->shared->n_ready, 1);
16799
+ // distribute new work or execute it direct if 1T
16800
+ while (++node_n < cgraph->n_nodes) {
16801
+ GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16802
+
16803
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16804
+
16805
+ state->shared->perf_node_start_cycles = ggml_perf_cycles();
16806
+ state->shared->perf_node_start_time_us = ggml_perf_time_us();
16209
16807
 
16210
- // wait for work
16211
- while (!atomic_load(&state->shared->has_work)) {
16212
- if (atomic_load(&state->shared->stop)) {
16213
- return 0;
16808
+ /* INIT */
16809
+ params.type = GGML_TASK_INIT;
16810
+ params.nth = node->n_tasks;
16811
+ ggml_compute_forward(&params, node);
16812
+
16813
+ if (node->n_tasks == 1) {
16814
+ // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16815
+ // they do something more efficient than spinning (?)
16816
+ params.type = GGML_TASK_COMPUTE;
16817
+ ggml_compute_forward(&params, node);
16818
+
16819
+ params.type = GGML_TASK_FINALIZE;
16820
+ ggml_compute_forward(&params, node);
16821
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16822
+ } else {
16823
+ break;
16824
+ }
16214
16825
  }
16215
- ggml_lock_lock (&state->shared->spin);
16216
- ggml_lock_unlock(&state->shared->spin);
16826
+
16827
+ atomic_store(&state->shared->n_active, n_threads);
16828
+ atomic_store(&state->shared->node_n, node_n);
16829
+ } else {
16830
+ // wait for other threads to finish
16831
+ const int last = node_n;
16832
+ do {
16833
+ sched_yield();
16834
+ node_n = atomic_load(&state->shared->node_n);
16835
+ } while (node_n == last);
16217
16836
  }
16218
16837
 
16219
16838
  // check if we should stop
16220
- if (atomic_load(&state->shared->stop)) {
16221
- break;
16222
- }
16839
+ if (node_n >= cgraph->n_nodes) break;
16223
16840
 
16224
- if (state->node) {
16225
- if (state->params.ith < state->params.nth) {
16226
- ggml_compute_forward(&state->params, state->node);
16227
- }
16841
+ /* COMPUTE */
16842
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16228
16843
 
16229
- state->node = NULL;
16230
- } else {
16231
- break;
16844
+ struct ggml_compute_params params = {
16845
+ /*.type =*/ GGML_TASK_COMPUTE,
16846
+ /*.ith =*/ state->ith,
16847
+ /*.nth =*/ node->n_tasks,
16848
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16849
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16850
+ };
16851
+
16852
+ if (state->ith < node->n_tasks) {
16853
+ ggml_compute_forward(&params, node);
16232
16854
  }
16233
16855
  }
16234
16856
 
@@ -16239,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16239
16861
  const int n_threads = cgraph->n_threads;
16240
16862
 
16241
16863
  struct ggml_compute_state_shared state_shared = {
16242
- /*.spin =*/ GGML_LOCK_INITIALIZER,
16243
- /*.n_threads =*/ n_threads,
16244
- /*.n_ready =*/ 0,
16245
- /*.has_work =*/ false,
16246
- /*.stop =*/ false,
16864
+ /*.cgraph =*/ cgraph,
16865
+ /*.perf_node_start_cycles =*/ 0,
16866
+ /*.perf_node_start_time_us =*/ 0,
16867
+ /*.n_threads =*/ n_threads,
16868
+ /*.n_active =*/ n_threads,
16869
+ /*.node_n =*/ -1,
16247
16870
  };
16248
- struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
16249
-
16250
- // create thread pool
16251
- if (n_threads > 1) {
16252
- ggml_lock_init(&state_shared.spin);
16253
-
16254
- atomic_store(&state_shared.has_work, true);
16255
-
16256
- for (int j = 0; j < n_threads - 1; j++) {
16257
- workers[j] = (struct ggml_compute_state) {
16258
- .thrd = 0,
16259
- .params = {
16260
- .type = GGML_TASK_COMPUTE,
16261
- .ith = j + 1,
16262
- .nth = n_threads,
16263
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16264
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16265
- },
16266
- .node = NULL,
16267
- .shared = &state_shared,
16268
- };
16269
-
16270
- int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
16271
- GGML_ASSERT(rc == 0);
16272
- UNUSED(rc);
16273
- }
16274
- }
16871
+ struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
16275
16872
 
16276
16873
  // initialize tasks + work buffer
16277
16874
  {
@@ -16415,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16415
17012
  } break;
16416
17013
  case GGML_OP_SCALE:
16417
17014
  {
16418
- node->n_tasks = n_threads;
17015
+ node->n_tasks = 1;
16419
17016
  } break;
16420
17017
  case GGML_OP_SET:
16421
17018
  case GGML_OP_CONT:
@@ -16574,6 +17171,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16574
17171
  case GGML_OP_WIN_UNPART:
16575
17172
  case GGML_OP_MAP_UNARY:
16576
17173
  case GGML_OP_MAP_BINARY:
17174
+ case GGML_OP_MAP_CUSTOM1:
17175
+ case GGML_OP_MAP_CUSTOM2:
17176
+ case GGML_OP_MAP_CUSTOM3:
16577
17177
  {
16578
17178
  node->n_tasks = 1;
16579
17179
  } break;
@@ -16616,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16616
17216
  }
16617
17217
  }
16618
17218
 
16619
- const int64_t perf_start_cycles = ggml_perf_cycles();
16620
- const int64_t perf_start_time_us = ggml_perf_time_us();
16621
-
16622
- for (int i = 0; i < cgraph->n_nodes; i++) {
16623
- GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
16624
-
16625
- struct ggml_tensor * node = cgraph->nodes[i];
16626
-
16627
- // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
16628
- //if (node->grad == NULL && node->perf_runs > 0) {
16629
- // continue;
16630
- //}
16631
-
16632
- const int64_t perf_node_start_cycles = ggml_perf_cycles();
16633
- const int64_t perf_node_start_time_us = ggml_perf_time_us();
16634
-
16635
- // INIT
16636
- struct ggml_compute_params params = {
16637
- /*.type =*/ GGML_TASK_INIT,
16638
- /*.ith =*/ 0,
16639
- /*.nth =*/ node->n_tasks,
16640
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16641
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16642
- };
16643
-
16644
- ggml_compute_forward(&params, node);
16645
-
16646
- // COMPUTE
16647
- if (node->n_tasks > 1) {
16648
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16649
- atomic_store(&state_shared.has_work, false);
16650
- }
16651
-
16652
- while (atomic_load(&state_shared.has_work)) {
16653
- ggml_lock_lock (&state_shared.spin);
16654
- ggml_lock_unlock(&state_shared.spin);
16655
- }
16656
-
16657
- // launch thread pool
16658
- for (int j = 0; j < n_threads - 1; j++) {
16659
- workers[j].params = (struct ggml_compute_params) {
16660
- .type = GGML_TASK_COMPUTE,
16661
- .ith = j + 1,
16662
- .nth = node->n_tasks,
16663
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16664
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16665
- };
16666
- workers[j].node = node;
16667
- }
16668
-
16669
- atomic_fetch_sub(&state_shared.n_ready, 1);
16670
-
16671
- while (atomic_load(&state_shared.n_ready) > 0) {
16672
- ggml_lock_lock (&state_shared.spin);
16673
- ggml_lock_unlock(&state_shared.spin);
16674
- }
16675
-
16676
- atomic_store(&state_shared.has_work, true);
16677
- }
16678
-
16679
- params.type = GGML_TASK_COMPUTE;
16680
- ggml_compute_forward(&params, node);
16681
-
16682
- // wait for thread pool
16683
- if (node->n_tasks > 1) {
16684
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16685
- atomic_store(&state_shared.has_work, false);
16686
- }
16687
-
16688
- while (atomic_load(&state_shared.has_work)) {
16689
- ggml_lock_lock (&state_shared.spin);
16690
- ggml_lock_unlock(&state_shared.spin);
16691
- }
16692
-
16693
- atomic_fetch_sub(&state_shared.n_ready, 1);
16694
-
16695
- while (atomic_load(&state_shared.n_ready) != 0) {
16696
- ggml_lock_lock (&state_shared.spin);
16697
- ggml_lock_unlock(&state_shared.spin);
16698
- }
16699
- }
16700
-
16701
- // FINALIZE
16702
- if (node->n_tasks > 1) {
16703
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16704
- atomic_store(&state_shared.has_work, false);
16705
- }
16706
-
16707
- while (atomic_load(&state_shared.has_work)) {
16708
- ggml_lock_lock (&state_shared.spin);
16709
- ggml_lock_unlock(&state_shared.spin);
16710
- }
16711
-
16712
- // launch thread pool
16713
- for (int j = 0; j < n_threads - 1; j++) {
16714
- workers[j].params = (struct ggml_compute_params) {
16715
- .type = GGML_TASK_FINALIZE,
16716
- .ith = j + 1,
16717
- .nth = node->n_tasks,
16718
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16719
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16720
- };
16721
- workers[j].node = node;
16722
- }
16723
-
16724
- atomic_fetch_sub(&state_shared.n_ready, 1);
16725
-
16726
- while (atomic_load(&state_shared.n_ready) > 0) {
16727
- ggml_lock_lock (&state_shared.spin);
16728
- ggml_lock_unlock(&state_shared.spin);
16729
- }
17219
+ // create thread pool
17220
+ if (n_threads > 1) {
17221
+ for (int j = 1; j < n_threads; ++j) {
17222
+ workers[j] = (struct ggml_compute_state) {
17223
+ .thrd = 0,
17224
+ .ith = j,
17225
+ .shared = &state_shared,
17226
+ };
16730
17227
 
16731
- atomic_store(&state_shared.has_work, true);
17228
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
17229
+ GGML_ASSERT(rc == 0);
16732
17230
  }
17231
+ }
17232
+ workers[0].ith = 0;
17233
+ workers[0].shared = &state_shared;
16733
17234
 
16734
- params.type = GGML_TASK_FINALIZE;
16735
- ggml_compute_forward(&params, node);
16736
-
16737
- // wait for thread pool
16738
- if (node->n_tasks > 1) {
16739
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16740
- atomic_store(&state_shared.has_work, false);
16741
- }
16742
-
16743
- while (atomic_load(&state_shared.has_work)) {
16744
- ggml_lock_lock (&state_shared.spin);
16745
- ggml_lock_unlock(&state_shared.spin);
16746
- }
16747
-
16748
- atomic_fetch_sub(&state_shared.n_ready, 1);
16749
-
16750
- while (atomic_load(&state_shared.n_ready) != 0) {
16751
- ggml_lock_lock (&state_shared.spin);
16752
- ggml_lock_unlock(&state_shared.spin);
16753
- }
16754
- }
17235
+ const int64_t perf_start_cycles = ggml_perf_cycles();
17236
+ const int64_t perf_start_time_us = ggml_perf_time_us();
16755
17237
 
16756
- // performance stats (node)
16757
- {
16758
- int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
16759
- int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
17238
+ // this is a work thread too
17239
+ ggml_graph_compute_thread(&workers[0]);
16760
17240
 
16761
- node->perf_runs++;
16762
- node->perf_cycles += perf_cycles_cur;
16763
- node->perf_time_us += perf_time_us_cur;
16764
- }
16765
- }
17241
+ // don't leave affinity set on the main thread
17242
+ clear_numa_thread_affinity();
16766
17243
 
16767
17244
  // join thread pool
16768
17245
  if (n_threads > 1) {
16769
- atomic_store(&state_shared.stop, true);
16770
- atomic_store(&state_shared.has_work, true);
16771
-
16772
- for (int j = 0; j < n_threads - 1; j++) {
16773
- int rc = ggml_thread_join(workers[j].thrd, NULL);
17246
+ for (int j = 1; j < n_threads; j++) {
17247
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
16774
17248
  GGML_ASSERT(rc == 0);
16775
- UNUSED(rc);
16776
17249
  }
16777
-
16778
- ggml_lock_destroy(&state_shared.spin);
16779
17250
  }
16780
17251
 
16781
17252
  // performance stats (graph)
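
ggml_graph_compute() still reads the thread count from cgraph->n_threads; what changes in 0.3.x is that the calling thread now enters the same worker loop as the pool instead of orchestrating it node by node. A hedged sketch of a call site against this era of the ggml API (sizes and values are arbitrary):

    #include "ggml.h"

    // Build and run a tiny graph: c = a * b, evaluated with 4 threads.
    static void run_small_graph(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16u*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 2.0f);
        struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 3.0f);
        struct ggml_tensor * c = ggml_mul(ctx, a, b);

        struct ggml_cgraph gf = ggml_build_forward(c);
        gf.n_threads = 4;                 // the thread count lives on the graph in this API
        ggml_graph_compute(ctx, &gf);

        ggml_free(ctx);
    }
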
@@ -17397,6 +17868,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
17397
17868
  return NULL;
17398
17869
  }
17399
17870
 
17871
+ static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17872
+ struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
17873
+ struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
17874
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
17875
+ gparent0 ? (void *) gparent0 : (void *) parent,
17876
+ gparent0 ? "g" : "x",
17877
+ gparent ? (void *) gparent : (void *) node,
17878
+ gparent ? "g" : "x",
17879
+ gparent ? "empty" : "vee",
17880
+ gparent ? "dashed" : "solid",
17881
+ label);
17882
+ }
17883
+
17884
+ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17885
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
17886
+ (void *) parent, "x",
17887
+ (void *) node, "x",
17888
+ label);
17889
+ }
17890
+
17400
17891
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
17401
17892
  char color[16];
17402
17893
 
@@ -17432,7 +17923,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17432
17923
  (void *) node, color);
17433
17924
 
17434
17925
  if (strlen(node->name) > 0) {
17435
- fprintf(fp, "%s |", node->name);
17926
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17927
+ } else {
17928
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
17436
17929
  }
17437
17930
 
17438
17931
  if (node->n_dims == 2) {
@@ -17441,7 +17934,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17441
17934
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
17442
17935
  }
17443
17936
 
17444
-
17445
17937
  if (node->grad) {
17446
17938
  fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
17447
17939
  } else {
@@ -17460,18 +17952,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17460
17952
  (void *) node, color);
17461
17953
 
17462
17954
  if (strlen(node->name) > 0) {
17463
- fprintf(fp, "%s | ", node->name);
17955
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17956
+ } else {
17957
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
17464
17958
  }
17465
- if (ggml_nelements(node) == 1) {
17466
- if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
17467
- fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
17468
- }
17469
- else {
17470
- fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
17959
+
17960
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17961
+ if (ggml_nelements(node) < 5) {
17962
+ fprintf(fp, " | (");
17963
+ for (int j = 0; j < ggml_nelements(node); j++) {
17964
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
17965
+ fprintf(fp, "%d", ggml_get_i32_1d(node, j));
17966
+ }
17967
+ else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
17968
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
17969
+ }
17970
+ else {
17971
+ fprintf(fp, "#");
17972
+ }
17973
+ if (j < ggml_nelements(node) - 1) {
17974
+ fprintf(fp, ", ");
17975
+ }
17471
17976
  }
17472
- }
17473
- else {
17474
- fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17977
+ fprintf(fp, ")");
17475
17978
  }
17476
17979
  fprintf(fp, "\"; ]\n");
17477
17980
  }
@@ -17479,30 +17982,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17479
17982
  for (int i = 0; i < gb->n_nodes; i++) {
17480
17983
  struct ggml_tensor * node = gb->nodes[i];
17481
17984
 
17482
- struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
17483
-
17484
17985
  if (node->src0) {
17485
- struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
17486
-
17487
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
17488
- parent0 ? (void *) parent0 : (void *) node->src0,
17489
- parent0 ? "g" : "x",
17490
- parent ? (void *) parent : (void *) node,
17491
- parent ? "g" : "x",
17492
- parent ? "empty" : "vee",
17493
- parent ? "dashed" : "solid");
17986
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
17494
17987
  }
17495
17988
 
17496
17989
  if (node->src1) {
17497
- struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
17498
-
17499
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
17500
- parent1 ? (void *) parent1 : (void *) node->src1,
17501
- parent1 ? "g" : "x",
17502
- parent ? (void *) parent : (void *) node,
17503
- parent ? "g" : "x",
17504
- parent ? "empty" : "vee",
17505
- parent ? "dashed" : "solid");
17990
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
17991
+ }
17992
+
17993
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
17994
+ if (node->opt[j]) {
17995
+ char label[16];
17996
+ snprintf(label, sizeof(label), "opt %d", j);
17997
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
17998
+ }
17506
17999
  }
17507
18000
  }
17508
18001
 
@@ -17510,15 +18003,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17510
18003
  struct ggml_tensor * node = gb->leafs[i];
17511
18004
 
17512
18005
  if (node->src0) {
17513
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
17514
- (void *) node->src0, "x",
17515
- (void *) node, "x");
18006
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
17516
18007
  }
17517
18008
 
17518
18009
  if (node->src1) {
17519
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
17520
- (void *) node->src1, "x",
17521
- (void *) node, "x");
18010
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
18011
+ }
18012
+
18013
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
18014
+ if (node->opt[j]) {
18015
+ char label[16];
18016
+ snprintf(label, sizeof(label), "opt %d", j);
18017
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
18018
+ }
17522
18019
  }
17523
18020
  }
17524
18021
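
With these changes the Graphviz dump labels each node with its type, prints small constant leafs in full, and draws edges for opt[] operands as well. A hedged sketch of producing and rendering such a dump; passing NULL for the second graph appears to be accepted and only affects node highlighting:

    #include "ggml.h"

    // Writes the graph to ggml-graph.dot; render it with:
    //   dot -Tpng ggml-graph.dot -o ggml-graph.png
    static void dump_graph(const struct ggml_cgraph * graph) {
        // first argument: the graph to dump; second: an optional graph used only
        // to highlight which nodes it contains (NULL skips that).
        ggml_graph_dump_dot(graph, NULL, "ggml-graph.dot");
    }
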