llama_cpp 0.2.2 → 0.3.1

This diff shows the changes between these two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -1,5 +1,5 @@
1
- // Defines CLOCK_MONOTONIC on Linux
2
- #define _GNU_SOURCE
1
+ #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
2
+ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
3
3
 
4
4
  #include "ggml.h"
5
5
 
@@ -24,6 +24,7 @@
24
24
  #include <stdio.h>
25
25
  #include <float.h>
26
26
  #include <limits.h>
27
+ #include <stdarg.h>
27
28
 
28
29
  #ifdef GGML_USE_METAL
29
30
  #include <unistd.h>
@@ -90,6 +91,11 @@ static int sched_yield (void) {
90
91
  #include <stdatomic.h>
91
92
 
92
93
  typedef void* thread_ret_t;
94
+
95
+ #include <sys/types.h>
96
+ #include <sys/stat.h>
97
+ #include <unistd.h>
98
+
93
99
  #endif
94
100
 
95
101
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -118,6 +124,30 @@ typedef void* thread_ret_t;
118
124
  #define GGML_SOFT_MAX_UNROLL 4
119
125
  #define GGML_VEC_DOT_UNROLL 2
120
126
 
127
+ //
128
+ // logging
129
+ //
130
+
131
+ #if (GGML_DEBUG >= 1)
132
+ #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
133
+ #else
134
+ #define GGML_PRINT_DEBUG(...)
135
+ #endif
136
+
137
+ #if (GGML_DEBUG >= 5)
138
+ #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
139
+ #else
140
+ #define GGML_PRINT_DEBUG_5(...)
141
+ #endif
142
+
143
+ #if (GGML_DEBUG >= 10)
144
+ #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
145
+ #else
146
+ #define GGML_PRINT_DEBUG_10(...)
147
+ #endif
148
+
149
+ #define GGML_PRINT(...) printf(__VA_ARGS__)
150
+
121
151
  #ifdef GGML_USE_ACCELERATE
122
152
  // uncomment to use vDSP for soft max computation
123
153
  // note: not sure if it is actually faster
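For reference, a minimal sketch of how the relocated logging macros behave (illustrative only, not taken from the package sources; assumes ggml.c is built with the debug level set on the compiler command line, e.g. cc -DGGML_DEBUG=5 -c ggml.c):

    GGML_PRINT("always printed\n");                          // plain printf in every build
    GGML_PRINT_DEBUG("printed when GGML_DEBUG >= 1\n");
    GGML_PRINT_DEBUG_5("printed when GGML_DEBUG >= 5\n");
    GGML_PRINT_DEBUG_10("printed when GGML_DEBUG >= 10\n");  // macros above the active level expand to nothing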
@@ -143,6 +201,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
143
201
  #endif
144
202
  if (result != 0) {
145
203
  // Handle allocation failure
204
+ const char *error_desc = "unknown allocation error";
205
+ switch (result) {
206
+ case EINVAL:
207
+ error_desc = "invalid alignment value";
208
+ break;
209
+ case ENOMEM:
210
+ error_desc = "insufficient memory";
211
+ break;
212
+ }
213
+ GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
214
+ __func__, error_desc, size/(1024.0*1024.0));
146
215
  return NULL;
147
216
  }
148
217
  return aligned_memory;
@@ -419,7 +488,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
419
488
  }
420
489
  }
421
490
 
422
-
423
491
  //
424
492
  // timing
425
493
  //
@@ -482,6 +550,7 @@ int64_t ggml_cycles_per_ms(void) {
482
550
  #define ggml_perf_cycles_per_ms() 0
483
551
  #endif
484
552
 
553
+
485
554
  //
486
555
  // cache line
487
556
  //
@@ -3529,30 +3598,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
3529
3598
  *s = 1.f/(*s);
3530
3599
  }
3531
3600
 
3532
- //
3533
- // logging
3534
- //
3535
-
3536
- #if (GGML_DEBUG >= 1)
3537
- #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
3538
- #else
3539
- #define GGML_PRINT_DEBUG(...)
3540
- #endif
3541
-
3542
- #if (GGML_DEBUG >= 5)
3543
- #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
3544
- #else
3545
- #define GGML_PRINT_DEBUG_5(...)
3546
- #endif
3547
-
3548
- #if (GGML_DEBUG >= 10)
3549
- #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
3550
- #else
3551
- #define GGML_PRINT_DEBUG_10(...)
3552
- #endif
3553
-
3554
- #define GGML_PRINT(...) printf(__VA_ARGS__)
3555
-
3556
3601
  //
3557
3602
  // data types
3558
3603
  //
@@ -3712,11 +3757,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3712
3757
  "MAP_UNARY",
3713
3758
  "MAP_BINARY",
3714
3759
 
3760
+ "MAP_CUSTOM1",
3761
+ "MAP_CUSTOM2",
3762
+ "MAP_CUSTOM3",
3763
+
3715
3764
  "CROSS_ENTROPY_LOSS",
3716
3765
  "CROSS_ENTROPY_LOSS_BACK",
3717
3766
  };
3718
3767
 
3719
- static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3768
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3720
3769
 
3721
3770
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3722
3771
  "none",
@@ -3784,11 +3833,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3784
3833
  "f(x)",
3785
3834
  "f(x,y)",
3786
3835
 
3836
+ "custom(x)",
3837
+ "custom(x,y)",
3838
+ "custom(x,y,z)",
3839
+
3787
3840
  "cross_entropy_loss(x,y)",
3788
3841
  "cross_entropy_loss_back(x,y)",
3789
3842
  };
3790
3843
 
3791
- static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3844
+ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
3792
3845
 
3793
3846
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3794
3847
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -3819,12 +3872,31 @@ struct ggml_context_container {
3819
3872
  struct ggml_context context;
3820
3873
  };
3821
3874
 
3875
+ //
3876
+ // NUMA support
3877
+ //
3878
+
3879
+ #define GGML_NUMA_MAX_NODES 8
3880
+ #define GGML_NUMA_MAX_CPUS 512
3881
+
3882
+ struct ggml_numa_node {
3883
+ uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
3884
+ uint32_t n_cpus;
3885
+ };
3886
+
3887
+ struct ggml_numa_nodes {
3888
+ struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
3889
+ uint32_t n_nodes;
3890
+ uint32_t total_cpus; // hardware threads on system
3891
+ };
3892
+
3822
3893
  //
3823
3894
  // ggml state
3824
3895
  //
3825
3896
 
3826
3897
  struct ggml_state {
3827
3898
  struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
3899
+ struct ggml_numa_nodes numa;
3828
3900
  };
3829
3901
 
3830
3902
  // global state
@@ -3849,6 +3921,75 @@ inline static void ggml_critical_section_end(void) {
3849
3921
  atomic_fetch_sub(&g_state_barrier, 1);
3850
3922
  }
3851
3923
 
3924
+ void ggml_numa_init(void) {
3925
+ if (g_state.numa.n_nodes > 0) {
3926
+ fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
3927
+
3928
+ return;
3929
+ }
3930
+
3931
+ #ifdef __linux__
3932
+ struct stat st;
3933
+ char path[256];
3934
+ int rv;
3935
+
3936
+ // enumerate nodes
3937
+ while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
3938
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
3939
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3940
+ if (stat(path, &st) != 0) { break; }
3941
+ ++g_state.numa.n_nodes;
3942
+ }
3943
+
3944
+ // enumerate CPUs
3945
+ while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
3946
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
3947
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3948
+ if (stat(path, &st) != 0) { break; }
3949
+ ++g_state.numa.total_cpus;
3950
+ }
3951
+
3952
+ GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
3953
+
3954
+ if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
3955
+ g_state.numa.n_nodes = 0;
3956
+ return;
3957
+ }
3958
+
3959
+ for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
3960
+ struct ggml_numa_node * node = &g_state.numa.nodes[n];
3961
+ GGML_PRINT_DEBUG("CPUs on node %u:", n);
3962
+ node->n_cpus = 0;
3963
+ for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
3964
+ rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
3965
+ GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
3966
+ if (stat(path, &st) == 0) {
3967
+ node->cpus[node->n_cpus++] = c;
3968
+ GGML_PRINT_DEBUG(" %u", c);
3969
+ }
3970
+ }
3971
+ GGML_PRINT_DEBUG("\n");
3972
+ }
3973
+
3974
+ if (ggml_is_numa()) {
3975
+ FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
3976
+ if (fptr != NULL) {
3977
+ char buf[42];
3978
+ if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
3979
+ GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
3980
+ }
3981
+ fclose(fptr);
3982
+ }
3983
+ }
3984
+ #else
3985
+ // TODO
3986
+ #endif
3987
+ }
3988
+
3989
+ bool ggml_is_numa(void) {
3990
+ return g_state.numa.n_nodes > 1;
3991
+ }
3992
+
3852
3993
  ////////////////////////////////////////////////////////////////////////////////
3853
3994
 
3854
3995
  void ggml_print_object(const struct ggml_object * obj) {
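For reference, a minimal usage sketch of the new NUMA hooks (illustrative only, not taken from the package sources; the host program is hypothetical and assumes the matching ggml_numa_init/ggml_is_numa declarations this release adds to ggml.h):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        ggml_numa_init();   // probe /sys/devices/system/node/node* once, before any graph computation
        if (ggml_is_numa()) {
            printf("multiple NUMA nodes detected; worker threads will be pinned per node\n");
        }
        return 0;
    }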
@@ -4105,6 +4246,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4105
4246
 
4106
4247
  g_state = (struct ggml_state) {
4107
4248
  /*.contexts =*/ { { 0 } },
4249
+ /*.numa =*/ {
4250
+ .n_nodes = 0,
4251
+ .total_cpus = 0,
4252
+ },
4108
4253
  };
4109
4254
 
4110
4255
  for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
@@ -4734,10 +4879,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
4734
4879
  return tensor;
4735
4880
  }
4736
4881
 
4882
+ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
4883
+ va_list args;
4884
+ va_start(args, fmt);
4885
+ vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
4886
+ va_end(args);
4887
+ return tensor;
4888
+ }
4889
+
4737
4890
  struct ggml_tensor * ggml_view_tensor(
4738
4891
  struct ggml_context * ctx,
4739
4892
  const struct ggml_tensor * src) {
4740
4893
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
4894
+ ggml_format_name(result, "%s (view)", src->name);
4741
4895
 
4742
4896
  result->nb[0] = src->nb[0];
4743
4897
  result->nb[1] = src->nb[1];
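For reference, a minimal sketch of the new printf-style naming helper (illustrative only, not taken from the package sources; the layer/weight names are made up):

    #include "ggml.h"

    // label an intermediate tensor so graph dumps and debug output stay readable;
    // the formatted string is truncated by vsnprintf to the tensor's fixed-size name field
    static struct ggml_tensor * build_layer(struct ggml_context * ctx,
                                            struct ggml_tensor * w,
                                            struct ggml_tensor * inp,
                                            int layer) {
        struct ggml_tensor * cur = ggml_mul_mat(ctx, w, inp);
        ggml_format_name(cur, "layer_%d_out", layer);
        return cur;
    }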
@@ -5899,6 +6053,11 @@ struct ggml_tensor * ggml_cpy_impl(
5899
6053
 
5900
6054
  // make a view of the destination
5901
6055
  struct ggml_tensor * result = ggml_view_tensor(ctx, b);
6056
+ if (strlen(b->name) > 0) {
6057
+ ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
6058
+ } else {
6059
+ ggml_format_name(result, "%s (copy)", a->name);
6060
+ }
5902
6061
 
5903
6062
  result->op = GGML_OP_CPY;
5904
6063
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5935,6 +6094,7 @@ struct ggml_tensor * ggml_cont_impl(
5935
6094
  }
5936
6095
 
5937
6096
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6097
+ ggml_format_name(result, "%s (cont)", a->name);
5938
6098
 
5939
6099
  result->op = GGML_OP_CONT;
5940
6100
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5978,6 +6138,7 @@ struct ggml_tensor * ggml_reshape(
5978
6138
  }
5979
6139
 
5980
6140
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
6141
+ ggml_format_name(result, "%s (reshaped)", a->name);
5981
6142
 
5982
6143
  result->op = GGML_OP_RESHAPE;
5983
6144
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6002,6 +6163,7 @@ struct ggml_tensor * ggml_reshape_1d(
6002
6163
 
6003
6164
  const int64_t ne[1] = { ne0 };
6004
6165
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6166
+ ggml_format_name(result, "%s (reshaped)", a->name);
6005
6167
 
6006
6168
  result->op = GGML_OP_RESHAPE;
6007
6169
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6027,6 +6189,7 @@ struct ggml_tensor * ggml_reshape_2d(
6027
6189
 
6028
6190
  const int64_t ne[2] = { ne0, ne1 };
6029
6191
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6192
+ ggml_format_name(result, "%s (reshaped)", a->name);
6030
6193
 
6031
6194
  result->op = GGML_OP_RESHAPE;
6032
6195
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6053,6 +6216,7 @@ struct ggml_tensor * ggml_reshape_3d(
6053
6216
 
6054
6217
  const int64_t ne[3] = { ne0, ne1, ne2 };
6055
6218
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6219
+ ggml_format_name(result, "%s (reshaped)", a->name);
6056
6220
 
6057
6221
  result->op = GGML_OP_RESHAPE;
6058
6222
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6081,6 +6245,7 @@ struct ggml_tensor * ggml_reshape_4d(
6081
6245
 
6082
6246
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6083
6247
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6248
+ ggml_format_name(result, "%s (reshaped)", a->name);
6084
6249
 
6085
6250
  result->op = GGML_OP_RESHAPE;
6086
6251
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6105,10 +6270,12 @@ struct ggml_tensor * ggml_view_1d(
6105
6270
  }
6106
6271
 
6107
6272
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6273
+ ggml_format_name(result, "%s (view)", a->name);
6108
6274
 
6109
6275
  ggml_scratch_save(ctx);
6110
6276
 
6111
6277
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6278
+ ggml_set_name(offs, "offset");
6112
6279
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6113
6280
 
6114
6281
  ggml_scratch_load(ctx);
@@ -6141,10 +6308,12 @@ struct ggml_tensor * ggml_view_2d(
6141
6308
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6142
6309
 
6143
6310
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6311
+ ggml_format_name(result, "%s (view)", a->name);
6144
6312
 
6145
6313
  ggml_scratch_save(ctx);
6146
6314
 
6147
6315
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6316
+ ggml_set_name(offs, "offset");
6148
6317
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6149
6318
 
6150
6319
  ggml_scratch_load(ctx);
@@ -6183,10 +6352,12 @@ struct ggml_tensor * ggml_view_3d(
6183
6352
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6184
6353
 
6185
6354
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6355
+ ggml_format_name(result, "%s (view)", a->name);
6186
6356
 
6187
6357
  ggml_scratch_save(ctx);
6188
6358
 
6189
6359
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6360
+ ggml_set_name(offs, "offset");
6190
6361
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6191
6362
 
6192
6363
  ggml_scratch_load(ctx);
@@ -6227,10 +6398,12 @@ struct ggml_tensor * ggml_view_4d(
6227
6398
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6228
6399
 
6229
6400
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6401
+ ggml_format_name(result, "%s (view)", a->name);
6230
6402
 
6231
6403
  ggml_scratch_save(ctx);
6232
6404
 
6233
6405
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6406
+ ggml_set_name(offs, "offset");
6234
6407
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6235
6408
 
6236
6409
  ggml_scratch_load(ctx);
@@ -6276,6 +6449,7 @@ struct ggml_tensor * ggml_permute(
6276
6449
  }
6277
6450
 
6278
6451
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6452
+ ggml_format_name(result, "%s (permuted)", a->name);
6279
6453
 
6280
6454
  int ne[GGML_MAX_DIMS];
6281
6455
  int nb[GGML_MAX_DIMS];
@@ -6335,6 +6509,7 @@ struct ggml_tensor * ggml_transpose(
6335
6509
  }
6336
6510
 
6337
6511
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6512
+ ggml_format_name(result, "%s (transposed)", a->name);
6338
6513
 
6339
6514
  result->ne[0] = a->ne[1];
6340
6515
  result->ne[1] = a->ne[0];
@@ -6603,6 +6778,7 @@ struct ggml_tensor * ggml_rope_impl(
6603
6778
  int n_past,
6604
6779
  int n_dims,
6605
6780
  int mode,
6781
+ int n_ctx,
6606
6782
  bool inplace) {
6607
6783
  GGML_ASSERT(n_past >= 0);
6608
6784
  bool is_node = false;
@@ -6615,11 +6791,12 @@ struct ggml_tensor * ggml_rope_impl(
6615
6791
 
6616
6792
  ggml_scratch_save(ctx);
6617
6793
 
6618
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6794
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
6619
6795
 
6620
6796
  ((int32_t *) b->data)[0] = n_past;
6621
6797
  ((int32_t *) b->data)[1] = n_dims;
6622
6798
  ((int32_t *) b->data)[2] = mode;
6799
+ ((int32_t *) b->data)[3] = n_ctx;
6623
6800
 
6624
6801
  ggml_scratch_load(ctx);
6625
6802
 
@@ -6636,8 +6813,9 @@ struct ggml_tensor * ggml_rope(
6636
6813
  struct ggml_tensor * a,
6637
6814
  int n_past,
6638
6815
  int n_dims,
6639
- int mode) {
6640
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
6816
+ int mode,
6817
+ int n_ctx) {
6818
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
6641
6819
  }
6642
6820
 
6643
6821
  struct ggml_tensor * ggml_rope_inplace(
@@ -6645,8 +6823,9 @@ struct ggml_tensor * ggml_rope_inplace(
6645
6823
  struct ggml_tensor * a,
6646
6824
  int n_past,
6647
6825
  int n_dims,
6648
- int mode) {
6649
- return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
6826
+ int mode,
6827
+ int n_ctx) {
6828
+ return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
6650
6829
  }
6651
6830
 
6652
6831
  // ggml_rope_back
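For reference, a minimal sketch of a caller updated for the new rope signature (illustrative only, not taken from the package sources; tensor and variable names are made up). The extra n_ctx value is stored in the parameter tensor and is only consulted by the 0x4 "GLM" mode bit handled further down in this diff:

    // before: ggml_rope_inplace(ctx, q, n_past, n_rot, 0);
    struct ggml_tensor * q_rot = ggml_rope_inplace(ctx, q, n_past, n_rot, /*mode =*/ 0, /*n_ctx =*/ 2048);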
@@ -7063,9 +7242,14 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
7063
7242
  is_node = true;
7064
7243
  }
7065
7244
 
7245
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7246
+
7247
+ ggml_scratch_save(ctx);
7248
+
7066
7249
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7067
7250
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7068
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7251
+
7252
+ ggml_scratch_load(ctx);
7069
7253
 
7070
7254
  result->op = GGML_OP_MAP_UNARY;
7071
7255
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7105,9 +7289,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
7105
7289
  is_node = true;
7106
7290
  }
7107
7291
 
7292
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7293
+
7294
+ ggml_scratch_save(ctx);
7295
+
7108
7296
  struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7109
7297
  *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7110
- struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7298
+
7299
+ ggml_scratch_load(ctx);
7111
7300
 
7112
7301
  result->op = GGML_OP_MAP_BINARY;
7113
7302
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7134,6 +7323,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
7134
7323
  return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
7135
7324
  }
7136
7325
 
7326
+ // ggml_map_custom1
7327
+
7328
+ struct ggml_tensor * ggml_map_custom1_impl_f32(
7329
+ struct ggml_context * ctx,
7330
+ struct ggml_tensor * a,
7331
+ const ggml_custom1_op_f32_t fun,
7332
+ bool inplace) {
7333
+ bool is_node = false;
7334
+
7335
+ if (!inplace && a->grad) {
7336
+ is_node = true;
7337
+ }
7338
+
7339
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7340
+
7341
+ ggml_scratch_save(ctx);
7342
+
7343
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7344
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7345
+
7346
+ ggml_scratch_load(ctx);
7347
+
7348
+ result->op = GGML_OP_MAP_CUSTOM1;
7349
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7350
+ result->src0 = a;
7351
+ result->opt[0] = addr_tensor;
7352
+
7353
+ return result;
7354
+ }
7355
+
7356
+ struct ggml_tensor * ggml_map_custom1_f32(
7357
+ struct ggml_context * ctx,
7358
+ struct ggml_tensor * a,
7359
+ const ggml_custom1_op_f32_t fun) {
7360
+ return ggml_map_custom1_impl_f32(ctx, a, fun, false);
7361
+ }
7362
+
7363
+ struct ggml_tensor * ggml_map_custom1_inplace_f32(
7364
+ struct ggml_context * ctx,
7365
+ struct ggml_tensor * a,
7366
+ const ggml_custom1_op_f32_t fun) {
7367
+ return ggml_map_custom1_impl_f32(ctx, a, fun, true);
7368
+ }
7369
+
7370
+ // ggml_map_custom2
7371
+
7372
+ struct ggml_tensor * ggml_map_custom2_impl_f32(
7373
+ struct ggml_context * ctx,
7374
+ struct ggml_tensor * a,
7375
+ struct ggml_tensor * b,
7376
+ const ggml_custom2_op_f32_t fun,
7377
+ bool inplace) {
7378
+ bool is_node = false;
7379
+
7380
+ if (!inplace && (a->grad || b->grad)) {
7381
+ is_node = true;
7382
+ }
7383
+
7384
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7385
+
7386
+ ggml_scratch_save(ctx);
7387
+
7388
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7389
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7390
+
7391
+ ggml_scratch_load(ctx);
7392
+
7393
+ result->op = GGML_OP_MAP_CUSTOM2;
7394
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7395
+ result->src0 = a;
7396
+ result->src1 = b;
7397
+ result->opt[0] = addr_tensor;
7398
+
7399
+ return result;
7400
+ }
7401
+
7402
+ struct ggml_tensor * ggml_map_custom2_f32(
7403
+ struct ggml_context * ctx,
7404
+ struct ggml_tensor * a,
7405
+ struct ggml_tensor * b,
7406
+ const ggml_custom2_op_f32_t fun) {
7407
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
7408
+ }
7409
+
7410
+ struct ggml_tensor * ggml_map_custom2_inplace_f32(
7411
+ struct ggml_context * ctx,
7412
+ struct ggml_tensor * a,
7413
+ struct ggml_tensor * b,
7414
+ const ggml_custom2_op_f32_t fun) {
7415
+ return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
7416
+ }
7417
+
7418
+ // ggml_map_custom3
7419
+
7420
+ struct ggml_tensor * ggml_map_custom3_impl_f32(
7421
+ struct ggml_context * ctx,
7422
+ struct ggml_tensor * a,
7423
+ struct ggml_tensor * b,
7424
+ struct ggml_tensor * c,
7425
+ const ggml_custom3_op_f32_t fun,
7426
+ bool inplace) {
7427
+ bool is_node = false;
7428
+
7429
+ if (!inplace && (a->grad || b->grad || c->grad)) {
7430
+ is_node = true;
7431
+ }
7432
+
7433
+ struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
7434
+
7435
+ ggml_scratch_save(ctx);
7436
+
7437
+ struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
7438
+ *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
7439
+
7440
+ ggml_scratch_load(ctx);
7441
+
7442
+ result->op = GGML_OP_MAP_CUSTOM3;
7443
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7444
+ result->src0 = a;
7445
+ result->src1 = b;
7446
+ result->opt[0] = addr_tensor;
7447
+ result->opt[1] = c;
7448
+
7449
+ return result;
7450
+ }
7451
+
7452
+ struct ggml_tensor * ggml_map_custom3_f32(
7453
+ struct ggml_context * ctx,
7454
+ struct ggml_tensor * a,
7455
+ struct ggml_tensor * b,
7456
+ struct ggml_tensor * c,
7457
+ const ggml_custom3_op_f32_t fun) {
7458
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
7459
+ }
7460
+
7461
+ struct ggml_tensor * ggml_map_custom3_inplace_f32(
7462
+ struct ggml_context * ctx,
7463
+ struct ggml_tensor * a,
7464
+ struct ggml_tensor * b,
7465
+ struct ggml_tensor * c,
7466
+ const ggml_custom3_op_f32_t fun) {
7467
+ return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
7468
+ }
7469
+
7137
7470
  // ggml_cross_entropy_loss
7138
7471
 
7139
7472
  struct ggml_tensor * ggml_cross_entropy_loss(
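For reference, a minimal sketch of plugging into the new custom-map operators (illustrative only, not taken from the package sources; the clamp kernel is made up, and the callback shape is assumed from the fun(dst, a) call in the compute functions later in this diff):

    #include "ggml.h"

    // runs single-threaded (n_tasks == 1); assumes contiguous F32 data
    static void clamp01_op(struct ggml_tensor * dst, const struct ggml_tensor * a) {
        const int64_t n = ggml_nelements(a);
        const float * x = (const float *) a->data;
        float       * y = (float *) dst->data;
        for (int64_t i = 0; i < n; ++i) {
            y[i] = x[i] < 0.0f ? 0.0f : (x[i] > 1.0f ? 1.0f : x[i]);
        }
    }

    // during graph construction:
    //   struct ggml_tensor * out = ggml_map_custom1_f32(ctx, inp, clamp01_op);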
@@ -12111,7 +12444,7 @@ static void ggml_compute_forward_rope_f32(
12111
12444
  const struct ggml_tensor * src1,
12112
12445
  struct ggml_tensor * dst) {
12113
12446
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
12114
- GGML_ASSERT(ggml_nelements(src1) == 3);
12447
+ GGML_ASSERT(ggml_nelements(src1) == 4);
12115
12448
 
12116
12449
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12117
12450
  return;
@@ -12120,6 +12453,7 @@ static void ggml_compute_forward_rope_f32(
12120
12453
  const int n_past = ((int32_t *) src1->data)[0];
12121
12454
  const int n_dims = ((int32_t *) src1->data)[1];
12122
12455
  const int mode = ((int32_t *) src1->data)[2];
12456
+ const int n_ctx = ((int32_t *) src1->data)[3];
12123
12457
 
12124
12458
  assert(n_past >= 0);
12125
12459
 
@@ -12164,6 +12498,7 @@ static void ggml_compute_forward_rope_f32(
12164
12498
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
12165
12499
 
12166
12500
  const bool is_neox = mode & 2;
12501
+ const bool is_glm = mode & 4;
12167
12502
 
12168
12503
  for (int64_t i3 = 0; i3 < ne3; i3++) {
12169
12504
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12174,7 +12509,32 @@ static void ggml_compute_forward_rope_f32(
12174
12509
 
12175
12510
  float theta = (float)p;
12176
12511
 
12177
- if (!is_neox) {
12512
+ if (is_glm) {
12513
+ theta = MIN(p, n_ctx - 2);
12514
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12515
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12516
+ const float cos_theta = cosf(theta);
12517
+ const float sin_theta = sinf(theta);
12518
+ const float cos_block_theta = cosf(block_theta);
12519
+ const float sin_block_theta = sinf(block_theta);
12520
+
12521
+ theta *= theta_scale;
12522
+ block_theta *= theta_scale;
12523
+
12524
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12525
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12526
+
12527
+ const float x0 = src[0];
12528
+ const float x1 = src[n_dims/2];
12529
+ const float x2 = src[n_dims];
12530
+ const float x3 = src[n_dims/2*3];
12531
+
12532
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
12533
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
12534
+ dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
12535
+ dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
12536
+ }
12537
+ } else if (!is_neox) {
12178
12538
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12179
12539
  const float cos_theta = cosf(theta);
12180
12540
  const float sin_theta = sinf(theta);
@@ -12224,7 +12584,7 @@ static void ggml_compute_forward_rope_f16(
12224
12584
  const struct ggml_tensor * src1,
12225
12585
  struct ggml_tensor * dst) {
12226
12586
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
12227
- GGML_ASSERT(ggml_nelements(src1) == 3);
12587
+ GGML_ASSERT(ggml_nelements(src1) == 4);
12228
12588
 
12229
12589
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12230
12590
  return;
@@ -12233,6 +12593,7 @@ static void ggml_compute_forward_rope_f16(
12233
12593
  const int n_past = ((int32_t *) src1->data)[0];
12234
12594
  const int n_dims = ((int32_t *) src1->data)[1];
12235
12595
  const int mode = ((int32_t *) src1->data)[2];
12596
+ const int n_ctx = ((int32_t *) src1->data)[3];
12236
12597
 
12237
12598
  assert(n_past >= 0);
12238
12599
 
@@ -12277,6 +12638,7 @@ static void ggml_compute_forward_rope_f16(
12277
12638
  const float theta_scale = powf(10000.0, -2.0f/n_dims);
12278
12639
 
12279
12640
  const bool is_neox = mode & 2;
12641
+ const bool is_glm = mode & 4;
12280
12642
 
12281
12643
  for (int64_t i3 = 0; i3 < ne3; i3++) {
12282
12644
  for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -12287,7 +12649,32 @@ static void ggml_compute_forward_rope_f16(
12287
12649
 
12288
12650
  float theta = (float)p;
12289
12651
 
12290
- if (!is_neox) {
12652
+ if (is_glm) {
12653
+ theta = MIN(p, n_ctx - 2);
12654
+ float block_theta = MAX(p - (n_ctx - 2), 0);
12655
+ for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
12656
+ const float cos_theta = cosf(theta);
12657
+ const float sin_theta = sinf(theta);
12658
+ const float cos_block_theta = cosf(block_theta);
12659
+ const float sin_block_theta = sinf(block_theta);
12660
+
12661
+ theta *= theta_scale;
12662
+ block_theta *= theta_scale;
12663
+
12664
+ const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
12665
+ ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
12666
+
12667
+ const float x0 = GGML_FP16_TO_FP32(src[0]);
12668
+ const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
12669
+ const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
12670
+ const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
12671
+
12672
+ dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
12673
+ dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
12674
+ dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
12675
+ dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
12676
+ }
12677
+ } if (!is_neox) {
12291
12678
  for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
12292
12679
  const float cos_theta = cosf(theta);
12293
12680
  const float sin_theta = sinf(theta);
@@ -13179,8 +13566,7 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13179
13566
  const int nk1 = ne01;
13180
13567
 
13181
13568
  // size of the convolution row - the kernel size unrolled across all channels
13182
- // round-up so it is more suitable for SIMD
13183
- const int ew0 = ggml_up32(nk0*nk1*ne02);
13569
+ const int ew0 = nk0*nk1*ne02;
13184
13570
 
13185
13571
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13186
13572
  GGML_ASSERT(nb10 == sizeof(float));
@@ -14590,6 +14976,114 @@ static void ggml_compute_forward_map_binary(
14590
14976
  }
14591
14977
  }
14592
14978
 
14979
+ // ggml_compute_forward_map_custom1
14980
+
14981
+ static void ggml_compute_forward_map_custom1_f32(
14982
+ const struct ggml_compute_params * params,
14983
+ const struct ggml_tensor * a,
14984
+ struct ggml_tensor * dst,
14985
+ const ggml_custom1_op_f32_t fun) {
14986
+ assert(params->ith == 0);
14987
+
14988
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14989
+ return;
14990
+ }
14991
+
14992
+ fun(dst, a);
14993
+ }
14994
+
14995
+
14996
+ static void ggml_compute_forward_map_custom1(
14997
+ const struct ggml_compute_params * params,
14998
+ const struct ggml_tensor * a,
14999
+ struct ggml_tensor * dst,
15000
+ const ggml_custom1_op_f32_t fun) {
15001
+ switch (a->type) {
15002
+ case GGML_TYPE_F32:
15003
+ {
15004
+ ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
15005
+ } break;
15006
+ default:
15007
+ {
15008
+ GGML_ASSERT(false);
15009
+ } break;
15010
+ }
15011
+ }
15012
+
15013
+ // ggml_compute_forward_map_custom2
15014
+
15015
+ static void ggml_compute_forward_map_custom2_f32(
15016
+ const struct ggml_compute_params * params,
15017
+ const struct ggml_tensor * a,
15018
+ const struct ggml_tensor * b,
15019
+ struct ggml_tensor * dst,
15020
+ const ggml_custom2_op_f32_t fun) {
15021
+ assert(params->ith == 0);
15022
+
15023
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15024
+ return;
15025
+ }
15026
+
15027
+ fun(dst, a, b);
15028
+ }
15029
+
15030
+
15031
+ static void ggml_compute_forward_map_custom2(
15032
+ const struct ggml_compute_params * params,
15033
+ const struct ggml_tensor * a,
15034
+ const struct ggml_tensor * b,
15035
+ struct ggml_tensor * dst,
15036
+ const ggml_custom2_op_f32_t fun) {
15037
+ switch (a->type) {
15038
+ case GGML_TYPE_F32:
15039
+ {
15040
+ ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
15041
+ } break;
15042
+ default:
15043
+ {
15044
+ GGML_ASSERT(false);
15045
+ } break;
15046
+ }
15047
+ }
15048
+
15049
+ // ggml_compute_forward_map_custom3
15050
+
15051
+ static void ggml_compute_forward_map_custom3_f32(
15052
+ const struct ggml_compute_params * params,
15053
+ const struct ggml_tensor * a,
15054
+ const struct ggml_tensor * b,
15055
+ const struct ggml_tensor * c,
15056
+ struct ggml_tensor * dst,
15057
+ const ggml_custom3_op_f32_t fun) {
15058
+ assert(params->ith == 0);
15059
+
15060
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
15061
+ return;
15062
+ }
15063
+
15064
+ fun(dst, a, b, c);
15065
+ }
15066
+
15067
+
15068
+ static void ggml_compute_forward_map_custom3(
15069
+ const struct ggml_compute_params * params,
15070
+ const struct ggml_tensor * a,
15071
+ const struct ggml_tensor * b,
15072
+ const struct ggml_tensor * c,
15073
+ struct ggml_tensor * dst,
15074
+ const ggml_custom3_op_f32_t fun) {
15075
+ switch (a->type) {
15076
+ case GGML_TYPE_F32:
15077
+ {
15078
+ ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
15079
+ } break;
15080
+ default:
15081
+ {
15082
+ GGML_ASSERT(false);
15083
+ } break;
15084
+ }
15085
+ }
15086
+
14593
15087
  // ggml_compute_forward_cross_entropy_loss
14594
15088
 
14595
15089
  static void ggml_compute_forward_cross_entropy_loss_f32(
@@ -14880,7 +15374,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14880
15374
  if (skip_cpu) {
14881
15375
  return;
14882
15376
  }
14883
- GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
15377
+ GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
14884
15378
  GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
14885
15379
  #endif // GGML_USE_CUBLAS
14886
15380
 
@@ -15127,6 +15621,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15127
15621
  ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
15128
15622
  }
15129
15623
  break;
15624
+ case GGML_OP_MAP_CUSTOM1:
15625
+ {
15626
+ const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
15627
+ ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
15628
+ }
15629
+ break;
15630
+ case GGML_OP_MAP_CUSTOM2:
15631
+ {
15632
+ const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
15633
+ ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
15634
+ }
15635
+ break;
15636
+ case GGML_OP_MAP_CUSTOM3:
15637
+ {
15638
+ const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
15639
+ ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
15640
+ }
15641
+ break;
15130
15642
  case GGML_OP_CROSS_ENTROPY_LOSS:
15131
15643
  {
15132
15644
  ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -15735,17 +16247,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15735
16247
  {
15736
16248
  if (src0->grad) {
15737
16249
  assert(src1->type == GGML_TYPE_I32);
15738
- assert(ggml_nelements(src1) == 3);
16250
+ assert(ggml_nelements(src1) == 4);
15739
16251
  const int n_past = ((int32_t *) src1->data)[0];
15740
16252
  const int n_dims = ((int32_t *) src1->data)[1];
15741
16253
  const int mode = ((int32_t *) src1->data)[2];
16254
+ const int n_ctx = ((int32_t *) src1->data)[3];
15742
16255
  src0->grad = ggml_add_impl(ctx,
15743
16256
  src0->grad,
15744
16257
  ggml_rope(ctx,
15745
16258
  tensor->grad,
15746
16259
  n_past,
15747
16260
  n_dims,
15748
- mode),
16261
+ mode,
16262
+ n_ctx),
15749
16263
  inplace);
15750
16264
  }
15751
16265
  if (src1->grad) {
@@ -15933,6 +16447,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15933
16447
  case GGML_OP_WIN_UNPART:
15934
16448
  case GGML_OP_MAP_UNARY:
15935
16449
  case GGML_OP_MAP_BINARY:
16450
+ case GGML_OP_MAP_CUSTOM1:
16451
+ case GGML_OP_MAP_CUSTOM2:
16452
+ case GGML_OP_MAP_CUSTOM3:
15936
16453
  {
15937
16454
  GGML_ASSERT(false); // not supported
15938
16455
  } break;
@@ -16004,7 +16521,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
16004
16521
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
16005
16522
 
16006
16523
  if (strlen(node->name) == 0) {
16007
- snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
16524
+ ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
16008
16525
  }
16009
16526
 
16010
16527
  cgraph->leafs[cgraph->n_leafs] = node;
@@ -16013,7 +16530,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
16013
16530
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
16014
16531
 
16015
16532
  if (strlen(node->name) == 0) {
16016
- snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
16533
+ ggml_format_name(node, "node_%d", cgraph->n_nodes);
16017
16534
  }
16018
16535
 
16019
16536
  cgraph->nodes[cgraph->n_nodes] = node;
@@ -16167,68 +16684,173 @@ typedef pthread_t ggml_thread_t;
16167
16684
 
16168
16685
  #endif
16169
16686
 
16687
+ // Android's libc implementation "bionic" does not support setting affinity
16688
+ #if defined(__linux__) && !defined(__BIONIC__)
16689
+ void set_numa_thread_affinity(int thread_n, int n_threads) {
16690
+ if (!ggml_is_numa()) {
16691
+ return;
16692
+ }
16693
+
16694
+ // run thread on node_num thread_n / (threads per node)
16695
+ const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
16696
+ struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
16697
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16698
+
16699
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16700
+ CPU_ZERO_S(setsize, cpus);
16701
+ for (size_t i = 0; i < node->n_cpus; ++i) {
16702
+ CPU_SET_S(node->cpus[i], setsize, cpus);
16703
+ }
16704
+
16705
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16706
+ if (rv) {
16707
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16708
+ strerror(rv));
16709
+ }
16710
+
16711
+ CPU_FREE(cpus);
16712
+ }
16713
+
16714
+ void clear_numa_thread_affinity(void) {
16715
+ if (!ggml_is_numa()) {
16716
+ return;
16717
+ }
16718
+
16719
+ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
16720
+
16721
+ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
16722
+ CPU_ZERO_S(setsize, cpus);
16723
+ for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
16724
+ CPU_SET_S(i, setsize, cpus);
16725
+ }
16726
+
16727
+ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
16728
+ if (rv) {
16729
+ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
16730
+ strerror(rv));
16731
+ }
16732
+
16733
+ CPU_FREE(cpus);
16734
+ }
16735
+ #else
16736
+ // TODO: Windows etc.
16737
+ // (the linux implementation may also work on BSD, someone should test)
16738
+ void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
16739
+ void clear_numa_thread_affinity(void) {}
16740
+ #endif
16741
+
16170
16742
  struct ggml_compute_state_shared {
16171
- ggml_lock_t spin;
16743
+ struct ggml_cgraph * cgraph;
16744
+
16745
+ int64_t perf_node_start_cycles;
16746
+ int64_t perf_node_start_time_us;
16172
16747
 
16173
16748
  int n_threads;
16174
16749
 
16175
16750
  // synchronization primitives
16176
- atomic_int n_ready;
16177
- atomic_bool has_work;
16178
- atomic_bool stop; // stop all threads
16751
+ atomic_int n_active; // num active threads
16752
+ atomic_int node_n; // active graph node
16179
16753
  };
16180
16754
 
16181
16755
  struct ggml_compute_state {
16182
16756
  ggml_thread_t thrd;
16183
-
16184
- struct ggml_compute_params params;
16185
- struct ggml_tensor * node;
16186
-
16757
+ int ith;
16187
16758
  struct ggml_compute_state_shared * shared;
16188
16759
  };
16189
16760
 
16761
+ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
16762
+ int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
16763
+ int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
16764
+
16765
+ node->perf_runs++;
16766
+ node->perf_cycles += cycles_cur;
16767
+ node->perf_time_us += time_us_cur;
16768
+ }
16769
+
16190
16770
  static thread_ret_t ggml_graph_compute_thread(void * data) {
16191
16771
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
16772
+ struct ggml_cgraph * cgraph = state->shared->cgraph;
16192
16773
 
16193
16774
  const int n_threads = state->shared->n_threads;
16775
+ set_numa_thread_affinity(state->ith, n_threads);
16776
+
16777
+ int node_n = -1;
16194
16778
 
16195
16779
  while (true) {
16196
- if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
16197
- atomic_store(&state->shared->has_work, false);
16198
- } else {
16199
- while (atomic_load(&state->shared->has_work)) {
16200
- if (atomic_load(&state->shared->stop)) {
16201
- return 0;
16202
- }
16203
- ggml_lock_lock (&state->shared->spin);
16204
- ggml_lock_unlock(&state->shared->spin);
16780
+ if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
16781
+ // all other threads are finished and spinning
16782
+ // do finalize and init here so we don't have synchronize again
16783
+ struct ggml_compute_params params = {
16784
+ /*.type =*/ GGML_TASK_FINALIZE,
16785
+ /*.ith =*/ 0,
16786
+ /*.nth =*/ 0,
16787
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16788
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16789
+ };
16790
+
16791
+ if (node_n != -1) {
16792
+ /* FINALIZE */
16793
+ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
16794
+ params.nth = node->n_tasks;
16795
+ ggml_compute_forward(&params, node);
16796
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16205
16797
  }
16206
- }
16207
16798
 
16208
- atomic_fetch_sub(&state->shared->n_ready, 1);
16799
+ // distribute new work or execute it direct if 1T
16800
+ while (++node_n < cgraph->n_nodes) {
16801
+ GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
16802
+
16803
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16804
+
16805
+ state->shared->perf_node_start_cycles = ggml_perf_cycles();
16806
+ state->shared->perf_node_start_time_us = ggml_perf_time_us();
16209
16807
 
16210
- // wait for work
16211
- while (!atomic_load(&state->shared->has_work)) {
16212
- if (atomic_load(&state->shared->stop)) {
16213
- return 0;
16808
+ /* INIT */
16809
+ params.type = GGML_TASK_INIT;
16810
+ params.nth = node->n_tasks;
16811
+ ggml_compute_forward(&params, node);
16812
+
16813
+ if (node->n_tasks == 1) {
16814
+ // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
16815
+ // they do something more efficient than spinning (?)
16816
+ params.type = GGML_TASK_COMPUTE;
16817
+ ggml_compute_forward(&params, node);
16818
+
16819
+ params.type = GGML_TASK_FINALIZE;
16820
+ ggml_compute_forward(&params, node);
16821
+ ggml_graph_compute_perf_stats_node(node, state->shared);
16822
+ } else {
16823
+ break;
16824
+ }
16214
16825
  }
16215
- ggml_lock_lock (&state->shared->spin);
16216
- ggml_lock_unlock(&state->shared->spin);
16826
+
16827
+ atomic_store(&state->shared->n_active, n_threads);
16828
+ atomic_store(&state->shared->node_n, node_n);
16829
+ } else {
16830
+ // wait for other threads to finish
16831
+ const int last = node_n;
16832
+ do {
16833
+ sched_yield();
16834
+ node_n = atomic_load(&state->shared->node_n);
16835
+ } while (node_n == last);
16217
16836
  }
16218
16837
 
16219
16838
  // check if we should stop
16220
- if (atomic_load(&state->shared->stop)) {
16221
- break;
16222
- }
16839
+ if (node_n >= cgraph->n_nodes) break;
16223
16840
 
16224
- if (state->node) {
16225
- if (state->params.ith < state->params.nth) {
16226
- ggml_compute_forward(&state->params, state->node);
16227
- }
16841
+ /* COMPUTE */
16842
+ struct ggml_tensor * node = cgraph->nodes[node_n];
16228
16843
 
16229
- state->node = NULL;
16230
- } else {
16231
- break;
16844
+ struct ggml_compute_params params = {
16845
+ /*.type =*/ GGML_TASK_COMPUTE,
16846
+ /*.ith =*/ state->ith,
16847
+ /*.nth =*/ node->n_tasks,
16848
+ /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16849
+ /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16850
+ };
16851
+
16852
+ if (state->ith < node->n_tasks) {
16853
+ ggml_compute_forward(&params, node);
16232
16854
  }
16233
16855
  }
16234
16856
 
@@ -16239,39 +16861,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16239
16861
  const int n_threads = cgraph->n_threads;
16240
16862
 
16241
16863
  struct ggml_compute_state_shared state_shared = {
16242
- /*.spin =*/ GGML_LOCK_INITIALIZER,
16243
- /*.n_threads =*/ n_threads,
16244
- /*.n_ready =*/ 0,
16245
- /*.has_work =*/ false,
16246
- /*.stop =*/ false,
16864
+ /*.cgraph =*/ cgraph,
16865
+ /*.perf_node_start_cycles =*/ 0,
16866
+ /*.perf_node_start_time_us =*/ 0,
16867
+ /*.n_threads =*/ n_threads,
16868
+ /*.n_active =*/ n_threads,
16869
+ /*.node_n =*/ -1,
16247
16870
  };
16248
- struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
16249
-
16250
- // create thread pool
16251
- if (n_threads > 1) {
16252
- ggml_lock_init(&state_shared.spin);
16253
-
16254
- atomic_store(&state_shared.has_work, true);
16255
-
16256
- for (int j = 0; j < n_threads - 1; j++) {
16257
- workers[j] = (struct ggml_compute_state) {
16258
- .thrd = 0,
16259
- .params = {
16260
- .type = GGML_TASK_COMPUTE,
16261
- .ith = j + 1,
16262
- .nth = n_threads,
16263
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16264
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16265
- },
16266
- .node = NULL,
16267
- .shared = &state_shared,
16268
- };
16269
-
16270
- int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
16271
- GGML_ASSERT(rc == 0);
16272
- UNUSED(rc);
16273
- }
16274
- }
16871
+ struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
16275
16872
 
16276
16873
  // initialize tasks + work buffer
16277
16874
  {
@@ -16415,7 +17012,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16415
17012
  } break;
16416
17013
  case GGML_OP_SCALE:
16417
17014
  {
16418
- node->n_tasks = n_threads;
17015
+ node->n_tasks = 1;
16419
17016
  } break;
16420
17017
  case GGML_OP_SET:
16421
17018
  case GGML_OP_CONT:
@@ -16574,6 +17171,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16574
17171
  case GGML_OP_WIN_UNPART:
16575
17172
  case GGML_OP_MAP_UNARY:
16576
17173
  case GGML_OP_MAP_BINARY:
17174
+ case GGML_OP_MAP_CUSTOM1:
17175
+ case GGML_OP_MAP_CUSTOM2:
17176
+ case GGML_OP_MAP_CUSTOM3:
16577
17177
  {
16578
17178
  node->n_tasks = 1;
16579
17179
  } break;
@@ -16616,166 +17216,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
16616
17216
  }
16617
17217
  }
16618
17218
 
16619
- const int64_t perf_start_cycles = ggml_perf_cycles();
16620
- const int64_t perf_start_time_us = ggml_perf_time_us();
16621
-
16622
- for (int i = 0; i < cgraph->n_nodes; i++) {
16623
- GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
16624
-
16625
- struct ggml_tensor * node = cgraph->nodes[i];
16626
-
16627
- // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
16628
- //if (node->grad == NULL && node->perf_runs > 0) {
16629
- // continue;
16630
- //}
16631
-
16632
- const int64_t perf_node_start_cycles = ggml_perf_cycles();
16633
- const int64_t perf_node_start_time_us = ggml_perf_time_us();
16634
-
16635
- // INIT
16636
- struct ggml_compute_params params = {
16637
- /*.type =*/ GGML_TASK_INIT,
16638
- /*.ith =*/ 0,
16639
- /*.nth =*/ node->n_tasks,
16640
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16641
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
16642
- };
16643
-
16644
- ggml_compute_forward(&params, node);
16645
-
16646
- // COMPUTE
16647
- if (node->n_tasks > 1) {
16648
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16649
- atomic_store(&state_shared.has_work, false);
16650
- }
16651
-
16652
- while (atomic_load(&state_shared.has_work)) {
16653
- ggml_lock_lock (&state_shared.spin);
16654
- ggml_lock_unlock(&state_shared.spin);
16655
- }
16656
-
16657
- // launch thread pool
16658
- for (int j = 0; j < n_threads - 1; j++) {
16659
- workers[j].params = (struct ggml_compute_params) {
16660
- .type = GGML_TASK_COMPUTE,
16661
- .ith = j + 1,
16662
- .nth = node->n_tasks,
16663
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16664
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16665
- };
16666
- workers[j].node = node;
16667
- }
16668
-
16669
- atomic_fetch_sub(&state_shared.n_ready, 1);
16670
-
16671
- while (atomic_load(&state_shared.n_ready) > 0) {
16672
- ggml_lock_lock (&state_shared.spin);
16673
- ggml_lock_unlock(&state_shared.spin);
16674
- }
16675
-
16676
- atomic_store(&state_shared.has_work, true);
16677
- }
16678
-
16679
- params.type = GGML_TASK_COMPUTE;
16680
- ggml_compute_forward(&params, node);
16681
-
16682
- // wait for thread pool
16683
- if (node->n_tasks > 1) {
16684
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16685
- atomic_store(&state_shared.has_work, false);
16686
- }
16687
-
16688
- while (atomic_load(&state_shared.has_work)) {
16689
- ggml_lock_lock (&state_shared.spin);
16690
- ggml_lock_unlock(&state_shared.spin);
16691
- }
16692
-
16693
- atomic_fetch_sub(&state_shared.n_ready, 1);
16694
-
16695
- while (atomic_load(&state_shared.n_ready) != 0) {
16696
- ggml_lock_lock (&state_shared.spin);
16697
- ggml_lock_unlock(&state_shared.spin);
16698
- }
16699
- }
16700
-
16701
- // FINALIZE
16702
- if (node->n_tasks > 1) {
16703
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16704
- atomic_store(&state_shared.has_work, false);
16705
- }
16706
-
16707
- while (atomic_load(&state_shared.has_work)) {
16708
- ggml_lock_lock (&state_shared.spin);
16709
- ggml_lock_unlock(&state_shared.spin);
16710
- }
16711
-
16712
- // launch thread pool
16713
- for (int j = 0; j < n_threads - 1; j++) {
16714
- workers[j].params = (struct ggml_compute_params) {
16715
- .type = GGML_TASK_FINALIZE,
16716
- .ith = j + 1,
16717
- .nth = node->n_tasks,
16718
- .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
16719
- .wdata = cgraph->work ? cgraph->work->data : NULL,
16720
- };
16721
- workers[j].node = node;
16722
- }
16723
-
16724
- atomic_fetch_sub(&state_shared.n_ready, 1);
16725
-
16726
- while (atomic_load(&state_shared.n_ready) > 0) {
16727
- ggml_lock_lock (&state_shared.spin);
16728
- ggml_lock_unlock(&state_shared.spin);
16729
- }
17219
+ // create thread pool
17220
+ if (n_threads > 1) {
17221
+ for (int j = 1; j < n_threads; ++j) {
17222
+ workers[j] = (struct ggml_compute_state) {
17223
+ .thrd = 0,
17224
+ .ith = j,
17225
+ .shared = &state_shared,
17226
+ };
16730
17227
 
16731
- atomic_store(&state_shared.has_work, true);
17228
+ const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
17229
+ GGML_ASSERT(rc == 0);
16732
17230
  }
17231
+ }
17232
+ workers[0].ith = 0;
17233
+ workers[0].shared = &state_shared;
16733
17234
 
16734
- params.type = GGML_TASK_FINALIZE;
16735
- ggml_compute_forward(&params, node);
16736
-
16737
- // wait for thread pool
16738
- if (node->n_tasks > 1) {
16739
- if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
16740
- atomic_store(&state_shared.has_work, false);
16741
- }
16742
-
16743
- while (atomic_load(&state_shared.has_work)) {
16744
- ggml_lock_lock (&state_shared.spin);
16745
- ggml_lock_unlock(&state_shared.spin);
16746
- }
16747
-
16748
- atomic_fetch_sub(&state_shared.n_ready, 1);
16749
-
16750
- while (atomic_load(&state_shared.n_ready) != 0) {
16751
- ggml_lock_lock (&state_shared.spin);
16752
- ggml_lock_unlock(&state_shared.spin);
16753
- }
16754
- }
17235
+ const int64_t perf_start_cycles = ggml_perf_cycles();
17236
+ const int64_t perf_start_time_us = ggml_perf_time_us();
16755
17237
 
16756
- // performance stats (node)
16757
- {
16758
- int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles;
16759
- int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
17238
+ // this is a work thread too
17239
+ ggml_graph_compute_thread(&workers[0]);
16760
17240
 
16761
- node->perf_runs++;
16762
- node->perf_cycles += perf_cycles_cur;
16763
- node->perf_time_us += perf_time_us_cur;
16764
- }
16765
- }
17241
+ // don't leave affinity set on the main thread
17242
+ clear_numa_thread_affinity();
16766
17243
 
16767
17244
  // join thread pool
16768
17245
  if (n_threads > 1) {
16769
- atomic_store(&state_shared.stop, true);
16770
- atomic_store(&state_shared.has_work, true);
16771
-
16772
- for (int j = 0; j < n_threads - 1; j++) {
16773
- int rc = ggml_thread_join(workers[j].thrd, NULL);
17246
+ for (int j = 1; j < n_threads; j++) {
17247
+ const int rc = ggml_thread_join(workers[j].thrd, NULL);
16774
17248
  GGML_ASSERT(rc == 0);
16775
- UNUSED(rc);
16776
17249
  }
16777
-
16778
- ggml_lock_destroy(&state_shared.spin);
16779
17250
  }
16780
17251
 
16781
17252
  // performance stats (graph)
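For reference, a minimal sketch of a driver for the reworked scheduler (illustrative only, not taken from the package sources; out is a hypothetical graph output tensor). The calling thread now participates as worker 0 instead of only coordinating the pool:

    struct ggml_cgraph gf = ggml_build_forward(out);
    gf.n_threads = 4;              // workers 1..3 are spawned; the caller runs as ith == 0
    ggml_graph_compute(ctx, &gf);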
@@ -17397,6 +17868,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
17397
17868
  return NULL;
17398
17869
  }
17399
17870
 
17871
+ static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17872
+ struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
17873
+ struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
17874
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
17875
+ gparent0 ? (void *) gparent0 : (void *) parent,
17876
+ gparent0 ? "g" : "x",
17877
+ gparent ? (void *) gparent : (void *) node,
17878
+ gparent ? "g" : "x",
17879
+ gparent ? "empty" : "vee",
17880
+ gparent ? "dashed" : "solid",
17881
+ label);
17882
+ }
17883
+
17884
+ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17885
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
17886
+ (void *) parent, "x",
17887
+ (void *) node, "x",
17888
+ label);
17889
+ }
17890
+
17400
17891
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
17401
17892
  char color[16];
17402
17893
 
@@ -17432,7 +17923,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17432
17923
  (void *) node, color);
17433
17924
 
17434
17925
  if (strlen(node->name) > 0) {
17435
- fprintf(fp, "%s |", node->name);
17926
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17927
+ } else {
17928
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
17436
17929
  }
17437
17930
 
17438
17931
  if (node->n_dims == 2) {
@@ -17441,7 +17934,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17441
17934
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
17442
17935
  }
17443
17936
 
17444
-
17445
17937
  if (node->grad) {
17446
17938
  fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
17447
17939
  } else {
@@ -17460,18 +17952,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17460
17952
  (void *) node, color);
17461
17953
 
17462
17954
  if (strlen(node->name) > 0) {
17463
- fprintf(fp, "%s | ", node->name);
17955
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17956
+ } else {
17957
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
17464
17958
  }
17465
- if (ggml_nelements(node) == 1) {
17466
- if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
17467
- fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
17468
- }
17469
- else {
17470
- fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
17959
+
17960
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17961
+ if (ggml_nelements(node) < 5) {
17962
+ fprintf(fp, " | (");
17963
+ for (int j = 0; j < ggml_nelements(node); j++) {
17964
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
17965
+ fprintf(fp, "%d", ggml_get_i32_1d(node, j));
17966
+ }
17967
+ else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
17968
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
17969
+ }
17970
+ else {
17971
+ fprintf(fp, "#");
17972
+ }
17973
+ if (j < ggml_nelements(node) - 1) {
17974
+ fprintf(fp, ", ");
17975
+ }
17471
17976
  }
17472
- }
17473
- else {
17474
- fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17977
+ fprintf(fp, ")");
17475
17978
  }
17476
17979
  fprintf(fp, "\"; ]\n");
17477
17980
  }
@@ -17479,30 +17982,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17479
17982
  for (int i = 0; i < gb->n_nodes; i++) {
17480
17983
  struct ggml_tensor * node = gb->nodes[i];
17481
17984
 
17482
- struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
17483
-
17484
17985
  if (node->src0) {
17485
- struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
17486
-
17487
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
17488
- parent0 ? (void *) parent0 : (void *) node->src0,
17489
- parent0 ? "g" : "x",
17490
- parent ? (void *) parent : (void *) node,
17491
- parent ? "g" : "x",
17492
- parent ? "empty" : "vee",
17493
- parent ? "dashed" : "solid");
17986
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
17494
17987
  }
17495
17988
 
17496
17989
  if (node->src1) {
17497
- struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
17498
-
17499
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
17500
- parent1 ? (void *) parent1 : (void *) node->src1,
17501
- parent1 ? "g" : "x",
17502
- parent ? (void *) parent : (void *) node,
17503
- parent ? "g" : "x",
17504
- parent ? "empty" : "vee",
17505
- parent ? "dashed" : "solid");
17990
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
17991
+ }
17992
+
17993
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
17994
+ if (node->opt[j]) {
17995
+ char label[16];
17996
+ snprintf(label, sizeof(label), "opt %d", j);
17997
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
17998
+ }
17506
17999
  }
17507
18000
  }
17508
18001
 
@@ -17510,15 +18003,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
17510
18003
  struct ggml_tensor * node = gb->leafs[i];
17511
18004
 
17512
18005
  if (node->src0) {
17513
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
17514
- (void *) node->src0, "x",
17515
- (void *) node, "x");
18006
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
17516
18007
  }
17517
18008
 
17518
18009
  if (node->src1) {
17519
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
17520
- (void *) node->src1, "x",
17521
- (void *) node, "x");
18010
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
18011
+ }
18012
+
18013
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
18014
+ if (node->opt[j]) {
18015
+ char label[16];
18016
+ snprintf(label, sizeof(label), "opt %d", j);
18017
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
18018
+ }
17522
18019
  }
17523
18020
  }
17524
18021