llama_cpp 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ extern "C" {
10
10
  #define GGML_VK_NAME "Vulkan"
11
11
  #define GGML_VK_MAX_DEVICES 16
12
12
 
13
+ GGML_API void ggml_vk_instance_init(void);
13
14
  GGML_API void ggml_vk_init_cpu_assist(void);
14
15
 
15
16
  GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
@@ -320,6 +320,17 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
320
320
  // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
321
321
  float ggml_table_f32_f16[1 << 16];
322
322
 
323
+ const char * ggml_status_to_string(enum ggml_status status) {
324
+ switch (status) {
325
+ case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
326
+ case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
327
+ case GGML_STATUS_SUCCESS: return "GGML status: success";
328
+ case GGML_STATUS_ABORTED: return "GGML status: warning (operation aborted)";
329
+ }
330
+
331
+ return "GGML status: unknown";
332
+ }
333
+
323
334
  // note: do not use these inside ggml.c
324
335
  // these are meant to be used via the ggml.h API
325
336
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
@@ -1822,6 +1833,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1822
1833
  "POOL_2D",
1823
1834
  "UPSCALE",
1824
1835
  "PAD",
1836
+ "ARANGE",
1837
+ "TIMESTEP_EMBEDDING",
1825
1838
  "ARGSORT",
1826
1839
  "LEAKY_RELU",
1827
1840
 
@@ -1850,7 +1863,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1850
1863
  "CROSS_ENTROPY_LOSS_BACK",
1851
1864
  };
1852
1865
 
1853
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1866
+ static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
1854
1867
 
1855
1868
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1856
1869
  "none",
@@ -1908,6 +1921,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1908
1921
  "pool_2d(x)",
1909
1922
  "upscale(x)",
1910
1923
  "pad(x)",
1924
+ "arange(start, stop, step)",
1925
+ "timestep_embedding(timesteps, dim, max_period)",
1911
1926
  "argsort(x)",
1912
1927
  "leaky_relu(x)",
1913
1928
 
@@ -1936,7 +1951,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1936
1951
  "cross_entropy_loss_back(x,y)",
1937
1952
  };
1938
1953
 
1939
- static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
1954
+ static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
1940
1955
 
1941
1956
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1942
1957
 
@@ -2139,7 +2154,10 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
2139
2154
  getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
2140
2155
  #else
2141
2156
  // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
2142
- getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
2157
+ # if !defined(SYS_getcpu) && defined(SYS_get_cpu)
2158
+ # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name
2159
+ # endif
2160
+ getcpu_ret = syscall(SYS_getcpu, &current_cpu, &g_state.numa.current_node);
2143
2161
  #endif
2144
2162
 
2145
2163
  if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
@@ -2895,11 +2913,21 @@ static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_
2895
2913
  return ((const int32_t *)(tensor->op_params))[i];
2896
2914
  }
2897
2915
 
2916
+ static float ggml_get_op_params_f32(const struct ggml_tensor * tensor, uint32_t i) {
2917
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2918
+ return ((const float *)(tensor->op_params))[i];
2919
+ }
2920
+
2898
2921
  static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
2899
2922
  assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
2900
2923
  ((int32_t *)(tensor->op_params))[i] = value;
2901
2924
  }
2902
2925
 
2926
+ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, float value) {
2927
+ assert(i < GGML_MAX_OP_PARAMS / sizeof(float));
2928
+ ((float *)(tensor->op_params))[i] = value;
2929
+ }
2930
+
2903
2931
  struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
2904
2932
  memset(tensor->data, 0, ggml_nbytes(tensor));
2905
2933
  return tensor;
@@ -5898,6 +5926,55 @@ struct ggml_tensor * ggml_upscale(
5898
5926
  return ggml_upscale_impl(ctx, a, scale_factor);
5899
5927
  }
5900
5928
 
5929
+ struct ggml_tensor * ggml_arange(
5930
+ struct ggml_context * ctx,
5931
+ float start,
5932
+ float stop,
5933
+ float step) {
5934
+
5935
+ GGML_ASSERT(stop > start);
5936
+
5937
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
5938
+
5939
+ struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
5940
+
5941
+ result->op = GGML_OP_ARANGE;
5942
+ ggml_set_op_params_f32(result, 0, start);
5943
+ ggml_set_op_params_f32(result, 1, stop);
5944
+ ggml_set_op_params_f32(result, 2, step);
5945
+
5946
+ return result;
5947
+ }
5948
+
5949
+ struct ggml_tensor * ggml_timestep_embedding(
5950
+ struct ggml_context * ctx,
5951
+ struct ggml_tensor * timesteps,
5952
+ int dim,
5953
+ int max_period) {
5954
+ bool is_node = false;
5955
+
5956
+ if (timesteps->grad) {
5957
+ GGML_ASSERT(false); // TODO: implement backward
5958
+ is_node = true;
5959
+ }
5960
+
5961
+ int actual_dim = dim;
5962
+ if (dim % 2 != 0) {
5963
+ actual_dim = dim + 1;
5964
+ }
5965
+
5966
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
5967
+
5968
+ result->op = GGML_OP_TIMESTEP_EMBEDDING;
5969
+ ggml_set_op_params_i32(result, 0, dim);
5970
+ ggml_set_op_params_i32(result, 1, max_period);
5971
+
5972
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5973
+ result->src[0] = timesteps;
5974
+
5975
+ return result;
5976
+ }
5977
+
5901
5978
  // ggml_argsort
5902
5979
 
5903
5980
  struct ggml_tensor * ggml_argsort(
@@ -10231,7 +10308,7 @@ static void ggml_compute_forward_group_norm_f32(
10231
10308
  int n_channels = src0->ne[2];
10232
10309
  int n_groups = dst->op_params[0];
10233
10310
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
10234
- for (int i = ith; i < n_groups; i+=nth) {
10311
+ for (int i = ith; i < n_groups; i += nth) {
10235
10312
  int start = i * n_channels_per_group;
10236
10313
  int end = start + n_channels_per_group;
10237
10314
  if (end > n_channels) {
@@ -10245,28 +10322,32 @@ static void ggml_compute_forward_group_norm_f32(
10245
10322
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10246
10323
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10247
10324
 
10325
+ ggml_float sumr = 0.0;
10248
10326
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10249
- sum += (ggml_float)x[i00];
10327
+ sumr += (ggml_float)x[i00];
10250
10328
  }
10329
+ sum += sumr;
10251
10330
  }
10252
10331
  }
10253
- float mean = sum / (ne00 * ne01 * step);
10254
- ggml_float sum2 = 0.0;
10332
+ const float mean = sum / (ne00 * ne01 * step);
10255
10333
 
10334
+ ggml_float sum2 = 0.0;
10256
10335
  for (int64_t i02 = start; i02 < end; i02++) {
10257
10336
  for (int64_t i01 = 0; i01 < ne01; i01++) {
10258
10337
  const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
10259
10338
 
10260
10339
  float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
10261
10340
 
10341
+ ggml_float sumr = 0.0;
10262
10342
  for (int64_t i00 = 0; i00 < ne00; i00++) {
10263
10343
  float v = x[i00] - mean;
10264
10344
  y[i00] = v;
10265
- sum2 += (ggml_float)(v * v);
10345
+ sumr += (ggml_float)(v * v);
10266
10346
  }
10347
+ sum2 += sumr;
10267
10348
  }
10268
10349
  }
10269
- float variance = sum2 / (ne00 * ne01 * step);
10350
+ const float variance = sum2 / (ne00 * ne01 * step);
10270
10351
  const float scale = 1.0f / sqrtf(variance + eps);
10271
10352
 
10272
10353
  for (int64_t i02 = start; i02 < end; i02++) {
@@ -13547,6 +13628,106 @@ static void ggml_compute_forward_pad(
13547
13628
  }
13548
13629
  }
13549
13630
 
13631
+
13632
+ // ggml_compute_forward_arange
13633
+
13634
+ static void ggml_compute_forward_arange_f32(
13635
+ const struct ggml_compute_params * params,
13636
+ struct ggml_tensor * dst) {
13637
+
13638
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13639
+ return;
13640
+ }
13641
+
13642
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
13643
+
13644
+ const int ith = params->ith;
13645
+ const int nth = params->nth;
13646
+
13647
+ const float start = ggml_get_op_params_f32(dst, 0);
13648
+ const float stop = ggml_get_op_params_f32(dst, 1);
13649
+ const float step = ggml_get_op_params_f32(dst, 2);
13650
+
13651
+ const int64_t steps = (int64_t) ceilf((stop - start) / step);
13652
+
13653
+ GGML_ASSERT(ggml_nelements(dst) == steps);
13654
+
13655
+ for (int64_t i = ith; i < steps; i+= nth) {
13656
+ float value = start + step * i;
13657
+ ((float *)dst->data)[i] = value;
13658
+ }
13659
+ }
13660
+
13661
+ static void ggml_compute_forward_arange(
13662
+ const struct ggml_compute_params * params,
13663
+ struct ggml_tensor * dst) {
13664
+ switch (dst->type) {
13665
+ case GGML_TYPE_F32:
13666
+ {
13667
+ ggml_compute_forward_arange_f32(params, dst);
13668
+ } break;
13669
+ default:
13670
+ {
13671
+ GGML_ASSERT(false);
13672
+ } break;
13673
+ }
13674
+ }
13675
+
13676
+ static void ggml_compute_forward_timestep_embedding_f32(
13677
+ const struct ggml_compute_params * params,
13678
+ struct ggml_tensor * dst) {
13679
+
13680
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
13681
+ return;
13682
+ }
13683
+
13684
+ const struct ggml_tensor * src0 = dst->src[0];
13685
+
13686
+ GGML_ASSERT(src0->nb[0] == sizeof(float));
13687
+
13688
+ const int ith = params->ith;
13689
+ const int nth = params->nth;
13690
+
13691
+ GGML_TENSOR_UNARY_OP_LOCALS
13692
+
13693
+ const int dim = ggml_get_op_params_i32(dst, 0);
13694
+ const int max_period = ggml_get_op_params_i32(dst, 1);
13695
+
13696
+ int half = dim / 2;
13697
+
13698
+ for (int64_t i = 0; i < ne00; i++) {
13699
+ float * embed_data = (float *)((char *) dst->data + i*nb1);
13700
+ for (int64_t j = ith; j < half; j += nth) {
13701
+ float timestep = ((float *)src0->data)[i];
13702
+ float freq = (float)expf(-logf(max_period) * j / half);
13703
+ float arg = timestep * freq;
13704
+ embed_data[j] = cosf(arg);
13705
+ embed_data[j + half] = sinf(arg);
13706
+ }
13707
+ if (dim % 2 != 0 && ith == 0) {
13708
+ embed_data[dim] = 0.f;
13709
+ }
13710
+ }
13711
+ }
13712
+
13713
+ static void ggml_compute_forward_timestep_embedding(
13714
+ const struct ggml_compute_params * params,
13715
+ struct ggml_tensor * dst) {
13716
+
13717
+ const struct ggml_tensor * src0 = dst->src[0];
13718
+
13719
+ switch (src0->type) {
13720
+ case GGML_TYPE_F32:
13721
+ {
13722
+ ggml_compute_forward_timestep_embedding_f32(params, dst);
13723
+ } break;
13724
+ default:
13725
+ {
13726
+ GGML_ASSERT(false);
13727
+ } break;
13728
+ }
13729
+ }
13730
+
13550
13731
  // ggml_compute_forward_argsort
13551
13732
 
13552
13733
  static void ggml_compute_forward_argsort_f32(
@@ -15615,6 +15796,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
15615
15796
  {
15616
15797
  ggml_compute_forward_pad(params, tensor);
15617
15798
  } break;
15799
+ case GGML_OP_ARANGE:
15800
+ {
15801
+ ggml_compute_forward_arange(params, tensor);
15802
+ } break;
15803
+ case GGML_OP_TIMESTEP_EMBEDDING:
15804
+ {
15805
+ ggml_compute_forward_timestep_embedding(params, tensor);
15806
+ } break;
15618
15807
  case GGML_OP_ARGSORT:
15619
15808
  {
15620
15809
  ggml_compute_forward_argsort(params, tensor);
@@ -16617,6 +16806,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
16617
16806
  {
16618
16807
  GGML_ASSERT(false); // TODO: not implemented
16619
16808
  } break;
16809
+ case GGML_OP_ARANGE:
16810
+ {
16811
+ GGML_ASSERT(false); // TODO: not implemented
16812
+ } break;
16813
+ case GGML_OP_TIMESTEP_EMBEDDING:
16814
+ {
16815
+ GGML_ASSERT(false); // TODO: not implemented
16816
+ } break;
16620
16817
  case GGML_OP_ARGSORT:
16621
16818
  {
16622
16819
  GGML_ASSERT(false); // TODO: not implemented
@@ -17217,6 +17414,7 @@ struct ggml_compute_state {
17217
17414
  ggml_thread_t thrd;
17218
17415
  int ith;
17219
17416
  struct ggml_compute_state_shared * shared;
17417
+ enum ggml_status ec;
17220
17418
  };
17221
17419
 
17222
17420
  static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -17368,6 +17566,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
17368
17566
  {
17369
17567
  n_tasks = n_threads;
17370
17568
  } break;
17569
+ case GGML_OP_ARANGE:
17570
+ {
17571
+ n_tasks = n_threads;
17572
+ } break;
17573
+ case GGML_OP_TIMESTEP_EMBEDDING:
17574
+ {
17575
+ n_tasks = n_threads;
17576
+ } break;
17371
17577
  case GGML_OP_ARGSORT:
17372
17578
  {
17373
17579
  n_tasks = n_threads;
@@ -17502,7 +17708,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17502
17708
  while (true) {
17503
17709
  if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
17504
17710
  state->shared->node_n += 1;
17505
- return (thread_ret_t) GGML_EXIT_ABORTED;
17711
+ state->ec = GGML_STATUS_ABORTED;
17712
+ return 0;
17506
17713
  }
17507
17714
 
17508
17715
  if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -17624,7 +17831,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
17624
17831
  }
17625
17832
  }
17626
17833
 
17627
- return GGML_EXIT_SUCCESS;
17834
+ return 0;
17628
17835
  }
17629
17836
 
17630
17837
  struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
@@ -17820,7 +18027,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
17820
18027
  return cplan;
17821
18028
  }
17822
18029
 
17823
- int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
18030
+ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17824
18031
  {
17825
18032
  GGML_ASSERT(cplan);
17826
18033
  GGML_ASSERT(cplan->n_threads > 0);
@@ -17864,6 +18071,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17864
18071
  .thrd = 0,
17865
18072
  .ith = j,
17866
18073
  .shared = &state_shared,
18074
+ .ec = GGML_STATUS_SUCCESS,
17867
18075
  };
17868
18076
 
17869
18077
  const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
@@ -17874,12 +18082,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17874
18082
 
17875
18083
  workers[0].ith = 0;
17876
18084
  workers[0].shared = &state_shared;
18085
+ workers[0].ec = GGML_STATUS_SUCCESS;
17877
18086
 
17878
18087
  const int64_t perf_start_cycles = ggml_perf_cycles();
17879
18088
  const int64_t perf_start_time_us = ggml_perf_time_us();
17880
18089
 
17881
18090
  // this is a work thread too
17882
- int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
18091
+ ggml_graph_compute_thread(&workers[0]);
18092
+ enum ggml_status compute_status = workers[0].ec;
17883
18093
 
17884
18094
  // don't leave affinity set on the main thread
17885
18095
  clear_numa_thread_affinity();
@@ -17889,6 +18099,8 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17889
18099
  for (int j = 1; j < n_threads; j++) {
17890
18100
  const int rc = ggml_thread_join(workers[j].thrd, NULL);
17891
18101
  GGML_ASSERT(rc == 0);
18102
+ if (workers[j].ec != GGML_STATUS_SUCCESS)
18103
+ compute_status = workers[j].ec;
17892
18104
  }
17893
18105
  }
17894
18106
 
@@ -17916,14 +18128,14 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
17916
18128
  return compute_status;
17917
18129
  }
17918
18130
 
17919
- void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
18131
+ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
17920
18132
  struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
17921
18133
 
17922
18134
  struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
17923
18135
 
17924
18136
  cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
17925
18137
 
17926
- ggml_graph_compute(cgraph, &cplan);
18138
+ return ggml_graph_compute(cgraph, &cplan);
17927
18139
  }
17928
18140
 
17929
18141
  struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
@@ -315,6 +315,16 @@
315
315
  extern "C" {
316
316
  #endif
317
317
 
318
+ enum ggml_status { // status code returned by ggml_graph_compute(); ggml_status_to_string() gives the text form
319
+ GGML_STATUS_ALLOC_FAILED = -2, // error: failed to allocate memory
320
+ GGML_STATUS_FAILED = -1, // error: operation failed
321
+ GGML_STATUS_SUCCESS = 0, // success
322
+ GGML_STATUS_ABORTED = 1, // warning: operation aborted
323
+ };
324
+
325
+ // get ggml_status name string
326
+ GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
327
+
318
328
  typedef uint16_t ggml_fp16_t;
319
329
 
320
330
  // convert FP16 <-> FP32
@@ -454,6 +464,8 @@ extern "C" {
454
464
  GGML_OP_POOL_2D,
455
465
  GGML_OP_UPSCALE, // nearest interpolate
456
466
  GGML_OP_PAD,
467
+ GGML_OP_ARANGE,
468
+ GGML_OP_TIMESTEP_EMBEDDING,
457
469
  GGML_OP_ARGSORT,
458
470
  GGML_OP_LEAKY_RELU,
459
471
 
@@ -1661,6 +1673,15 @@ extern "C" {
1661
1673
  int p2,
1662
1674
  int p3);
1663
1675
 
1676
+ // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1677
+ // timesteps: [N,]
1678
+ // return: [N, dim] (an odd dim is rounded up to dim + 1)
1679
+ GGML_API struct ggml_tensor * ggml_timestep_embedding(
1680
+ struct ggml_context * ctx,
1681
+ struct ggml_tensor * timesteps,
1682
+ int dim,
1683
+ int max_period);
1684
+
1664
1685
  // sort rows
1665
1686
  enum ggml_sort_order {
1666
1687
  GGML_SORT_ORDER_ASC,
@@ -1672,6 +1693,12 @@ extern "C" {
1672
1693
  struct ggml_tensor * a,
1673
1694
  enum ggml_sort_order order);
1674
1695
 
1696
+ GGML_API struct ggml_tensor * ggml_arange( // 1-D F32 tensor with ceil((stop - start) / step) elements, element i = start + step * i
1697
+ struct ggml_context * ctx,
1698
+ float start, // value of element 0
1699
+ float stop, // exclusive upper bound; must be greater than start (asserted)
1700
+ float step); // increment between consecutive elements
1701
+
1675
1702
  // top k elements per row
1676
1703
  GGML_API struct ggml_tensor * ggml_top_k(
1677
1704
  struct ggml_context * ctx,
@@ -1923,12 +1950,11 @@ extern "C" {
1923
1950
 
1924
1951
  // ggml_graph_plan() has to be called before ggml_graph_compute()
1925
1952
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1926
- GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1927
- GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1928
-
1953
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1954
+ GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1929
1955
  // same as ggml_graph_compute() but the work data is allocated as a part of the context
1930
1956
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
1931
- GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
1957
+ GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
1932
1958
 
1933
1959
  GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1934
1960