llama_cpp 0.1.2 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -186,10 +186,12 @@ typedef double ggml_float;
186
186
  #if defined(_MSC_VER) || defined(__MINGW32__)
187
187
  #include <intrin.h>
188
188
  #else
189
+ #if !defined(__riscv)
189
190
  #include <immintrin.h>
190
191
  #endif
191
192
  #endif
192
193
  #endif
194
+ #endif
193
195
 
194
196
  #ifdef __F16C__
195
197
 
@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3494
3496
  };
3495
3497
  static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
3496
3498
 
3497
- static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
3499
+ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3498
3500
  "NONE",
3499
3501
 
3500
3502
  "DUP",
@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
3749
3751
  return GGML_TYPE_NAME[type];
3750
3752
  }
3751
3753
 
3754
+ const char * ggml_op_name(enum ggml_op op) {
3755
+ return GGML_OP_NAME[op];
3756
+ }
3752
3757
 
3753
3758
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
3754
3759
  return GGML_TYPE_SIZE[tensor->type];
@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
3805
3810
  return wtype;
3806
3811
  }
3807
3812
 
3813
+ size_t ggml_tensor_overhead(void) {
3814
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
3815
+ }
3816
+
3808
3817
  static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
3809
3818
  return tensor->nb[0] > tensor->nb[1];
3810
3819
  }
@@ -4017,6 +4026,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
4017
4026
  return result;
4018
4027
  }
4019
4028
 
4029
+ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4030
+ ctx->no_alloc = no_alloc;
4031
+ }
4032
+
4033
+ void * ggml_get_mem_buffer(struct ggml_context * ctx) {
4034
+ return ctx->mem_buffer;
4035
+ }
4036
+
4037
+ size_t ggml_get_mem_size(struct ggml_context * ctx) {
4038
+ return ctx->mem_size;
4039
+ }
4040
+
4020
4041
  // IMPORTANT:
4021
4042
  // when creating "opt" tensors, always save and load the scratch buffer
4022
4043
  // this is an error prone process, but it is necessary to support inplace
@@ -4061,7 +4082,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4061
4082
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
4062
4083
 
4063
4084
  if (ctx->scratch.data == NULL || data != NULL) {
4064
- size_needed += sizeof(struct ggml_tensor);
4085
+ size_needed += GGML_TENSOR_SIZE;
4065
4086
 
4066
4087
  if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
4067
4088
  GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4098,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
4077
4098
  };
4078
4099
  } else {
4079
4100
  if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
4080
- GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
4101
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
4102
+ __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
4081
4103
  assert(false);
4082
4104
  return NULL;
4083
4105
  }
4084
4106
 
4085
- if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
4107
+ if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
4086
4108
  GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
4087
- __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
4109
+ __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
4088
4110
  assert(false);
4089
4111
  return NULL;
4090
4112
  }
@@ -4093,7 +4115,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4093
4115
 
4094
4116
  *obj_new = (struct ggml_object) {
4095
4117
  .offs = cur_end + GGML_OBJECT_SIZE,
4096
- .size = sizeof(struct ggml_tensor),
4118
+ .size = GGML_TENSOR_SIZE,
4097
4119
  .next = NULL,
4098
4120
  };
4099
4121
 
@@ -4509,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor(
4509
4531
  return result;
4510
4532
  }
4511
4533
 
4534
+ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
4535
+ struct ggml_object * obj = ctx->objects_begin;
4536
+
4537
+ char * const mem_buffer = ctx->mem_buffer;
4538
+
4539
+ while (obj != NULL) {
4540
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
4541
+ if (strcmp(cur->name, name) == 0) {
4542
+ return cur;
4543
+ }
4544
+
4545
+ obj = obj->next;
4546
+ }
4547
+
4548
+ return NULL;
4549
+ }
4550
+
4512
4551
  ////////////////////////////////////////////////////////////////////////////////
4513
4552
 
4514
4553
  // ggml_dup
@@ -6303,7 +6342,7 @@ struct ggml_tensor * ggml_alibi(
6303
6342
 
6304
6343
  ggml_scratch_save(ctx);
6305
6344
 
6306
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6345
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6307
6346
 
6308
6347
  ((int32_t *) b->data)[0] = n_past;
6309
6348
  ((int32_t *) b->data)[1] = n_head;
@@ -9431,7 +9470,7 @@ static void ggml_compute_forward_rms_norm_back(
9431
9470
 
9432
9471
  // ggml_compute_forward_mul_mat
9433
9472
 
9434
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
9473
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9435
9474
  // helper function to determine if it is better to use BLAS or not
9436
9475
  // for large matrices, BLAS is faster
9437
9476
  static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9472,7 +9511,7 @@ static void ggml_compute_forward_mul_mat_f32(
9472
9511
  const int64_t ne02 = src0->ne[2];
9473
9512
  const int64_t ne03 = src0->ne[3];
9474
9513
 
9475
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
9514
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9476
9515
  const int64_t ne10 = src1->ne[0];
9477
9516
  #endif
9478
9517
  const int64_t ne11 = src1->ne[1];
@@ -9536,9 +9575,16 @@ static void ggml_compute_forward_mul_mat_f32(
9536
9575
  }
9537
9576
  return;
9538
9577
  }
9578
+ #elif defined(GGML_USE_CLBLAST)
9579
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9580
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9581
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9582
+ }
9583
+ return;
9584
+ }
9539
9585
  #endif
9540
9586
 
9541
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
9587
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9542
9588
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
9543
9589
  if (params->ith != 0) {
9544
9590
  return;
@@ -9558,21 +9604,11 @@ static void ggml_compute_forward_mul_mat_f32(
9558
9604
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
9559
9605
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
9560
9606
 
9561
- #if defined(GGML_USE_CLBLAST)
9562
- // zT = y * xT
9563
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
9564
- ne11, ne01, ne10,
9565
- 1.0f, y, ne10,
9566
- x, ne10,
9567
- 0.0f, d, ne01,
9568
- GGML_TYPE_F32);
9569
- #else
9570
9607
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
9571
9608
  ne11, ne01, ne10,
9572
9609
  1.0f, y, ne10,
9573
9610
  x, ne00,
9574
9611
  0.0f, d, ne01);
9575
- #endif
9576
9612
  }
9577
9613
  }
9578
9614
  //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9711,9 +9747,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
9711
9747
  }
9712
9748
  return;
9713
9749
  }
9750
+ #elif defined(GGML_USE_CLBLAST)
9751
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9752
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9753
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9754
+ }
9755
+ return;
9756
+ }
9714
9757
  #endif
9715
9758
 
9716
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
9759
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9717
9760
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
9718
9761
  GGML_ASSERT(nb10 == sizeof(float));
9719
9762
 
@@ -9743,20 +9786,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
9743
9786
  assert(id*sizeof(float) <= params->wsize);
9744
9787
  }
9745
9788
 
9746
- #if defined(GGML_USE_CLBLAST)
9747
- const float * x = wdata;
9748
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
9749
-
9750
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
9751
-
9752
- // zT = y * xT
9753
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
9754
- ne11, ne01, ne10,
9755
- 1.0f, y, ne10,
9756
- x, ne10,
9757
- 0.0f, d, ne01,
9758
- GGML_TYPE_F32);
9759
- #else
9760
9789
  const float * x = wdata;
9761
9790
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
9762
9791
 
@@ -9768,7 +9797,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
9768
9797
  1.0f, y, ne10,
9769
9798
  x, ne00,
9770
9799
  0.0f, d, ne01);
9771
- #endif
9772
9800
  }
9773
9801
  }
9774
9802
 
@@ -9931,9 +9959,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
9931
9959
  }
9932
9960
  return;
9933
9961
  }
9962
+ #elif defined(GGML_USE_CLBLAST)
9963
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
9964
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
9965
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
9966
+ }
9967
+ return;
9968
+ }
9934
9969
  #endif
9935
9970
 
9936
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
9971
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9937
9972
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
9938
9973
  if (params->ith != 0) {
9939
9974
  return;
@@ -9956,9 +9991,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
9956
9991
 
9957
9992
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
9958
9993
 
9959
- #if defined(GGML_USE_CLBLAST)
9960
- const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
9961
- #else
9962
9994
  {
9963
9995
  size_t id = 0;
9964
9996
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9970,23 +10002,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
9970
10002
  }
9971
10003
 
9972
10004
  const float * x = wdata;
9973
- #endif
9974
10005
 
9975
- #if defined(GGML_USE_CLBLAST)
9976
- // zT = y * xT
9977
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
9978
- ne11, ne01, ne10,
9979
- 1.0f, y, ne10,
9980
- x, ne10,
9981
- 0.0f, d, ne01,
9982
- type);
9983
- #else
9984
10006
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
9985
10007
  ne11, ne01, ne10,
9986
10008
  1.0f, y, ne10,
9987
10009
  x, ne00,
9988
10010
  0.0f, d, ne01);
9989
- #endif
9990
10011
  }
9991
10012
  }
9992
10013
 
@@ -13810,11 +13831,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
13810
13831
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
13811
13832
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
13812
13833
 
13834
+ if (strlen(node->name) == 0) {
13835
+ snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
13836
+ }
13837
+
13813
13838
  cgraph->leafs[cgraph->n_leafs] = node;
13814
13839
  cgraph->n_leafs++;
13815
13840
  } else {
13816
13841
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
13817
13842
 
13843
+ if (strlen(node->name) == 0) {
13844
+ snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
13845
+ }
13846
+
13818
13847
  cgraph->nodes[cgraph->n_nodes] = node;
13819
13848
  cgraph->grads[cgraph->n_nodes] = node->grad;
13820
13849
  cgraph->n_nodes++;
@@ -14165,9 +14194,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
14165
14194
  cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
14166
14195
  }
14167
14196
  else
14197
+ #elif defined(GGML_USE_CLBLAST)
14198
+ if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
14199
+ node->n_tasks = 1; // TODO: this actually is doing nothing
14200
+ // the threads are still spinning
14201
+ cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
14202
+ }
14203
+ else
14168
14204
  #endif
14169
14205
  if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
14170
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
14206
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
14171
14207
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
14172
14208
  node->n_tasks = 1; // TODO: this actually is doing nothing
14173
14209
  // the threads are still spinning
@@ -14181,13 +14217,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
14181
14217
  #endif
14182
14218
  } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
14183
14219
  cur = 0;
14184
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
14220
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
14185
14221
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
14186
14222
  node->n_tasks = 1;
14187
14223
  }
14188
14224
  #endif
14189
14225
  } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
14190
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
14226
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
14191
14227
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
14192
14228
  node->n_tasks = 1;
14193
14229
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -14521,6 +14557,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
14521
14557
  }
14522
14558
  }
14523
14559
 
14560
+ struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
14561
+ for (int i = 0; i < cgraph->n_leafs; i++) {
14562
+ struct ggml_tensor * leaf = cgraph->leafs[i];
14563
+
14564
+ if (strcmp(leaf->name, name) == 0) {
14565
+ return leaf;
14566
+ }
14567
+ }
14568
+
14569
+ for (int i = 0; i < cgraph->n_nodes; i++) {
14570
+ struct ggml_tensor * node = cgraph->nodes[i];
14571
+
14572
+ if (strcmp(node->name, name) == 0) {
14573
+ return node;
14574
+ }
14575
+ }
14576
+
14577
+ return NULL;
14578
+ }
14579
+
14580
+ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
14581
+ const int64_t * ne = tensor->ne;
14582
+ const size_t * nb = tensor->nb;
14583
+
14584
+ fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
14585
+ ggml_type_name(tensor->type),
14586
+ ggml_op_name (tensor->op),
14587
+ tensor->n_dims,
14588
+ ne[0], ne[1], ne[2], ne[3],
14589
+ nb[0], nb[1], nb[2], nb[3],
14590
+ tensor->data,
14591
+ tensor->name);
14592
+ }
14593
+
14594
+ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
14595
+ const int64_t * ne = tensor->ne;
14596
+ const size_t * nb = tensor->nb;
14597
+
14598
+ fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
14599
+ arg,
14600
+ ggml_type_name(tensor->type),
14601
+ ggml_op_name (tensor->op),
14602
+ tensor->n_dims,
14603
+ ne[0], ne[1], ne[2], ne[3],
14604
+ nb[0], nb[1], nb[2], nb[3],
14605
+ tensor->n_tasks,
14606
+ tensor->data,
14607
+ tensor->name);
14608
+ }
14609
+
14610
+ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
14611
+ assert(cgraph->work == NULL);
14612
+ assert(cgraph->work_size == 0);
14613
+
14614
+ uint64_t size_eval = 0;
14615
+
14616
+ // compute size of intermediate results
14617
+ // TODO: does not take into account scratch buffers !!!!
14618
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14619
+ size_eval += ggml_nbytes(cgraph->nodes[i]);
14620
+ }
14621
+
14622
+ // print
14623
+ {
14624
+ FILE * fout = stdout;
14625
+
14626
+ fprintf(fout, "\n");
14627
+ fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
14628
+ fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
14629
+ fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
14630
+ fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
14631
+ fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
14632
+
14633
+ // header
14634
+ fprintf(fout, "\n");
14635
+ fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
14636
+ "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
14637
+
14638
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
14639
+ ggml_graph_export_leaf(cgraph->leafs[i], fout);
14640
+
14641
+ GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
14642
+ GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
14643
+ GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
14644
+ }
14645
+
14646
+ // header
14647
+ fprintf(fout, "\n");
14648
+ fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
14649
+ "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
14650
+
14651
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14652
+ ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
14653
+
14654
+ if (cgraph->nodes[i]->src0) {
14655
+ ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
14656
+ }
14657
+
14658
+ if (cgraph->nodes[i]->src1) {
14659
+ ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
14660
+ }
14661
+
14662
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
14663
+ if (cgraph->nodes[i]->opt[j]) {
14664
+ ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
14665
+ }
14666
+ }
14667
+
14668
+ fprintf(fout, "\n");
14669
+ }
14670
+
14671
+ fprintf(fout, "\n");
14672
+ }
14673
+
14674
+ // write binary data
14675
+ {
14676
+ FILE * fout = fopen(fname, "wb");
14677
+
14678
+ if (!fout) {
14679
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
14680
+ return;
14681
+ }
14682
+
14683
+ // header
14684
+ {
14685
+ const uint32_t magic = GGML_FILE_MAGIC;
14686
+ const uint32_t version = GGML_FILE_VERSION;
14687
+ const uint32_t n_leafs = cgraph->n_leafs;
14688
+ const uint32_t nodes = cgraph->n_nodes;
14689
+
14690
+ fwrite(&magic, sizeof(uint32_t), 1, fout);
14691
+ fwrite(&version, sizeof(uint32_t), 1, fout);
14692
+ fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
14693
+ fwrite(&nodes, sizeof(uint32_t), 1, fout);
14694
+ fwrite(&size_eval, sizeof(uint64_t), 1, fout);
14695
+ }
14696
+
14697
+ // leafs
14698
+ {
14699
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
14700
+ const struct ggml_tensor * tensor = cgraph->leafs[i];
14701
+
14702
+ const uint32_t type = tensor->type;
14703
+ const uint32_t op = tensor->op;
14704
+ const uint32_t n_dims = tensor->n_dims;
14705
+
14706
+ fwrite(&type, sizeof(uint32_t), 1, fout);
14707
+ fwrite(&op, sizeof(uint32_t), 1, fout);
14708
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
14709
+
14710
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14711
+ const uint64_t ne = tensor->ne[j];
14712
+ const uint64_t nb = tensor->nb[j];
14713
+
14714
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
14715
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
14716
+ }
14717
+
14718
+ // store the pointer address
14719
+ {
14720
+ const uint64_t ptr = (uint64_t) tensor->data;
14721
+
14722
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
14723
+ }
14724
+
14725
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
14726
+
14727
+ // dump the data
14728
+ // TODO: pad this to 32 byte boundary
14729
+ {
14730
+ const size_t size = ggml_nbytes(tensor);
14731
+
14732
+ fwrite(tensor->data, sizeof(char), size, fout);
14733
+ }
14734
+ }
14735
+ }
14736
+
14737
+ // nodes
14738
+ {
14739
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
14740
+ const struct ggml_tensor * tensor = cgraph->nodes[i];
14741
+
14742
+ const uint32_t type = tensor->type;
14743
+ const uint32_t op = tensor->op;
14744
+ const uint32_t n_dims = tensor->n_dims;
14745
+
14746
+ fwrite(&type, sizeof(uint32_t), 1, fout);
14747
+ fwrite(&op, sizeof(uint32_t), 1, fout);
14748
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
14749
+
14750
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14751
+ const uint64_t ne = tensor->ne[j];
14752
+ const uint64_t nb = tensor->nb[j];
14753
+
14754
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
14755
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
14756
+ }
14757
+
14758
+ // store the pointer address
14759
+ {
14760
+ const uint64_t ptr = (uint64_t) tensor->data;
14761
+
14762
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
14763
+ }
14764
+
14765
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
14766
+
14767
+ // output the op arguments
14768
+ {
14769
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
14770
+
14771
+ args[0] = tensor->src0;
14772
+ args[1] = tensor->src1;
14773
+
14774
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
14775
+ args[2 + j] = tensor->opt[j];
14776
+ }
14777
+
14778
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
14779
+ if (args[j]) {
14780
+ int32_t idx = -1;
14781
+
14782
+ // check if leaf
14783
+ {
14784
+ for (int k = 0; k < cgraph->n_leafs; ++k) {
14785
+ if (args[j] == cgraph->leafs[k]) {
14786
+ idx = k;
14787
+ break;
14788
+ }
14789
+ }
14790
+ }
14791
+
14792
+ // check if node
14793
+ if (idx == -1) {
14794
+ for (int k = 0; k < cgraph->n_nodes; ++k) {
14795
+ if (args[j] == cgraph->nodes[k]) {
14796
+ idx = GGML_MAX_NODES + k;
14797
+ break;
14798
+ }
14799
+ }
14800
+ }
14801
+
14802
+ if (idx == -1) {
14803
+ fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
14804
+ return;
14805
+ }
14806
+
14807
+ fwrite(&idx, sizeof(int32_t), 1, fout);
14808
+ } else {
14809
+ const int32_t nul = -1;
14810
+
14811
+ fwrite(&nul, sizeof(int32_t), 1, fout);
14812
+ }
14813
+ }
14814
+ }
14815
+ }
14816
+ }
14817
+
14818
+ fclose(fout);
14819
+ }
14820
+ }
14821
+
14822
+ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
14823
+ assert(*ctx_data == NULL);
14824
+ assert(*ctx_eval == NULL);
14825
+
14826
+ struct ggml_cgraph result = { 0 };
14827
+
14828
+ struct ggml_tensor * data = NULL;
14829
+
14830
+ // read file into data
14831
+ {
14832
+ FILE * fin = fopen(fname, "rb");
14833
+
14834
+ if (!fin) {
14835
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
14836
+ return result;
14837
+ }
14838
+
14839
+ size_t fsize = 0;
14840
+
14841
+ fseek(fin, 0, SEEK_END);
14842
+ fsize = ftell(fin);
14843
+ fseek(fin, 0, SEEK_SET);
14844
+
14845
+ // create the data context
14846
+ {
14847
+ const size_t overhead = 1*ggml_tensor_overhead();
14848
+
14849
+ struct ggml_init_params params = {
14850
+ .mem_size = fsize + overhead,
14851
+ .mem_buffer = NULL,
14852
+ .no_alloc = false,
14853
+ };
14854
+
14855
+ *ctx_data = ggml_init(params);
14856
+
14857
+ if (!*ctx_data) {
14858
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
14859
+ return result;
14860
+ }
14861
+ }
14862
+
14863
+ data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
14864
+
14865
+ fread(data->data, sizeof(char), fsize, fin);
14866
+
14867
+ fclose(fin);
14868
+ }
14869
+
14870
+ // populate result
14871
+ {
14872
+ char * ptr = (char *) data->data;
14873
+
14874
+ const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
14875
+
14876
+ if (magic != GGML_FILE_MAGIC) {
14877
+ fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
14878
+ return result;
14879
+ }
14880
+
14881
+ const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
14882
+
14883
+ if (version != GGML_FILE_VERSION) {
14884
+ fprintf(stderr, "%s: invalid version number\n", __func__);
14885
+ return result;
14886
+ }
14887
+
14888
+ const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
14889
+ const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
14890
+ const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
14891
+
14892
+ result.n_leafs = n_leafs;
14893
+ result.n_nodes = n_nodes;
14894
+
14895
+ // create the data context
14896
+ {
14897
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
14898
+
14899
+ struct ggml_init_params params = {
14900
+ .mem_size = size_eval + overhead,
14901
+ .mem_buffer = NULL,
14902
+ .no_alloc = true,
14903
+ };
14904
+
14905
+ *ctx_eval = ggml_init(params);
14906
+
14907
+ if (!*ctx_eval) {
14908
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
14909
+ return result;
14910
+ }
14911
+ }
14912
+
14913
+ // leafs
14914
+ {
14915
+ uint32_t type;
14916
+ uint32_t op;
14917
+ uint32_t n_dims;
14918
+
14919
+ for (uint32_t i = 0; i < n_leafs; ++i) {
14920
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
14921
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
14922
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
14923
+
14924
+ int64_t ne[GGML_MAX_DIMS];
14925
+ size_t nb[GGML_MAX_DIMS];
14926
+
14927
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14928
+ uint64_t ne_cur;
14929
+ uint64_t nb_cur;
14930
+
14931
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
14932
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
14933
+
14934
+ ne[j] = ne_cur;
14935
+ nb[j] = nb_cur;
14936
+ }
14937
+
14938
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
14939
+
14940
+ tensor->op = (enum ggml_op) op;
14941
+
14942
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
14943
+
14944
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
14945
+
14946
+ tensor->data = (void *) ptr;
14947
+
14948
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14949
+ tensor->nb[j] = nb[j];
14950
+ }
14951
+
14952
+ result.leafs[i] = tensor;
14953
+
14954
+ ptr += ggml_nbytes(tensor);
14955
+
14956
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
14957
+ }
14958
+ }
14959
+
14960
+ ggml_set_no_alloc(*ctx_eval, false);
14961
+
14962
+ // nodes
14963
+ {
14964
+ uint32_t type;
14965
+ uint32_t op;
14966
+ uint32_t n_dims;
14967
+
14968
+ for (uint32_t i = 0; i < n_nodes; ++i) {
14969
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
14970
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
14971
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
14972
+
14973
+ int64_t ne[GGML_MAX_DIMS];
14974
+ size_t nb[GGML_MAX_DIMS];
14975
+
14976
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14977
+ uint64_t ne_cur;
14978
+ uint64_t nb_cur;
14979
+
14980
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
14981
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
14982
+
14983
+ ne[j] = ne_cur;
14984
+ nb[j] = nb_cur;
14985
+ }
14986
+
14987
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
14988
+
14989
+ tensor->op = (enum ggml_op) op;
14990
+
14991
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
14992
+
14993
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
14994
+
14995
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
14996
+ tensor->nb[j] = nb[j];
14997
+ }
14998
+
14999
+ // parse args
15000
+ {
15001
+ struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
15002
+ &tensor->src0,
15003
+ &tensor->src1,
15004
+ };
15005
+
15006
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
15007
+ args[2 + j] = &tensor->opt[j];
15008
+ }
15009
+
15010
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
15011
+ const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
15012
+
15013
+ if (arg_idx == -1) {
15014
+ continue;
15015
+ }
15016
+
15017
+ if (arg_idx < GGML_MAX_NODES) {
15018
+ *args[j] = result.leafs[arg_idx];
15019
+ } else {
15020
+ *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
15021
+ }
15022
+ }
15023
+ }
15024
+
15025
+ result.nodes[i] = tensor;
15026
+
15027
+ fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
15028
+ }
15029
+ }
15030
+ }
15031
+
15032
+ return result;
15033
+ }
15034
+
14524
15035
  void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14525
15036
  int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
14526
15037
 
@@ -14538,7 +15049,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14538
15049
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
14539
15050
  i,
14540
15051
  node->ne[0], node->ne[1], node->ne[2],
14541
- GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
15052
+ GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
14542
15053
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
14543
15054
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
14544
15055
  (double) node->perf_time_us / 1000.0,
@@ -14552,7 +15063,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14552
15063
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
14553
15064
  i,
14554
15065
  node->ne[0], node->ne[1],
14555
- GGML_OP_LABEL[node->op]);
15066
+ GGML_OP_NAME[node->op]);
14556
15067
  }
14557
15068
 
14558
15069
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14560,7 +15071,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
14560
15071
  continue;
14561
15072
  }
14562
15073
 
14563
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
15074
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
14564
15075
  }
14565
15076
 
14566
15077
  GGML_PRINT("========================================\n");