llama_cpp 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -186,10 +186,12 @@ typedef double ggml_float;
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
+ #if !defined(__riscv)
  #include <immintrin.h>
  #endif
  #endif
  #endif
+ #endif
 
  #ifdef __F16C__
 
@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
  };
  static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
 
- static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "NONE",
 
  "DUP",
@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
  return GGML_TYPE_NAME[type];
  }
 
+ const char * ggml_op_name(enum ggml_op op) {
+ return GGML_OP_NAME[op];
+ }
 
  size_t ggml_element_size(const struct ggml_tensor * tensor) {
  return GGML_TYPE_SIZE[tensor->type];
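
These hunks are from the vendored ggml.c. The rename of GGML_OP_LABEL to GGML_OP_NAME comes with a public accessor, ggml_op_name(), the op-side counterpart of the existing ggml_type_name(). A minimal usage sketch (the helper function below is hypothetical, not part of the diff):

    #include <stdio.h>
    #include "ggml.h"

    /* print a tensor's op and type via the new accessor */
    static void print_tensor_info(const struct ggml_tensor * t) {
        printf("%s: op=%s, type=%s\n", t->name, ggml_op_name(t->op), ggml_type_name(t->type));
    }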
@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  return wtype;
  }
 
+ size_t ggml_tensor_overhead(void) {
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+ }
+
  static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
  return tensor->nb[0] > tensor->nb[1];
  }
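
ggml_tensor_overhead() exposes the fixed per-tensor bookkeeping cost: the object header, the tensor struct, and 16 extra bytes (presumably alignment padding). Callers can use it to size a context up front, as ggml_graph_import() further down does. A sketch, assuming equally sized tensors:

    #include "ggml.h"

    /* size a context pool for n equally sized tensors (sketch, not from the diff) */
    static struct ggml_context * make_ctx(int n_tensors, size_t bytes_per_tensor) {
        struct ggml_init_params params = {
            .mem_size   = (size_t) n_tensors * (bytes_per_tensor + ggml_tensor_overhead()),
            .mem_buffer = NULL,   /* let ggml allocate the pool */
            .no_alloc   = false,
        };
        return ggml_init(params);
    }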
@@ -4017,6 +4026,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
  return result;
  }
 
+ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+ ctx->no_alloc = no_alloc;
+ }
+
+ void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+ return ctx->mem_buffer;
+ }
+
+ size_t ggml_get_mem_size(struct ggml_context * ctx) {
+ return ctx->mem_size;
+ }
+
  // IMPORTANT:
  // when creating "opt" tensors, always save and load the scratch buffer
  // this is an error prone process, but it is necessary to support inplace
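
ggml_set_no_alloc() lets a context create tensor metadata without reserving data storage, and the two getters expose the underlying pool; ggml_graph_import() below relies on exactly this to point imported tensors at the file image. A sketch under that assumption (buffer and size are hypothetical):

    #include <stdio.h>
    #include "ggml.h"

    /* metadata-only tensor backed by caller-owned storage (sketch) */
    static struct ggml_tensor * wrap_buffer(struct ggml_context * ctx, float * buf) {
        ggml_set_no_alloc(ctx, true);
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        t->data = buf;    /* external storage, not the context pool */
        ggml_set_no_alloc(ctx, false);

        printf("pool at %p, %zu bytes\n", ggml_get_mem_buffer(ctx), ggml_get_mem_size(ctx));
        return t;
    }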
@@ -4061,7 +4082,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
  if (ctx->scratch.data == NULL || data != NULL) {
- size_needed += sizeof(struct ggml_tensor);
+ size_needed += GGML_TENSOR_SIZE;
 
  if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
  GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4098,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
  };
  } else {
  if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
  assert(false);
  return NULL;
  }
 
- if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+ if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
  GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+ __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
  assert(false);
  return NULL;
  }
@@ -4093,7 +4115,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
  *obj_new = (struct ggml_object) {
  .offs = cur_end + GGML_OBJECT_SIZE,
- .size = sizeof(struct ggml_tensor),
+ .size = GGML_TENSOR_SIZE,
  .next = NULL,
  };
 
@@ -4509,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor(
  return result;
  }
 
+ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+ struct ggml_object * obj = ctx->objects_begin;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+ if (strcmp(cur->name, name) == 0) {
+ return cur;
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+ }
+
  ////////////////////////////////////////////////////////////////////////////////
 
  // ggml_dup
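
ggml_get_tensor() walks the context's object list and returns the first tensor whose name matches, so a named tensor can be recovered without holding on to its pointer. A sketch (the tensor name is hypothetical; naming via ggml_set_name is assumed from the existing API):

    #include <stdio.h>
    #include "ggml.h"

    /* look up a previously named tensor in a context (sketch) */
    static struct ggml_tensor * find_weights(struct ggml_context * ctx) {
        struct ggml_tensor * w = ggml_get_tensor(ctx, "model/wte");
        if (w == NULL) {
            fprintf(stderr, "tensor 'model/wte' not found\n");
        }
        return w;
    }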
@@ -6303,7 +6342,7 @@ struct ggml_tensor * ggml_alibi(
 
  ggml_scratch_save(ctx);
 
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
 
  ((int32_t *) b->data)[0] = n_past;
  ((int32_t *) b->data)[1] = n_head;
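
The I32 parameter tensor for ggml_alibi grows from two to three elements, presumably so the op can carry one more argument; the store into the new slot falls outside the displayed context:

    /* hedged sketch -- the actual assignment is not shown in this hunk:
       ((int32_t *) b->data)[2] = <third ggml_alibi parameter>; */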
@@ -9431,7 +9470,7 @@ static void ggml_compute_forward_rms_norm_back(
 
  // ggml_compute_forward_mul_mat
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  // helper function to determine if it is better to use BLAS or not
  // for large matrices, BLAS is faster
  static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9472,7 +9511,7 @@ static void ggml_compute_forward_mul_mat_f32(
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  const int64_t ne10 = src1->ne[0];
  #endif
  const int64_t ne11 = src1->ne[1];
@@ -9536,9 +9575,16 @@ static void ggml_compute_forward_mul_mat_f32(
  }
  return;
  }
+ #elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
  #endif
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
  if (params->ith != 0) {
  return;
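
Across the three mul_mat kernels (f32, f16_f32, q_f32), CLBlast moves out of the shared BLAS path into its own branch: when ggml_cl_can_mul_mat() accepts the operands, thread 0 runs the whole GEMM through the new unified ggml_cl_mul_mat() entry point during the COMPUTE pass, and every other thread returns early, replacing the per-kernel ggml_cl_sgemm_wrapper() calls removed below. The branch above the #elif (mostly outside these hunks) is the cuBLAS path, visible in the graph-compute hunk further down. The dispatch shape, condensed from the diff:

    /* compile-time backend selection inside each mul_mat kernel (condensed) */
    #if defined(GGML_USE_CUBLAS)
        /* ... CUDA path ... */
    #elif defined(GGML_USE_CLBLAST)
        if (ggml_cl_can_mul_mat(src0, src1, dst)) {
            if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
                ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
            }
            return;    /* all threads skip the CPU path */
        }
    #endif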
@@ -9558,21 +9604,11 @@ static void ggml_compute_forward_mul_mat_f32(
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
- #if defined(GGML_USE_CLBLAST)
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- GGML_TYPE_F32);
- #else
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
  ne11, ne01, ne10,
  1.0f, y, ne10,
  x, ne00,
  0.0f, d, ne01);
- #endif
  }
  }
  //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9711,9 +9747,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
  }
  return;
  }
+ #elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
  #endif
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
  GGML_ASSERT(nb10 == sizeof(float));
 
@@ -9743,20 +9786,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
  assert(id*sizeof(float) <= params->wsize);
  }
 
- #if defined(GGML_USE_CLBLAST)
- const float * x = wdata;
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- GGML_TYPE_F32);
- #else
  const float * x = wdata;
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -9768,7 +9797,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
  1.0f, y, ne10,
  x, ne00,
  0.0f, d, ne01);
- #endif
  }
  }
 
@@ -9931,9 +9959,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
  }
  return;
  }
+ #elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
  #endif
 
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
  if (params->ith != 0) {
  return;
@@ -9956,9 +9991,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
- #if defined(GGML_USE_CLBLAST)
- const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
- #else
  {
  size_t id = 0;
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9970,23 +10002,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
  }
 
  const float * x = wdata;
- #endif
 
- #if defined(GGML_USE_CLBLAST)
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- type);
- #else
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
  ne11, ne01, ne10,
  1.0f, y, ne10,
  x, ne00,
  0.0f, d, ne01);
- #endif
  }
  }
 
@@ -13810,11 +13831,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
  // reached a leaf node, not part of the gradient graph (e.g. a constant)
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
+ if (strlen(node->name) == 0) {
+ snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+ }
+
  cgraph->leafs[cgraph->n_leafs] = node;
  cgraph->n_leafs++;
  } else {
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
+ if (strlen(node->name) == 0) {
+ snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+ }
+
  cgraph->nodes[cgraph->n_nodes] = node;
  cgraph->grads[cgraph->n_nodes] = node->grad;
  cgraph->n_nodes++;
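
Unnamed tensors now get deterministic names ("leaf_%d" / "node_%d") as the graph is assembled, which is what makes the name-based lookups and the export format below usable for anonymous tensors. A sketch, assuming the usual build-forward flow (the by-value ggml_build_forward signature is assumed from the era's API):

    #include "ggml.h"

    /* anonymous graph tensors become addressable by their auto-assigned names (sketch) */
    static struct ggml_tensor * first_node(struct ggml_tensor * result) {
        struct ggml_cgraph gf = ggml_build_forward(result);
        return ggml_graph_get_tensor(&gf, "node_0");   /* NULL if the graph has no nodes */
    }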
@@ -14165,9 +14194,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
  }
  else
+ #elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
+ node->n_tasks = 1; // TODO: this actually is doing nothing
+ // the threads are still spinning
+ cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
+ }
+ else
  #endif
  if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
  node->n_tasks = 1; // TODO: this actually is doing nothing
  // the threads are still spinning
@@ -14181,13 +14217,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  #endif
  } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
  cur = 0;
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
  node->n_tasks = 1;
  }
  #endif
  } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
  node->n_tasks = 1;
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -14521,6 +14557,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
  }
  }
 
+ struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+ for (int i = 0; i < cgraph->n_leafs; i++) {
+ struct ggml_tensor * leaf = cgraph->leafs[i];
+
+ if (strcmp(leaf->name, name) == 0) {
+ return leaf;
+ }
+ }
+
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ struct ggml_tensor * node = cgraph->nodes[i];
+
+ if (strcmp(node->name, name) == 0) {
+ return node;
+ }
+ }
+
+ return NULL;
+ }
+
+ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+ const int64_t * ne = tensor->ne;
+ const size_t * nb = tensor->nb;
+
+ fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+ ggml_type_name(tensor->type),
+ ggml_op_name (tensor->op),
+ tensor->n_dims,
+ ne[0], ne[1], ne[2], ne[3],
+ nb[0], nb[1], nb[2], nb[3],
+ tensor->data,
+ tensor->name);
+ }
+
+ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+ const int64_t * ne = tensor->ne;
+ const size_t * nb = tensor->nb;
+
+ fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+ arg,
+ ggml_type_name(tensor->type),
+ ggml_op_name (tensor->op),
+ tensor->n_dims,
+ ne[0], ne[1], ne[2], ne[3],
+ nb[0], nb[1], nb[2], nb[3],
+ tensor->n_tasks,
+ tensor->data,
+ tensor->name);
+ }
+
+ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+ assert(cgraph->work == NULL);
+ assert(cgraph->work_size == 0);
+
+ uint64_t size_eval = 0;
+
+ // compute size of intermediate results
+ // TODO: does not take into account scratch buffers !!!!
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ size_eval += ggml_nbytes(cgraph->nodes[i]);
+ }
+
+ // print
+ {
+ FILE * fout = stdout;
+
+ fprintf(fout, "\n");
+ fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
+ fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+ fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
+ fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
+ fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
+
+ // header
+ fprintf(fout, "\n");
+ fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+ "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
+ ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+ GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
+ GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+ GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+ }
+
+ // header
+ fprintf(fout, "\n");
+ fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+ "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+ if (cgraph->nodes[i]->src0) {
+ ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+ }
+
+ if (cgraph->nodes[i]->src1) {
+ ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+ }
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ if (cgraph->nodes[i]->opt[j]) {
+ ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+ }
+ }
+
+ fprintf(fout, "\n");
+ }
+
+ fprintf(fout, "\n");
+ }
+
+ // write binary data
+ {
+ FILE * fout = fopen(fname, "wb");
+
+ if (!fout) {
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+ return;
+ }
+
+ // header
+ {
+ const uint32_t magic = GGML_FILE_MAGIC;
+ const uint32_t version = GGML_FILE_VERSION;
+ const uint32_t n_leafs = cgraph->n_leafs;
+ const uint32_t nodes = cgraph->n_nodes;
+
+ fwrite(&magic, sizeof(uint32_t), 1, fout);
+ fwrite(&version, sizeof(uint32_t), 1, fout);
+ fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
+ fwrite(&nodes, sizeof(uint32_t), 1, fout);
+ fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+ }
+
+ // leafs
+ {
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
+ const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+ const uint32_t type = tensor->type;
+ const uint32_t op = tensor->op;
+ const uint32_t n_dims = tensor->n_dims;
+
+ fwrite(&type, sizeof(uint32_t), 1, fout);
+ fwrite(&op, sizeof(uint32_t), 1, fout);
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ const uint64_t ne = tensor->ne[j];
+ const uint64_t nb = tensor->nb[j];
+
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
+ }
+
+ // store the pointer address
+ {
+ const uint64_t ptr = (uint64_t) tensor->data;
+
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
+ }
+
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+ // dump the data
+ // TODO: pad this to 32 byte boundary
+ {
+ const size_t size = ggml_nbytes(tensor);
+
+ fwrite(tensor->data, sizeof(char), size, fout);
+ }
+ }
+ }
+
+ // nodes
+ {
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+ const uint32_t type = tensor->type;
+ const uint32_t op = tensor->op;
+ const uint32_t n_dims = tensor->n_dims;
+
+ fwrite(&type, sizeof(uint32_t), 1, fout);
+ fwrite(&op, sizeof(uint32_t), 1, fout);
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ const uint64_t ne = tensor->ne[j];
+ const uint64_t nb = tensor->nb[j];
+
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
+ }
+
+ // store the pointer address
+ {
+ const uint64_t ptr = (uint64_t) tensor->data;
+
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
+ }
+
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+ // output the op arguments
+ {
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+ args[0] = tensor->src0;
+ args[1] = tensor->src1;
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ args[2 + j] = tensor->opt[j];
+ }
+
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ if (args[j]) {
+ int32_t idx = -1;
+
+ // check if leaf
+ {
+ for (int k = 0; k < cgraph->n_leafs; ++k) {
+ if (args[j] == cgraph->leafs[k]) {
+ idx = k;
+ break;
+ }
+ }
+ }
+
+ // check if node
+ if (idx == -1) {
+ for (int k = 0; k < cgraph->n_nodes; ++k) {
+ if (args[j] == cgraph->nodes[k]) {
+ idx = GGML_MAX_NODES + k;
+ break;
+ }
+ }
+ }
+
+ if (idx == -1) {
+ fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+ return;
+ }
+
+ fwrite(&idx, sizeof(int32_t), 1, fout);
+ } else {
+ const int32_t nul = -1;
+
+ fwrite(&nul, sizeof(int32_t), 1, fout);
+ }
+ }
+ }
+ }
+ }
+
+ fclose(fout);
+ }
+ }
+
+ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+ assert(*ctx_data == NULL);
+ assert(*ctx_eval == NULL);
+
+ struct ggml_cgraph result = { 0 };
+
+ struct ggml_tensor * data = NULL;
+
+ // read file into data
+ {
+ FILE * fin = fopen(fname, "rb");
+
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+ return result;
+ }
+
+ size_t fsize = 0;
+
+ fseek(fin, 0, SEEK_END);
+ fsize = ftell(fin);
+ fseek(fin, 0, SEEK_SET);
+
+ // create the data context
+ {
+ const size_t overhead = 1*ggml_tensor_overhead();
+
+ struct ggml_init_params params = {
+ .mem_size = fsize + overhead,
+ .mem_buffer = NULL,
+ .no_alloc = false,
+ };
+
+ *ctx_data = ggml_init(params);
+
+ if (!*ctx_data) {
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+ return result;
+ }
+ }
+
+ data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+ fread(data->data, sizeof(char), fsize, fin);
+
+ fclose(fin);
+ }
+
+ // populate result
+ {
+ char * ptr = (char *) data->data;
+
+ const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+ if (magic != GGML_FILE_MAGIC) {
+ fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+ return result;
+ }
+
+ const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+ if (version != GGML_FILE_VERSION) {
+ fprintf(stderr, "%s: invalid version number\n", __func__);
+ return result;
+ }
+
+ const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+ const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+ const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+ result.n_leafs = n_leafs;
+ result.n_nodes = n_nodes;
+
+ // create the data context
+ {
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+ struct ggml_init_params params = {
+ .mem_size = size_eval + overhead,
+ .mem_buffer = NULL,
+ .no_alloc = true,
+ };
+
+ *ctx_eval = ggml_init(params);
+
+ if (!*ctx_eval) {
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+ return result;
+ }
+ }
+
+ // leafs
+ {
+ uint32_t type;
+ uint32_t op;
+ uint32_t n_dims;
+
+ for (uint32_t i = 0; i < n_leafs; ++i) {
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS];
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ uint64_t ne_cur;
+ uint64_t nb_cur;
+
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+ ne[j] = ne_cur;
+ nb[j] = nb_cur;
+ }
+
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = (enum ggml_op) op;
+
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+ tensor->data = (void *) ptr;
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ result.leafs[i] = tensor;
+
+ ptr += ggml_nbytes(tensor);
+
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+ }
+ }
+
+ ggml_set_no_alloc(*ctx_eval, false);
+
+ // nodes
+ {
+ uint32_t type;
+ uint32_t op;
+ uint32_t n_dims;
+
+ for (uint32_t i = 0; i < n_nodes; ++i) {
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS];
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ uint64_t ne_cur;
+ uint64_t nb_cur;
+
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+ ne[j] = ne_cur;
+ nb[j] = nb_cur;
+ }
+
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = (enum ggml_op) op;
+
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ // parse args
+ {
+ struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
+ &tensor->src0,
+ &tensor->src1,
+ };
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ args[2 + j] = &tensor->opt[j];
+ }
+
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+
+ if (arg_idx == -1) {
+ continue;
+ }
+
+ if (arg_idx < GGML_MAX_NODES) {
+ *args[j] = result.leafs[arg_idx];
+ } else {
+ *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+ }
+ }
+ }
+
+ result.nodes[i] = tensor;
+
+ fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+ }
+ }
+ }
+
+ return result;
+ }
+
  void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
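
The new export/import pair rounds out this release: ggml_graph_export() prints a text summary and writes a binary image of the graph (header, leaf data, node metadata, argument indices), and ggml_graph_import() rebuilds it into two contexts: *ctx_data holds the raw file bytes, *ctx_eval holds the reconstructed tensors, whose leaf data points back into the file image courtesy of the no_alloc toggle. A round-trip sketch (file and tensor names are hypothetical):

    #include "ggml.h"

    /* serialize a built graph, then reload it and look up a tensor by name (sketch) */
    static void roundtrip(struct ggml_cgraph * gf) {
        ggml_graph_export(gf, "graph.bin");

        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;

        struct ggml_cgraph gf2 = ggml_graph_import("graph.bin", &ctx_data, &ctx_eval);
        struct ggml_tensor * out = ggml_graph_get_tensor(&gf2, "output"); /* hypothetical name */
        (void) out;
    }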
 
@@ -14538,7 +15049,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
  i,
  node->ne[0], node->ne[1], node->ne[2],
- GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+ GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
  (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
  (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
  (double) node->perf_time_us / 1000.0,
@@ -14552,7 +15063,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
  i,
  node->ne[0], node->ne[1],
- GGML_OP_LABEL[node->op]);
+ GGML_OP_NAME[node->op]);
  }
 
  for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14560,7 +15071,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  continue;
  }
 
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
  }
 
  GGML_PRINT("========================================\n");