llama_cpp 0.1.3 → 0.1.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ed569e816938dfca79c345228cf592eb81827c04acfeae3c8e26e0635bbc518b
-  data.tar.gz: f4a899df0cf450370d7dc75e486a17617f1af0cbcacd9d9a8c7d3bde10016441
+  metadata.gz: f08992d10701b3ac0ab87c32d8a28d7f81101a4896e3300c461d7015f6486814
+  data.tar.gz: 38fd72fe9cdd596f7878ef902ddf8ec8e36954bbac50388fe8ef8437a93bfe29
 SHA512:
-  metadata.gz: 0f3d38eed6628e8d68efc741fe00024fb0c5199fb2e1a33d6f04d9299e1c59deb969e3eafe36190ade84522e70ddca50956fbee9b6406edc5d613f654889a83a
-  data.tar.gz: 0b1705a8d70564a59ad6472b03dc0241727766d4121e26a2e9c3c0d4725ddf2ccf65cb8f4a862688661ea9fa2b1c8858cd6e5e722821e6c2c30c91401475ef74
+  metadata.gz: e3d024db508be6cbe7e644c4a9295da97742b45f921c9c5d64a7f4b4eb6be624e79f4c63d39c226566dbb9676215ae3b986828095a185cb2069547a12cf651a0
+  data.tar.gz: d884334f2d77a7204f0bc96c037fa86ee6fdf3f2879c5c5bd721be336dc743a0034733fc566c114bc6f22e620e5d79ccd4c67b6ded7d929d1949315b31445701
data/CHANGELOG.md CHANGED
@@ -1,25 +1,27 @@
-## [Unreleased]
+## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
+
+- Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
 
 ## [[0.1.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.2...v0.1.3)] - 2023-05-27
 
-- Bump bundled llama.cpp from master-265db98 to master-66874d4
+- Bump bundled llama.cpp from master-265db98 to master-66874d4.
 
 ## [[0.1.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.1...v0.1.2)] - 2023-05-22
 
 **Breaking Changes**
 
-- Bump bundled llama.cpp from master-6986c78 to master-265db98
-- bump LLAMA_FILE_VERSION to 3
+- Bump bundled llama.cpp from master-6986c78 to master-265db98.
+- bump LLAMA_FILE_VERSION to 3.
 
 ## [[0.1.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.0...v0.1.1)] - 2023-05-21
 
-- Add load_session_file method to Context
-- Add save_session_file method to Context
+- Add load_session_file method to Context.
+- Add save_session_file method to Context.
 
 **Breaking Changes**
 
-- Bump bundled llama.cpp from master-173d0e6 to master-6986c78
-- bump LLAMA_FILE_VERSION to 2
+- Bump bundled llama.cpp from master-173d0e6 to master-6986c78.
+- bump LLAMA_FILE_VERSION to 2.
 
 ## [[0.1.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.7...v0.1.0)] - 2023-05-20
 
data/ext/llama_cpp/src/ggml-opencl.c CHANGED
@@ -469,16 +469,11 @@ void ggml_cl_init(void) {
 
     size_t ext_str_size;
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
-    char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size);
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
     // Check if ext_buffer contains cl_khr_fp16
-    for (size_t i = 0; i < ext_str_size - 12; i++) {
-        if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
-            fp16_support = true;
-            break;
-        }
-    }
-    free(ext_buffer);
+    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
     fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
 
     cl_context_properties properties[] = {
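
The rewrite above replaces a byte-by-byte memcmp scan (which could read past the end of an unterminated extension string) with a null-terminated copy and a single strstr call. A minimal standalone sketch of the same pattern (hypothetical helper; the fixed buffer size is an assumption, the real code sizes the buffer with alloca):

```c
// Copy a possibly unterminated extension list, null-terminate it,
// then substring-search -- the same idea as the hunk above.
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool cl_has_ext(const char * ext_list, size_t len, const char * name) {
    char buf[4096];                               // assumed upper bound
    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
    memcpy(buf, ext_list, len);
    buf[len] = '\0';                              // makes strstr() safe
    return strstr(buf, name) != NULL;
}

int main(void) {
    const char exts[] = "cl_khr_fp64 cl_khr_fp16 cl_khr_il_program";
    printf("fp16: %s\n", cl_has_ext(exts, strlen(exts), "cl_khr_fp16") ? "true" : "false");
    return 0;
}
```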
@@ -672,7 +667,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_CL) {
-        d_X = *(cl_mem*) src0->data;
+        d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
     }
@@ -748,7 +743,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_CL) {
-        d_X = *(cl_mem*) src0->data;
+        d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
     }
@@ -873,7 +868,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     if (src0->backend == GGML_BACKEND_CPU) {
         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
     } else if (src0->backend == GGML_BACKEND_CL) {
-        d_Q = *(cl_mem*) src0->data;
+        d_Q = (cl_mem) src0->data;
     } else {
         GGML_ASSERT(false);
     }
@@ -1016,14 +1011,13 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
-    *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
         }
     }
 
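The d_X/d_Q/dst hunks in this file all make the same change of convention: tensor->data now holds the cl_mem handle itself rather than a heap-allocated pointer to one, removing a malloc per tensor and the associated leak risk. A compilable sketch of the two conventions (struct reduced to the essentials; the cl_mem stand-in mirrors the opaque typedef in CL/cl.h):

```c
#include <stdlib.h>

typedef struct _cl_mem * cl_mem;      // opaque handle, as in CL/cl.h

struct tensor { void * data; };

// old convention: extra allocation, must be freed separately
static void attach_old(struct tensor * t, cl_mem h) {
    cl_mem * p = malloc(sizeof(cl_mem));
    *p = h;
    t->data = p;                      // read back with *(cl_mem *) t->data
}

// new convention: the handle is itself a pointer, store it directly
static void attach_new(struct tensor * t, cl_mem h) {
    t->data = (void *) h;             // read back with (cl_mem) t->data
}
```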
data/ext/llama_cpp/src/ggml.c CHANGED
@@ -186,10 +186,12 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __F16C__
 
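The two added lines wrap the <immintrin.h> include in a RISC-V guard; the new #if needs one more closing #endif, which is the trailing addition. A reconstructed sketch of the effective nesting (the enclosing architecture conditionals are elided):

```c
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if !defined(__riscv)   // added: RISC-V toolchains ship no <immintrin.h>
#include <immintrin.h>
#endif                  // one extra #endif now balances the added #if
#endif
```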
@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
 
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
     "DUP",
@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }
 
+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
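
ggml_tensor_overhead() exposes the fixed per-tensor bookkeeping cost (object header, tensor struct, alignment slack), so callers can size a context pool exactly; ggml_graph_import further down uses it for exactly this. A minimal sketch (helper name and sizes are illustrative):

```c
#include "ggml.h"

// Size a context for n_tensors tensors plus data_bytes of tensor data,
// without guessing at ggml's internal bookkeeping.
static struct ggml_context * make_ctx(size_t n_tensors, size_t data_bytes) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ data_bytes + n_tensors*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,      // let ggml allocate the pool
        /*.no_alloc   =*/ false,
    };
    return ggml_init(params);
}
```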
@@ -4017,6 +4026,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
+void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+    return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(struct ggml_context * ctx) {
+    return ctx->mem_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
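
ggml_set_no_alloc() toggles whether new tensors receive data storage from the context pool; together with ggml_get_mem_buffer()/ggml_get_mem_size() it supports metadata-only tensors backed by external memory, the pattern ggml_graph_import relies on below. A sketch (buffer management simplified):

```c
#include "ggml.h"

// Create tensor metadata only, point it at caller-managed storage,
// then restore normal pool allocation.
static void use_external_storage(struct ggml_context * ctx, void * buf) {
    ggml_set_no_alloc(ctx, true);      // no data allocation from here on
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    t->data = buf;                     // external, caller-owned memory
    ggml_set_no_alloc(ctx, false);     // back to normal allocation
}
```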
@@ -4061,7 +4082,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
+        size_needed += GGML_TENSOR_SIZE;
 
         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4098,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }
@@ -4093,7 +4115,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = sizeof(struct ggml_tensor),
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };
@@ -4509,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+        if (strcmp(cur->name, name) == 0) {
+            return cur;
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 // ggml_dup
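
ggml_get_tensor() walks the context's object list and returns the first tensor whose name matches, or NULL. A usage sketch (shape and name are illustrative; assumes ggml_set_name(), which this llama.cpp revision provides):

```c
#include "ggml.h"

static void lookup_example(struct ggml_context * ctx) {
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
    ggml_set_name(w, "layers.0.attention.wq");

    // later: retrieve the same tensor by name
    struct ggml_tensor * found = ggml_get_tensor(ctx, "layers.0.attention.wq");
    GGML_ASSERT(found == w);
}
```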
@@ -6303,7 +6342,7 @@ struct ggml_tensor * ggml_alibi(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;
@@ -13792,11 +13831,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;
@@ -14510,6 +14557,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }
 
+struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
+static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t  * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+            ggml_type_name(tensor->type),
+            ggml_op_name  (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->data,
+            tensor->name);
+}
+
+static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t  * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+            arg,
+            ggml_type_name(tensor->type),
+            ggml_op_name  (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->n_tasks,
+            tensor->data,
+            tensor->name);
+}
+
+void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+    assert(cgraph->work      == NULL);
+    assert(cgraph->work_size == 0);
+
+    uint64_t size_eval = 0;
+
+    // compute size of intermediate results
+    // TODO: does not take into account scratch buffers !!!!
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        size_eval += ggml_nbytes(cgraph->nodes[i]);
+    }
+
+    // print
+    {
+        FILE * fout = stdout;
+
+        fprintf(fout, "\n");
+        fprintf(fout, "%-16s %8x\n",   "magic",   GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n",   "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n",   "leafs",   cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n",   "nodes",   cgraph->n_nodes);
+        fprintf(fout, "%-16s %8llu\n", "eval",    size_eval);
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_leafs; ++i) {
+            ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+            GGML_ASSERT(cgraph->leafs[i]->op   == GGML_OP_NONE);
+            GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+            GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+        }
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+            if (cgraph->nodes[i]->src0) {
+                ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+            }
+
+            if (cgraph->nodes[i]->src1) {
+                ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+            }
+
+            for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                if (cgraph->nodes[i]->opt[j]) {
+                    ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+                }
+            }
+
+            fprintf(fout, "\n");
+        }
+
+        fprintf(fout, "\n");
+    }
+
+    // write binary data
+    {
+        FILE * fout = fopen(fname, "wb");
+
+        if (!fout) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return;
+        }
+
+        // header
+        {
+            const uint32_t magic   = GGML_FILE_MAGIC;
+            const uint32_t version = GGML_FILE_VERSION;
+            const uint32_t n_leafs = cgraph->n_leafs;
+            const uint32_t nodes   = cgraph->n_nodes;
+
+            fwrite(&magic,     sizeof(uint32_t), 1, fout);
+            fwrite(&version,   sizeof(uint32_t), 1, fout);
+            fwrite(&n_leafs,   sizeof(uint32_t), 1, fout);
+            fwrite(&nodes,     sizeof(uint32_t), 1, fout);
+            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+        }
+
+        // leafs
+        {
+            for (int i = 0; i < cgraph->n_leafs; ++i) {
+                const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+                const uint32_t type   = tensor->type;
+                const uint32_t op     = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type,   sizeof(uint32_t), 1, fout);
+                fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // dump the data
+                // TODO: pad this to 32 byte boundary
+                {
+                    const size_t size = ggml_nbytes(tensor);
+
+                    fwrite(tensor->data, sizeof(char), size, fout);
+                }
+            }
+        }
+
+        // nodes
+        {
+            for (int i = 0; i < cgraph->n_nodes; ++i) {
+                const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+                const uint32_t type   = tensor->type;
+                const uint32_t op     = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type,   sizeof(uint32_t), 1, fout);
+                fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // output the op arguments
+                {
+                    struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+                    args[0] = tensor->src0;
+                    args[1] = tensor->src1;
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        if (args[j]) {
+                            int32_t idx = -1;
+
+                            // check if leaf
+                            {
+                                for (int k = 0; k < cgraph->n_leafs; ++k) {
+                                    if (args[j] == cgraph->leafs[k]) {
+                                        idx = k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            // check if node
+                            if (idx == -1) {
+                                for (int k = 0; k < cgraph->n_nodes; ++k) {
+                                    if (args[j] == cgraph->nodes[k]) {
+                                        idx = GGML_MAX_NODES + k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            if (idx == -1) {
+                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                                return;
+                            }
+
+                            fwrite(&idx, sizeof(int32_t), 1, fout);
+                        } else {
+                            const int32_t nul = -1;
+
+                            fwrite(&nul, sizeof(int32_t), 1, fout);
+                        }
+                    }
+                }
+            }
+        }
+
+        fclose(fout);
+    }
+}
+
+struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+    assert(*ctx_data == NULL);
+    assert(*ctx_eval == NULL);
+
+    struct ggml_cgraph result = { 0 };
+
+    struct ggml_tensor * data = NULL;
+
+    // read file into data
+    {
+        FILE * fin = fopen(fname, "rb");
+
+        if (!fin) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return result;
+        }
+
+        size_t fsize = 0;
+
+        fseek(fin, 0, SEEK_END);
+        fsize = ftell(fin);
+        fseek(fin, 0, SEEK_SET);
+
+        // create the data context
+        {
+            const size_t overhead = 1*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = fsize + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = false,
+            };
+
+            *ctx_data = ggml_init(params);
+
+            if (!*ctx_data) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+        fread(data->data, sizeof(char), fsize, fin);
+
+        fclose(fin);
+    }
+
+    // populate result
+    {
+        char * ptr = (char *) data->data;
+
+        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+            return result;
+        }
+
+        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+        if (version != GGML_FILE_VERSION) {
+            fprintf(stderr, "%s: invalid version number\n", __func__);
+            return result;
+        }
+
+        const uint32_t n_leafs   = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+        const uint32_t n_nodes   = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+        result.n_leafs = n_leafs;
+        result.n_nodes = n_nodes;
+
+        // create the data context
+        {
+            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = size_eval + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = true,
+            };
+
+            *ctx_eval = ggml_init(params);
+
+            if (!*ctx_eval) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        // leafs
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_leafs; ++i) {
+                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t  nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                tensor->data = (void *) ptr;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                result.leafs[i] = tensor;
+
+                ptr += ggml_nbytes(tensor);
+
+                fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+
+        ggml_set_no_alloc(*ctx_eval, false);
+
+        // nodes
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_nodes; ++i) {
+                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t  nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                // parse args
+                {
+                    struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
+                        &tensor->src0,
+                        &tensor->src1,
+                    };
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = &tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+
+                        if (arg_idx == -1) {
+                            continue;
+                        }
+
+                        if (arg_idx < GGML_MAX_NODES) {
+                            *args[j] = result.leafs[arg_idx];
+                        } else {
+                            *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+                        }
+                    }
+                }
+
+                result.nodes[i] = tensor;
+
+                fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+    }
+
+    return result;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
 
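Taken together: ggml_graph_export() prints a text summary to stdout and writes a self-contained binary file; ggml_graph_import() rebuilds the graph from that file using two contexts, one owning the raw bytes and one owning the reconstructed tensors. A round-trip sketch (file name illustrative):

```c
#include "ggml.h"

static void roundtrip(struct ggml_cgraph * gf) {
    ggml_graph_export(gf, "graph.ggml");     // summary to stdout + binary file

    struct ggml_context * ctx_data = NULL;   // will own the raw file bytes
    struct ggml_context * ctx_eval = NULL;   // will own the rebuilt tensors
    struct ggml_cgraph gf2 = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);

    // ... evaluate gf2 (e.g. via ggml_graph_compute), then release both
    ggml_free(ctx_eval);
    ggml_free(ctx_data);
}
```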
@@ -14527,7 +15049,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -14541,7 +15063,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
+                GGML_OP_NAME[node->op]);
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14549,7 +15071,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");
data/ext/llama_cpp/src/ggml.h CHANGED
@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
+#define GGML_MAX_NAME 32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
@@ -372,11 +373,13 @@ extern "C" {
 
         void * data;
 
-        char name[32];
+        char name[GGML_MAX_NAME];
 
         char padding[16];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
@@ -429,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -437,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -444,7 +451,11 @@ extern "C" {
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size  (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
@@ -484,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
@@ -970,6 +983,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
data/ext/llama_cpp/src/llama.cpp CHANGED
@@ -42,6 +42,7 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
@@ -58,6 +59,7 @@ static const size_t MB = 1024*1024;
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull * MB },
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -69,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull * MB },
         { MODEL_7B,  512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -81,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   682ull * MB },
         { MODEL_7B,  1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
@@ -94,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * MB },
         { MODEL_7B,   768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -899,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -932,6 +938,7 @@ static void llama_model_load_internal(
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
            case 32: model.type = e_model::MODEL_7B; break;
            case 40: model.type = e_model::MODEL_13B; break;
            case 60: model.type = e_model::MODEL_30B; break;
data/ext/llama_cpp/src/llama.h CHANGED
@@ -31,6 +31,11 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
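
The new macro lets bindings and client code detect at compile time whether the bundled llama.cpp can offload layers to the GPU. A sketch (n_gpu_layers is a llama_context_params field at this revision; the value 32 is illustrative):

```c
#include "llama.h"

static struct llama_context_params make_params(void) {
    struct llama_context_params params = llama_context_default_params();
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    params.n_gpu_layers = 32;   // only meaningful in cuBLAS/CLBlast builds
#endif
    return params;
}
```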
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.3'
+  VERSION = '0.1.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-66874d4'
+  LLAMA_CPP_VERSION = 'master-ffb06a3'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-26 00:00:00.000000000 Z
+date: 2023-06-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: