llama_cpp 0.1.3 → 0.1.4

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ed569e816938dfca79c345228cf592eb81827c04acfeae3c8e26e0635bbc518b
-  data.tar.gz: f4a899df0cf450370d7dc75e486a17617f1af0cbcacd9d9a8c7d3bde10016441
+  metadata.gz: f08992d10701b3ac0ab87c32d8a28d7f81101a4896e3300c461d7015f6486814
+  data.tar.gz: 38fd72fe9cdd596f7878ef902ddf8ec8e36954bbac50388fe8ef8437a93bfe29
 SHA512:
-  metadata.gz: 0f3d38eed6628e8d68efc741fe00024fb0c5199fb2e1a33d6f04d9299e1c59deb969e3eafe36190ade84522e70ddca50956fbee9b6406edc5d613f654889a83a
-  data.tar.gz: 0b1705a8d70564a59ad6472b03dc0241727766d4121e26a2e9c3c0d4725ddf2ccf65cb8f4a862688661ea9fa2b1c8858cd6e5e722821e6c2c30c91401475ef74
+  metadata.gz: e3d024db508be6cbe7e644c4a9295da97742b45f921c9c5d64a7f4b4eb6be624e79f4c63d39c226566dbb9676215ae3b986828095a185cb2069547a12cf651a0
+  data.tar.gz: d884334f2d77a7204f0bc96c037fa86ee6fdf3f2879c5c5bd721be336dc743a0034733fc566c114bc6f22e620e5d79ccd4c67b6ded7d929d1949315b31445701
data/CHANGELOG.md CHANGED
@@ -1,25 +1,27 @@
-## [Unreleased]
+## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
+
+- Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
 
 ## [[0.1.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.2...v0.1.3)] - 2023-05-27
 
-- Bump bundled llama.cpp from master-265db98 to master-66874d4
+- Bump bundled llama.cpp from master-265db98 to master-66874d4.
 
 ## [[0.1.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.1...v0.1.2)] - 2023-05-22
 
 **Breaking Changes**
 
-- Bump bundled llama.cpp from master-6986c78 to master-265db98
-- bump LLAMA_FILE_VERSION to 3
+- Bump bundled llama.cpp from master-6986c78 to master-265db98.
+- bump LLAMA_FILE_VERSION to 3.
 
 ## [[0.1.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.0...v0.1.1)] - 2023-05-21
 
-- Add load_session_file method to Context
-- Add save_session_file method to Context
+- Add load_session_file method to Context.
+- Add save_session_file method to Context.
 
 **Breaking Changes**
 
-- Bump bundled llama.cpp from master-173d0e6 to master-6986c78
-- bump LLAMA_FILE_VERSION to 2
+- Bump bundled llama.cpp from master-173d0e6 to master-6986c78.
+- bump LLAMA_FILE_VERSION to 2.
 
 ## [[0.1.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.7...v0.1.0)] - 2023-05-20
 
data/ext/llama_cpp/src/ggml-opencl.cpp CHANGED
@@ -469,16 +469,11 @@ void ggml_cl_init(void) {
 
     size_t ext_str_size;
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
-    char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size);
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
     // Check if ext_buffer contains cl_khr_fp16
-    for (size_t i = 0; i < ext_str_size - 12; i++) {
-        if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
-            fp16_support = true;
-            break;
-        }
-    }
-    free(ext_buffer);
+    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
     fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
 
     cl_context_properties properties[] = {
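The change above swaps a byte-by-byte memcmp scan for strstr, which requires a NUL-terminated string; that is why the buffer grows by one byte and gets an explicit terminator. A minimal sketch of the same pattern, with a hypothetical helper name and a fixed-size stack buffer standing in for alloca:

```c
#include <CL/cl.h>
#include <stdbool.h>
#include <string.h>

// Illustrative only: checks whether a device advertises a given extension.
static bool device_has_extension(cl_device_id device, const char * ext) {
    size_t ext_str_size;
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);

    char buf[4096]; // assumption: large enough for the device's extension string
    if (ext_str_size >= sizeof(buf)) {
        return false;
    }
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, buf, NULL);
    buf[ext_str_size] = '\0'; // strstr needs NUL termination

    return strstr(buf, ext) != NULL; // e.g. ext = "cl_khr_fp16"
}
```

Using alloca in the real code also removes the free() call, since the buffer lives on the stack for the remainder of ggml_cl_init.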
@@ -672,7 +667,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_CL) {
-        d_X = *(cl_mem*) src0->data;
+        d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
     }
@@ -748,7 +743,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_CL) {
-        d_X = *(cl_mem*) src0->data;
+        d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
     }
@@ -873,7 +868,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         if (src0->backend == GGML_BACKEND_CPU) {
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
         } else if (src0->backend == GGML_BACKEND_CL) {
-            d_Q = *(cl_mem*) src0->data;
+            d_Q = (cl_mem) src0->data;
         } else {
             GGML_ASSERT(false);
         }
@@ -1016,14 +1011,13 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
-    *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
         }
     }
 
data/ext/llama_cpp/src/ggml.c CHANGED
@@ -186,10 +186,12 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __F16C__
 
@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
 
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
     "DUP",
@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }
 
+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
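ggml_tensor_overhead() exposes the fixed bookkeeping cost of one tensor: the object header, the tensor struct itself, and alignment slack. A hedged sizing sketch; the helper below is illustrative and not part of the diff, but ggml_graph_import further down uses exactly this pattern:

```c
#include "ggml.h"

// Estimate the mem_size needed for a context holding n_tensors tensors
// whose raw data adds up to data_bytes.
static size_t estimate_ctx_size(size_t n_tensors, size_t data_bytes) {
    return data_bytes + n_tensors * ggml_tensor_overhead();
}
```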
@@ -4017,6 +4026,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
+void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+    return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(struct ggml_context * ctx) {
+    return ctx->mem_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace
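The new accessors let callers flip no_alloc after ggml_init and inspect the context's memory pool. A small illustrative program (a sketch, assuming nothing beyond ggml.h as changed in this diff):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // from here on, new tensors get metadata only, no data buffer
    ggml_set_no_alloc(ctx, true);

    printf("pool at %p, %zu bytes\n", ggml_get_mem_buffer(ctx), ggml_get_mem_size(ctx));

    ggml_free(ctx);
    return 0;
}
```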
@@ -4061,7 +4082,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
+        size_needed += GGML_TENSOR_SIZE;
 
         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4098,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+                __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }
@@ -4093,7 +4115,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = sizeof(struct ggml_tensor),
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };
 
@@ -4509,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+        if (strcmp(cur->name, name) == 0) {
+            return cur;
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 // ggml_dup
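ggml_get_tensor walks the context's object list and returns the first tensor whose name matches. A hedged usage sketch (the helper and tensor name are illustrative; ggml_set_name predates this release):

```c
#include "ggml.h"

// Illustrative: register a named tensor, then look it up again by name.
static struct ggml_tensor * find_weight(struct ggml_context * ctx) {
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_name(w, "my_weight");

    return ggml_get_tensor(ctx, "my_weight"); // returns w
}
```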
@@ -6303,7 +6342,7 @@ struct ggml_tensor * ggml_alibi(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;
@@ -13792,11 +13831,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;
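With the auto-naming above, tensors that were never explicitly named receive leaf_%d or node_%d names during graph construction, so name-based lookups work on any graph. An illustrative sketch (it relies on ggml_graph_get_tensor, which is added further down in this diff):

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { .mem_size = 1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4); // never named
    struct ggml_cgraph gf = ggml_build_forward(ggml_sqr(ctx, a));

    // "leaf_0" was assigned to `a` automatically in ggml_visit_parents
    printf("found: %s\n", ggml_graph_get_tensor(&gf, "leaf_0") ? "yes" : "no");

    ggml_free(ctx);
    return 0;
}
```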
@@ -14510,6 +14557,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }
 
+struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
+static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+            ggml_type_name(tensor->type),
+            ggml_op_name (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->data,
+            tensor->name);
+}
+
+static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+            arg,
+            ggml_type_name(tensor->type),
+            ggml_op_name (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->n_tasks,
+            tensor->data,
+            tensor->name);
+}
+
+void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+    assert(cgraph->work == NULL);
+    assert(cgraph->work_size == 0);
+
+    uint64_t size_eval = 0;
+
+    // compute size of intermediate results
+    // TODO: does not take into account scratch buffers !!!!
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        size_eval += ggml_nbytes(cgraph->nodes[i]);
+    }
+
+    // print
+    {
+        FILE * fout = stdout;
+
+        fprintf(fout, "\n");
+        fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
+        fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_leafs; ++i) {
+            ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+            GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
+            GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+            GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+        }
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+            if (cgraph->nodes[i]->src0) {
+                ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+            }
+
+            if (cgraph->nodes[i]->src1) {
+                ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+            }
+
+            for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                if (cgraph->nodes[i]->opt[j]) {
+                    ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+                }
+            }
+
+            fprintf(fout, "\n");
+        }
+
+        fprintf(fout, "\n");
+    }
+
+    // write binary data
+    {
+        FILE * fout = fopen(fname, "wb");
+
+        if (!fout) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return;
+        }
+
+        // header
+        {
+            const uint32_t magic = GGML_FILE_MAGIC;
+            const uint32_t version = GGML_FILE_VERSION;
+            const uint32_t n_leafs = cgraph->n_leafs;
+            const uint32_t nodes = cgraph->n_nodes;
+
+            fwrite(&magic, sizeof(uint32_t), 1, fout);
+            fwrite(&version, sizeof(uint32_t), 1, fout);
+            fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
+            fwrite(&nodes, sizeof(uint32_t), 1, fout);
+            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+        }
+
+        // leafs
+        {
+            for (int i = 0; i < cgraph->n_leafs; ++i) {
+                const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+                const uint32_t type = tensor->type;
+                const uint32_t op = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type, sizeof(uint32_t), 1, fout);
+                fwrite(&op, sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // dump the data
+                // TODO: pad this to 32 byte boundary
+                {
+                    const size_t size = ggml_nbytes(tensor);
+
+                    fwrite(tensor->data, sizeof(char), size, fout);
+                }
+            }
+        }
+
+        // nodes
+        {
+            for (int i = 0; i < cgraph->n_nodes; ++i) {
+                const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+                const uint32_t type = tensor->type;
+                const uint32_t op = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type, sizeof(uint32_t), 1, fout);
+                fwrite(&op, sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // output the op arguments
+                {
+                    struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+                    args[0] = tensor->src0;
+                    args[1] = tensor->src1;
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        if (args[j]) {
+                            int32_t idx = -1;
+
+                            // check if leaf
+                            {
+                                for (int k = 0; k < cgraph->n_leafs; ++k) {
+                                    if (args[j] == cgraph->leafs[k]) {
+                                        idx = k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            // check if node
+                            if (idx == -1) {
+                                for (int k = 0; k < cgraph->n_nodes; ++k) {
+                                    if (args[j] == cgraph->nodes[k]) {
+                                        idx = GGML_MAX_NODES + k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            if (idx == -1) {
+                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                                return;
+                            }
+
+                            fwrite(&idx, sizeof(int32_t), 1, fout);
+                        } else {
+                            const int32_t nul = -1;
+
+                            fwrite(&nul, sizeof(int32_t), 1, fout);
+                        }
+                    }
+                }
+            }
+        }
+
+        fclose(fout);
+    }
+}
+
+struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+    assert(*ctx_data == NULL);
+    assert(*ctx_eval == NULL);
+
+    struct ggml_cgraph result = { 0 };
+
+    struct ggml_tensor * data = NULL;
+
+    // read file into data
+    {
+        FILE * fin = fopen(fname, "rb");
+
+        if (!fin) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return result;
+        }
+
+        size_t fsize = 0;
+
+        fseek(fin, 0, SEEK_END);
+        fsize = ftell(fin);
+        fseek(fin, 0, SEEK_SET);
+
+        // create the data context
+        {
+            const size_t overhead = 1*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size = fsize + overhead,
+                .mem_buffer = NULL,
+                .no_alloc = false,
+            };
+
+            *ctx_data = ggml_init(params);
+
+            if (!*ctx_data) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+        fread(data->data, sizeof(char), fsize, fin);
+
+        fclose(fin);
+    }
+
+    // populate result
+    {
+        char * ptr = (char *) data->data;
+
+        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+            return result;
+        }
+
+        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+        if (version != GGML_FILE_VERSION) {
+            fprintf(stderr, "%s: invalid version number\n", __func__);
+            return result;
+        }
+
+        const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+        const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+        result.n_leafs = n_leafs;
+        result.n_nodes = n_nodes;
+
+        // create the data context
+        {
+            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size = size_eval + overhead,
+                .mem_buffer = NULL,
+                .no_alloc = true,
+            };
+
+            *ctx_eval = ggml_init(params);
+
+            if (!*ctx_eval) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        // leafs
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_leafs; ++i) {
+                type = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                tensor->data = (void *) ptr;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                result.leafs[i] = tensor;
+
+                ptr += ggml_nbytes(tensor);
+
+                fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+
+        ggml_set_no_alloc(*ctx_eval, false);
+
+        // nodes
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_nodes; ++i) {
+                type = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                // parse args
+                {
+                    struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
+                        &tensor->src0,
+                        &tensor->src1,
+                    };
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = &tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+
+                        if (arg_idx == -1) {
+                            continue;
+                        }
+
+                        if (arg_idx < GGML_MAX_NODES) {
+                            *args[j] = result.leafs[arg_idx];
+                        } else {
+                            *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+                        }
+                    }
+                }
+
+                result.nodes[i] = tensor;
+
+                fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+    }
+
+    return result;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
 
@@ -14527,7 +15049,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,
@@ -14541,7 +15063,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
+                GGML_OP_NAME[node->op]);
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14549,7 +15071,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");
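Taken together, ggml_graph_export writes a text summary to stdout plus a binary file (a header, then every leaf with its data, then every node with indices pointing at its arguments), and ggml_graph_import maps that file back into two contexts: *ctx_data owns the raw file bytes, while *ctx_eval holds reconstructed tensors whose data pointers alias the file image. A hedged round-trip sketch; the file name, buffer sizes, and the n_threads fixup are assumptions, not taken from the diff:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 1.5f);
    struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 2.5f);
    struct ggml_tensor * c = ggml_add(ctx, a, b);
    ggml_set_name(c, "out");

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_export(&gf, "graph.ggml"); // prints the summary and writes the binary file

    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;
    struct ggml_cgraph gi = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);
    gi.n_threads = 1; // the imported graph struct is zero-initialized

    ggml_graph_compute(ctx_eval, &gi);

    struct ggml_tensor * out = ggml_graph_get_tensor(&gi, "out");
    printf("out[0] = %f\n", ggml_get_f32_1d(out, 0)); // expect 4.0

    ggml_free(ctx_eval);
    ggml_free(ctx_data);
    ggml_free(ctx);
    return 0;
}
```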
data/ext/llama_cpp/src/ggml.h CHANGED
@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS 256
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_OPT 4
+#define GGML_MAX_NAME 32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \
@@ -372,11 +373,13 @@ extern "C" {
 
     void * data;
 
-    char name[32];
+    char name[GGML_MAX_NAME];
 
     char padding[16];
 };
 
+static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
 // computation graph
 struct ggml_cgraph {
     int n_nodes;
@@ -429,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name (enum ggml_op op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
@@ -437,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -444,7 +451,11 @@ extern "C" {
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,
@@ -484,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
@@ -970,6 +983,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
data/ext/llama_cpp/src/llama.cpp CHANGED
@@ -42,6 +42,7 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
@@ -58,6 +59,7 @@ static const size_t MB = 1024*1024;
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -69,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },
@@ -81,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 682ull * MB },
         { MODEL_7B, 1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
@@ -94,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 512ull * MB },
         { MODEL_7B, 768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
@@ -899,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
@@ -932,6 +938,7 @@ static void llama_model_load_internal(
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
            case 32: model.type = e_model::MODEL_7B; break;
            case 40: model.type = e_model::MODEL_13B; break;
            case 60: model.type = e_model::MODEL_30B; break;
data/ext/llama_cpp/src/llama.h CHANGED
@@ -31,6 +31,11 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
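Since the macro is defined in llama.h itself based on the build flags, downstream code can branch on GPU-offload support at compile time. A minimal illustrative check:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    printf("this build can offload model layers to the GPU (cuBLAS or CLBlast)\n");
#else
    printf("CPU-only build\n");
#endif
    return 0;
}
```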
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.3'
+  VERSION = '0.1.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-66874d4'
+  LLAMA_CPP_VERSION = 'master-ffb06a3'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-26 00:00:00.000000000 Z
+date: 2023-06-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: