llama_cpp 0.1.3 → 0.1.4
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -8
- data/ext/llama_cpp/src/ggml-opencl.cpp +8 -14
- data/ext/llama_cpp/src/ggml.c +532 -10
- data/ext/llama_cpp/src/ggml.h +20 -2
- data/ext/llama_cpp/src/llama.cpp +7 -0
- data/ext/llama_cpp/src/llama.h +5 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f08992d10701b3ac0ab87c32d8a28d7f81101a4896e3300c461d7015f6486814
|
4
|
+
data.tar.gz: 38fd72fe9cdd596f7878ef902ddf8ec8e36954bbac50388fe8ef8437a93bfe29
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e3d024db508be6cbe7e644c4a9295da97742b45f921c9c5d64a7f4b4eb6be624e79f4c63d39c226566dbb9676215ae3b986828095a185cb2069547a12cf651a0
|
7
|
+
data.tar.gz: d884334f2d77a7204f0bc96c037fa86ee6fdf3f2879c5c5bd721be336dc743a0034733fc566c114bc6f22e620e5d79ccd4c67b6ded7d929d1949315b31445701
|
data/CHANGELOG.md
CHANGED
@@ -1,25 +1,27 @@
|
|
1
|
-
## [
|
1
|
+
## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
|
2
|
+
|
3
|
+
- Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
|
2
4
|
|
3
5
|
## [[0.1.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.2...v0.1.3)] - 2023-05-27
|
4
6
|
|
5
|
-
- Bump bundled llama.cpp from master-265db98 to master-66874d4
|
7
|
+
- Bump bundled llama.cpp from master-265db98 to master-66874d4.
|
6
8
|
|
7
9
|
## [[0.1.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.1...v0.1.2)] - 2023-05-22
|
8
10
|
|
9
11
|
**Breaking Changes**
|
10
12
|
|
11
|
-
- Bump bundled llama.cpp from master-6986c78 to master-265db98
|
12
|
-
- bump LLAMA_FILE_VERSION to 3
|
13
|
+
- Bump bundled llama.cpp from master-6986c78 to master-265db98.
|
14
|
+
- bump LLAMA_FILE_VERSION to 3.
|
13
15
|
|
14
16
|
## [[0.1.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.0...v0.1.1)] - 2023-05-21
|
15
17
|
|
16
|
-
- Add load_session_file method to Context
|
17
|
-
- Add save_session_file method to Context
|
18
|
+
- Add load_session_file method to Context.
|
19
|
+
- Add save_session_file method to Context.
|
18
20
|
|
19
21
|
**Breaking Changes**
|
20
22
|
|
21
|
-
- Bump bundled llama.cpp from master-173d0e6 to master-6986c78
|
22
|
-
- bump LLAMA_FILE_VERSION to 2
|
23
|
+
- Bump bundled llama.cpp from master-173d0e6 to master-6986c78.
|
24
|
+
- bump LLAMA_FILE_VERSION to 2.
|
23
25
|
|
24
26
|
## [[0.1.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.7...v0.1.0)] - 2023-05-20
|
25
27
|
|
data/ext/llama_cpp/src/ggml-opencl.cpp
CHANGED
@@ -469,16 +469,11 @@ void ggml_cl_init(void) {
|
|
469
469
|
|
470
470
|
size_t ext_str_size;
|
471
471
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
|
472
|
-
char*
|
472
|
+
char *ext_buffer = (char *)alloca(ext_str_size + 1);
|
473
473
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
474
|
+
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
|
474
475
|
// Check if ext_buffer contains cl_khr_fp16
|
475
|
-
|
476
|
-
if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
|
477
|
-
fp16_support = true;
|
478
|
-
break;
|
479
|
-
}
|
480
|
-
}
|
481
|
-
free(ext_buffer);
|
476
|
+
fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
482
477
|
fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
|
483
478
|
|
484
479
|
cl_context_properties properties[] = {
|
@@ -672,7 +667,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
672
667
|
size_t d_size;
|
673
668
|
cl_mem d_X;
|
674
669
|
if (src0->backend == GGML_BACKEND_CL) {
|
675
|
-
d_X =
|
670
|
+
d_X = (cl_mem) src0->data;
|
676
671
|
} else {
|
677
672
|
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
678
673
|
}
|
@@ -748,7 +743,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
748
743
|
size_t d_size;
|
749
744
|
cl_mem d_X;
|
750
745
|
if (src0->backend == GGML_BACKEND_CL) {
|
751
|
-
d_X =
|
746
|
+
d_X = (cl_mem) src0->data;
|
752
747
|
} else {
|
753
748
|
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
754
749
|
}
|
@@ -873,7 +868,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
873
868
|
if (src0->backend == GGML_BACKEND_CPU) {
|
874
869
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
|
875
870
|
} else if (src0->backend == GGML_BACKEND_CL) {
|
876
|
-
d_Q =
|
871
|
+
d_Q = (cl_mem) src0->data;
|
877
872
|
} else {
|
878
873
|
GGML_ASSERT(false);
|
879
874
|
}
|
@@ -1016,14 +1011,13 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
|
|
1016
1011
|
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
1017
1012
|
|
1018
1013
|
size_t q_size;
|
1019
|
-
cl_mem
|
1020
|
-
*dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
1014
|
+
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
1021
1015
|
|
1022
1016
|
// copy tensor to device
|
1023
1017
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
1024
1018
|
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
1025
1019
|
int i = i3*ne2 + i2;
|
1026
|
-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue,
|
1020
|
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
|
1027
1021
|
}
|
1028
1022
|
}
|
1029
1023
|
|
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -186,10 +186,12 @@ typedef double ggml_float;
|
|
186
186
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
187
187
|
#include <intrin.h>
|
188
188
|
#else
|
189
|
+
#if !defined(__riscv)
|
189
190
|
#include <immintrin.h>
|
190
191
|
#endif
|
191
192
|
#endif
|
192
193
|
#endif
|
194
|
+
#endif
|
193
195
|
|
194
196
|
#ifdef __F16C__
|
195
197
|
|
@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
|
|
3494
3496
|
};
|
3495
3497
|
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
|
3496
3498
|
|
3497
|
-
static const char *
|
3499
|
+
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
3498
3500
|
"NONE",
|
3499
3501
|
|
3500
3502
|
"DUP",
|
@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
|
|
3749
3751
|
return GGML_TYPE_NAME[type];
|
3750
3752
|
}
|
3751
3753
|
|
3754
|
+
const char * ggml_op_name(enum ggml_op op) {
|
3755
|
+
return GGML_OP_NAME[op];
|
3756
|
+
}
|
3752
3757
|
|
3753
3758
|
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
3754
3759
|
return GGML_TYPE_SIZE[tensor->type];
|
@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|
3805
3810
|
return wtype;
|
3806
3811
|
}
|
3807
3812
|
|
3813
|
+
size_t ggml_tensor_overhead(void) {
|
3814
|
+
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
|
3815
|
+
}
|
3816
|
+
|
3808
3817
|
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
3809
3818
|
return tensor->nb[0] > tensor->nb[1];
|
3810
3819
|
}
|
@@ -4017,6 +4026,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
|
|
4017
4026
|
return result;
|
4018
4027
|
}
|
4019
4028
|
|
4029
|
+
void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
|
4030
|
+
ctx->no_alloc = no_alloc;
|
4031
|
+
}
|
4032
|
+
|
4033
|
+
void * ggml_get_mem_buffer(struct ggml_context * ctx) {
|
4034
|
+
return ctx->mem_buffer;
|
4035
|
+
}
|
4036
|
+
|
4037
|
+
size_t ggml_get_mem_size(struct ggml_context * ctx) {
|
4038
|
+
return ctx->mem_size;
|
4039
|
+
}
|
4040
|
+
|
4020
4041
|
// IMPORTANT:
|
4021
4042
|
// when creating "opt" tensors, always save and load the scratch buffer
|
4022
4043
|
// this is an error prone process, but it is necessary to support inplace
|
@@ -4061,7 +4082,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
|
4061
4082
|
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
|
4062
4083
|
|
4063
4084
|
if (ctx->scratch.data == NULL || data != NULL) {
|
4064
|
-
size_needed +=
|
4085
|
+
size_needed += GGML_TENSOR_SIZE;
|
4065
4086
|
|
4066
4087
|
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
|
4067
4088
|
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
|
@@ -4077,14 +4098,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
|
4077
4098
|
};
|
4078
4099
|
} else {
|
4079
4100
|
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
|
4080
|
-
GGML_PRINT("%s: not enough space in the scratch memory\n",
|
4101
|
+
GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
|
4102
|
+
__func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
|
4081
4103
|
assert(false);
|
4082
4104
|
return NULL;
|
4083
4105
|
}
|
4084
4106
|
|
4085
|
-
if (cur_end +
|
4107
|
+
if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
|
4086
4108
|
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
|
4087
|
-
__func__, cur_end +
|
4109
|
+
__func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
|
4088
4110
|
assert(false);
|
4089
4111
|
return NULL;
|
4090
4112
|
}
|
@@ -4093,7 +4115,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
|
4093
4115
|
|
4094
4116
|
*obj_new = (struct ggml_object) {
|
4095
4117
|
.offs = cur_end + GGML_OBJECT_SIZE,
|
4096
|
-
.size =
|
4118
|
+
.size = GGML_TENSOR_SIZE,
|
4097
4119
|
.next = NULL,
|
4098
4120
|
};
|
4099
4121
|
|
@@ -4509,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor(
|
|
4509
4531
|
return result;
|
4510
4532
|
}
|
4511
4533
|
|
4534
|
+
struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
|
4535
|
+
struct ggml_object * obj = ctx->objects_begin;
|
4536
|
+
|
4537
|
+
char * const mem_buffer = ctx->mem_buffer;
|
4538
|
+
|
4539
|
+
while (obj != NULL) {
|
4540
|
+
struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
|
4541
|
+
if (strcmp(cur->name, name) == 0) {
|
4542
|
+
return cur;
|
4543
|
+
}
|
4544
|
+
|
4545
|
+
obj = obj->next;
|
4546
|
+
}
|
4547
|
+
|
4548
|
+
return NULL;
|
4549
|
+
}
|
4550
|
+
|
4512
4551
|
////////////////////////////////////////////////////////////////////////////////
|
4513
4552
|
|
4514
4553
|
// ggml_dup
|
@@ -6303,7 +6342,7 @@ struct ggml_tensor * ggml_alibi(
|
|
6303
6342
|
|
6304
6343
|
ggml_scratch_save(ctx);
|
6305
6344
|
|
6306
|
-
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
|
6345
|
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
|
6307
6346
|
|
6308
6347
|
((int32_t *) b->data)[0] = n_past;
|
6309
6348
|
((int32_t *) b->data)[1] = n_head;
|
@@ -13792,11 +13831,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|
13792
13831
|
// reached a leaf node, not part of the gradient graph (e.g. a constant)
|
13793
13832
|
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
|
13794
13833
|
|
13834
|
+
if (strlen(node->name) == 0) {
|
13835
|
+
snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
|
13836
|
+
}
|
13837
|
+
|
13795
13838
|
cgraph->leafs[cgraph->n_leafs] = node;
|
13796
13839
|
cgraph->n_leafs++;
|
13797
13840
|
} else {
|
13798
13841
|
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
|
13799
13842
|
|
13843
|
+
if (strlen(node->name) == 0) {
|
13844
|
+
snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
|
13845
|
+
}
|
13846
|
+
|
13800
13847
|
cgraph->nodes[cgraph->n_nodes] = node;
|
13801
13848
|
cgraph->grads[cgraph->n_nodes] = node->grad;
|
13802
13849
|
cgraph->n_nodes++;
|
@@ -14510,6 +14557,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
|
|
14510
14557
|
}
|
14511
14558
|
}
|
14512
14559
|
|
14560
|
+
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
|
14561
|
+
for (int i = 0; i < cgraph->n_leafs; i++) {
|
14562
|
+
struct ggml_tensor * leaf = cgraph->leafs[i];
|
14563
|
+
|
14564
|
+
if (strcmp(leaf->name, name) == 0) {
|
14565
|
+
return leaf;
|
14566
|
+
}
|
14567
|
+
}
|
14568
|
+
|
14569
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
14570
|
+
struct ggml_tensor * node = cgraph->nodes[i];
|
14571
|
+
|
14572
|
+
if (strcmp(node->name, name) == 0) {
|
14573
|
+
return node;
|
14574
|
+
}
|
14575
|
+
}
|
14576
|
+
|
14577
|
+
return NULL;
|
14578
|
+
}
|
14579
|
+
|
14580
|
+
static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
|
14581
|
+
const int64_t * ne = tensor->ne;
|
14582
|
+
const size_t * nb = tensor->nb;
|
14583
|
+
|
14584
|
+
fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
|
14585
|
+
ggml_type_name(tensor->type),
|
14586
|
+
ggml_op_name (tensor->op),
|
14587
|
+
tensor->n_dims,
|
14588
|
+
ne[0], ne[1], ne[2], ne[3],
|
14589
|
+
nb[0], nb[1], nb[2], nb[3],
|
14590
|
+
tensor->data,
|
14591
|
+
tensor->name);
|
14592
|
+
}
|
14593
|
+
|
14594
|
+
static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
|
14595
|
+
const int64_t * ne = tensor->ne;
|
14596
|
+
const size_t * nb = tensor->nb;
|
14597
|
+
|
14598
|
+
fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
|
14599
|
+
arg,
|
14600
|
+
ggml_type_name(tensor->type),
|
14601
|
+
ggml_op_name (tensor->op),
|
14602
|
+
tensor->n_dims,
|
14603
|
+
ne[0], ne[1], ne[2], ne[3],
|
14604
|
+
nb[0], nb[1], nb[2], nb[3],
|
14605
|
+
tensor->n_tasks,
|
14606
|
+
tensor->data,
|
14607
|
+
tensor->name);
|
14608
|
+
}
|
14609
|
+
|
14610
|
+
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
14611
|
+
assert(cgraph->work == NULL);
|
14612
|
+
assert(cgraph->work_size == 0);
|
14613
|
+
|
14614
|
+
uint64_t size_eval = 0;
|
14615
|
+
|
14616
|
+
// compute size of intermediate results
|
14617
|
+
// TODO: does not take into account scratch buffers !!!!
|
14618
|
+
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
14619
|
+
size_eval += ggml_nbytes(cgraph->nodes[i]);
|
14620
|
+
}
|
14621
|
+
|
14622
|
+
// print
|
14623
|
+
{
|
14624
|
+
FILE * fout = stdout;
|
14625
|
+
|
14626
|
+
fprintf(fout, "\n");
|
14627
|
+
fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
|
14628
|
+
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
|
14629
|
+
fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
|
14630
|
+
fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
|
14631
|
+
fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
|
14632
|
+
|
14633
|
+
// header
|
14634
|
+
fprintf(fout, "\n");
|
14635
|
+
fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
|
14636
|
+
"TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
|
14637
|
+
|
14638
|
+
for (int i = 0; i < cgraph->n_leafs; ++i) {
|
14639
|
+
ggml_graph_export_leaf(cgraph->leafs[i], fout);
|
14640
|
+
|
14641
|
+
GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
|
14642
|
+
GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
|
14643
|
+
GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
|
14644
|
+
}
|
14645
|
+
|
14646
|
+
// header
|
14647
|
+
fprintf(fout, "\n");
|
14648
|
+
fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
|
14649
|
+
"ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
|
14650
|
+
|
14651
|
+
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
14652
|
+
ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
|
14653
|
+
|
14654
|
+
if (cgraph->nodes[i]->src0) {
|
14655
|
+
ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
|
14656
|
+
}
|
14657
|
+
|
14658
|
+
if (cgraph->nodes[i]->src1) {
|
14659
|
+
ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
|
14660
|
+
}
|
14661
|
+
|
14662
|
+
for (int j = 0; j < GGML_MAX_OPT; ++j) {
|
14663
|
+
if (cgraph->nodes[i]->opt[j]) {
|
14664
|
+
ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
|
14665
|
+
}
|
14666
|
+
}
|
14667
|
+
|
14668
|
+
fprintf(fout, "\n");
|
14669
|
+
}
|
14670
|
+
|
14671
|
+
fprintf(fout, "\n");
|
14672
|
+
}
|
14673
|
+
|
14674
|
+
// write binary data
|
14675
|
+
{
|
14676
|
+
FILE * fout = fopen(fname, "wb");
|
14677
|
+
|
14678
|
+
if (!fout) {
|
14679
|
+
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
14680
|
+
return;
|
14681
|
+
}
|
14682
|
+
|
14683
|
+
// header
|
14684
|
+
{
|
14685
|
+
const uint32_t magic = GGML_FILE_MAGIC;
|
14686
|
+
const uint32_t version = GGML_FILE_VERSION;
|
14687
|
+
const uint32_t n_leafs = cgraph->n_leafs;
|
14688
|
+
const uint32_t nodes = cgraph->n_nodes;
|
14689
|
+
|
14690
|
+
fwrite(&magic, sizeof(uint32_t), 1, fout);
|
14691
|
+
fwrite(&version, sizeof(uint32_t), 1, fout);
|
14692
|
+
fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
|
14693
|
+
fwrite(&nodes, sizeof(uint32_t), 1, fout);
|
14694
|
+
fwrite(&size_eval, sizeof(uint64_t), 1, fout);
|
14695
|
+
}
|
14696
|
+
|
14697
|
+
// leafs
|
14698
|
+
{
|
14699
|
+
for (int i = 0; i < cgraph->n_leafs; ++i) {
|
14700
|
+
const struct ggml_tensor * tensor = cgraph->leafs[i];
|
14701
|
+
|
14702
|
+
const uint32_t type = tensor->type;
|
14703
|
+
const uint32_t op = tensor->op;
|
14704
|
+
const uint32_t n_dims = tensor->n_dims;
|
14705
|
+
|
14706
|
+
fwrite(&type, sizeof(uint32_t), 1, fout);
|
14707
|
+
fwrite(&op, sizeof(uint32_t), 1, fout);
|
14708
|
+
fwrite(&n_dims, sizeof(uint32_t), 1, fout);
|
14709
|
+
|
14710
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
14711
|
+
const uint64_t ne = tensor->ne[j];
|
14712
|
+
const uint64_t nb = tensor->nb[j];
|
14713
|
+
|
14714
|
+
fwrite(&ne, sizeof(uint64_t), 1, fout);
|
14715
|
+
fwrite(&nb, sizeof(uint64_t), 1, fout);
|
14716
|
+
}
|
14717
|
+
|
14718
|
+
// store the pointer address
|
14719
|
+
{
|
14720
|
+
const uint64_t ptr = (uint64_t) tensor->data;
|
14721
|
+
|
14722
|
+
fwrite(&ptr, sizeof(uint64_t), 1, fout);
|
14723
|
+
}
|
14724
|
+
|
14725
|
+
fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
|
14726
|
+
|
14727
|
+
// dump the data
|
14728
|
+
// TODO: pad this to 32 byte boundary
|
14729
|
+
{
|
14730
|
+
const size_t size = ggml_nbytes(tensor);
|
14731
|
+
|
14732
|
+
fwrite(tensor->data, sizeof(char), size, fout);
|
14733
|
+
}
|
14734
|
+
}
|
14735
|
+
}
|
14736
|
+
|
14737
|
+
// nodes
|
14738
|
+
{
|
14739
|
+
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
14740
|
+
const struct ggml_tensor * tensor = cgraph->nodes[i];
|
14741
|
+
|
14742
|
+
const uint32_t type = tensor->type;
|
14743
|
+
const uint32_t op = tensor->op;
|
14744
|
+
const uint32_t n_dims = tensor->n_dims;
|
14745
|
+
|
14746
|
+
fwrite(&type, sizeof(uint32_t), 1, fout);
|
14747
|
+
fwrite(&op, sizeof(uint32_t), 1, fout);
|
14748
|
+
fwrite(&n_dims, sizeof(uint32_t), 1, fout);
|
14749
|
+
|
14750
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
14751
|
+
const uint64_t ne = tensor->ne[j];
|
14752
|
+
const uint64_t nb = tensor->nb[j];
|
14753
|
+
|
14754
|
+
fwrite(&ne, sizeof(uint64_t), 1, fout);
|
14755
|
+
fwrite(&nb, sizeof(uint64_t), 1, fout);
|
14756
|
+
}
|
14757
|
+
|
14758
|
+
// store the pointer address
|
14759
|
+
{
|
14760
|
+
const uint64_t ptr = (uint64_t) tensor->data;
|
14761
|
+
|
14762
|
+
fwrite(&ptr, sizeof(uint64_t), 1, fout);
|
14763
|
+
}
|
14764
|
+
|
14765
|
+
fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
|
14766
|
+
|
14767
|
+
// output the op arguments
|
14768
|
+
{
|
14769
|
+
struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
|
14770
|
+
|
14771
|
+
args[0] = tensor->src0;
|
14772
|
+
args[1] = tensor->src1;
|
14773
|
+
|
14774
|
+
for (int j = 0; j < GGML_MAX_OPT; ++j) {
|
14775
|
+
args[2 + j] = tensor->opt[j];
|
14776
|
+
}
|
14777
|
+
|
14778
|
+
for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
|
14779
|
+
if (args[j]) {
|
14780
|
+
int32_t idx = -1;
|
14781
|
+
|
14782
|
+
// check if leaf
|
14783
|
+
{
|
14784
|
+
for (int k = 0; k < cgraph->n_leafs; ++k) {
|
14785
|
+
if (args[j] == cgraph->leafs[k]) {
|
14786
|
+
idx = k;
|
14787
|
+
break;
|
14788
|
+
}
|
14789
|
+
}
|
14790
|
+
}
|
14791
|
+
|
14792
|
+
// check if node
|
14793
|
+
if (idx == -1) {
|
14794
|
+
for (int k = 0; k < cgraph->n_nodes; ++k) {
|
14795
|
+
if (args[j] == cgraph->nodes[k]) {
|
14796
|
+
idx = GGML_MAX_NODES + k;
|
14797
|
+
break;
|
14798
|
+
}
|
14799
|
+
}
|
14800
|
+
}
|
14801
|
+
|
14802
|
+
if (idx == -1) {
|
14803
|
+
fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
|
14804
|
+
return;
|
14805
|
+
}
|
14806
|
+
|
14807
|
+
fwrite(&idx, sizeof(int32_t), 1, fout);
|
14808
|
+
} else {
|
14809
|
+
const int32_t nul = -1;
|
14810
|
+
|
14811
|
+
fwrite(&nul, sizeof(int32_t), 1, fout);
|
14812
|
+
}
|
14813
|
+
}
|
14814
|
+
}
|
14815
|
+
}
|
14816
|
+
}
|
14817
|
+
|
14818
|
+
fclose(fout);
|
14819
|
+
}
|
14820
|
+
}
|
14821
|
+
|
14822
|
+
struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
|
14823
|
+
assert(*ctx_data == NULL);
|
14824
|
+
assert(*ctx_eval == NULL);
|
14825
|
+
|
14826
|
+
struct ggml_cgraph result = { 0 };
|
14827
|
+
|
14828
|
+
struct ggml_tensor * data = NULL;
|
14829
|
+
|
14830
|
+
// read file into data
|
14831
|
+
{
|
14832
|
+
FILE * fin = fopen(fname, "rb");
|
14833
|
+
|
14834
|
+
if (!fin) {
|
14835
|
+
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
14836
|
+
return result;
|
14837
|
+
}
|
14838
|
+
|
14839
|
+
size_t fsize = 0;
|
14840
|
+
|
14841
|
+
fseek(fin, 0, SEEK_END);
|
14842
|
+
fsize = ftell(fin);
|
14843
|
+
fseek(fin, 0, SEEK_SET);
|
14844
|
+
|
14845
|
+
// create the data context
|
14846
|
+
{
|
14847
|
+
const size_t overhead = 1*ggml_tensor_overhead();
|
14848
|
+
|
14849
|
+
struct ggml_init_params params = {
|
14850
|
+
.mem_size = fsize + overhead,
|
14851
|
+
.mem_buffer = NULL,
|
14852
|
+
.no_alloc = false,
|
14853
|
+
};
|
14854
|
+
|
14855
|
+
*ctx_data = ggml_init(params);
|
14856
|
+
|
14857
|
+
if (!*ctx_data) {
|
14858
|
+
fprintf(stderr, "%s: failed to create ggml context\n", __func__);
|
14859
|
+
return result;
|
14860
|
+
}
|
14861
|
+
}
|
14862
|
+
|
14863
|
+
data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
|
14864
|
+
|
14865
|
+
fread(data->data, sizeof(char), fsize, fin);
|
14866
|
+
|
14867
|
+
fclose(fin);
|
14868
|
+
}
|
14869
|
+
|
14870
|
+
// populate result
|
14871
|
+
{
|
14872
|
+
char * ptr = (char *) data->data;
|
14873
|
+
|
14874
|
+
const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
|
14875
|
+
|
14876
|
+
if (magic != GGML_FILE_MAGIC) {
|
14877
|
+
fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
|
14878
|
+
return result;
|
14879
|
+
}
|
14880
|
+
|
14881
|
+
const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
|
14882
|
+
|
14883
|
+
if (version != GGML_FILE_VERSION) {
|
14884
|
+
fprintf(stderr, "%s: invalid version number\n", __func__);
|
14885
|
+
return result;
|
14886
|
+
}
|
14887
|
+
|
14888
|
+
const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
|
14889
|
+
const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
|
14890
|
+
const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
|
14891
|
+
|
14892
|
+
result.n_leafs = n_leafs;
|
14893
|
+
result.n_nodes = n_nodes;
|
14894
|
+
|
14895
|
+
// create the data context
|
14896
|
+
{
|
14897
|
+
const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
|
14898
|
+
|
14899
|
+
struct ggml_init_params params = {
|
14900
|
+
.mem_size = size_eval + overhead,
|
14901
|
+
.mem_buffer = NULL,
|
14902
|
+
.no_alloc = true,
|
14903
|
+
};
|
14904
|
+
|
14905
|
+
*ctx_eval = ggml_init(params);
|
14906
|
+
|
14907
|
+
if (!*ctx_eval) {
|
14908
|
+
fprintf(stderr, "%s: failed to create ggml context\n", __func__);
|
14909
|
+
return result;
|
14910
|
+
}
|
14911
|
+
}
|
14912
|
+
|
14913
|
+
// leafs
|
14914
|
+
{
|
14915
|
+
uint32_t type;
|
14916
|
+
uint32_t op;
|
14917
|
+
uint32_t n_dims;
|
14918
|
+
|
14919
|
+
for (uint32_t i = 0; i < n_leafs; ++i) {
|
14920
|
+
type = *(const uint32_t *) ptr; ptr += sizeof(type);
|
14921
|
+
op = *(const uint32_t *) ptr; ptr += sizeof(op);
|
14922
|
+
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
|
14923
|
+
|
14924
|
+
int64_t ne[GGML_MAX_DIMS];
|
14925
|
+
size_t nb[GGML_MAX_DIMS];
|
14926
|
+
|
14927
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
14928
|
+
uint64_t ne_cur;
|
14929
|
+
uint64_t nb_cur;
|
14930
|
+
|
14931
|
+
ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
|
14932
|
+
nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
|
14933
|
+
|
14934
|
+
ne[j] = ne_cur;
|
14935
|
+
nb[j] = nb_cur;
|
14936
|
+
}
|
14937
|
+
|
14938
|
+
struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
|
14939
|
+
|
14940
|
+
tensor->op = (enum ggml_op) op;
|
14941
|
+
|
14942
|
+
uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
|
14943
|
+
|
14944
|
+
memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
|
14945
|
+
|
14946
|
+
tensor->data = (void *) ptr;
|
14947
|
+
|
14948
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
14949
|
+
tensor->nb[j] = nb[j];
|
14950
|
+
}
|
14951
|
+
|
14952
|
+
result.leafs[i] = tensor;
|
14953
|
+
|
14954
|
+
ptr += ggml_nbytes(tensor);
|
14955
|
+
|
14956
|
+
fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
|
14957
|
+
}
|
14958
|
+
}
|
14959
|
+
|
14960
|
+
ggml_set_no_alloc(*ctx_eval, false);
|
14961
|
+
|
14962
|
+
// nodes
|
14963
|
+
{
|
14964
|
+
uint32_t type;
|
14965
|
+
uint32_t op;
|
14966
|
+
uint32_t n_dims;
|
14967
|
+
|
14968
|
+
for (uint32_t i = 0; i < n_nodes; ++i) {
|
14969
|
+
type = *(const uint32_t *) ptr; ptr += sizeof(type);
|
14970
|
+
op = *(const uint32_t *) ptr; ptr += sizeof(op);
|
14971
|
+
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
|
14972
|
+
|
14973
|
+
int64_t ne[GGML_MAX_DIMS];
|
14974
|
+
size_t nb[GGML_MAX_DIMS];
|
14975
|
+
|
14976
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
14977
|
+
uint64_t ne_cur;
|
14978
|
+
uint64_t nb_cur;
|
14979
|
+
|
14980
|
+
ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
|
14981
|
+
nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
|
14982
|
+
|
14983
|
+
ne[j] = ne_cur;
|
14984
|
+
nb[j] = nb_cur;
|
14985
|
+
}
|
14986
|
+
|
14987
|
+
struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
|
14988
|
+
|
14989
|
+
tensor->op = (enum ggml_op) op;
|
14990
|
+
|
14991
|
+
uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
|
14992
|
+
|
14993
|
+
memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
|
14994
|
+
|
14995
|
+
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
14996
|
+
tensor->nb[j] = nb[j];
|
14997
|
+
}
|
14998
|
+
|
14999
|
+
// parse args
|
15000
|
+
{
|
15001
|
+
struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
|
15002
|
+
&tensor->src0,
|
15003
|
+
&tensor->src1,
|
15004
|
+
};
|
15005
|
+
|
15006
|
+
for (int j = 0; j < GGML_MAX_OPT; ++j) {
|
15007
|
+
args[2 + j] = &tensor->opt[j];
|
15008
|
+
}
|
15009
|
+
|
15010
|
+
for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
|
15011
|
+
const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
|
15012
|
+
|
15013
|
+
if (arg_idx == -1) {
|
15014
|
+
continue;
|
15015
|
+
}
|
15016
|
+
|
15017
|
+
if (arg_idx < GGML_MAX_NODES) {
|
15018
|
+
*args[j] = result.leafs[arg_idx];
|
15019
|
+
} else {
|
15020
|
+
*args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
|
15021
|
+
}
|
15022
|
+
}
|
15023
|
+
}
|
15024
|
+
|
15025
|
+
result.nodes[i] = tensor;
|
15026
|
+
|
15027
|
+
fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
|
15028
|
+
}
|
15029
|
+
}
|
15030
|
+
}
|
15031
|
+
|
15032
|
+
return result;
|
15033
|
+
}
|
15034
|
+
|
14513
15035
|
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
14514
15036
|
int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
|
14515
15037
|
|
@@ -14527,7 +15049,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|
14527
15049
|
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
14528
15050
|
i,
|
14529
15051
|
node->ne[0], node->ne[1], node->ne[2],
|
14530
|
-
|
15052
|
+
GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
14531
15053
|
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
|
14532
15054
|
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
|
14533
15055
|
(double) node->perf_time_us / 1000.0,
|
@@ -14541,7 +15063,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|
14541
15063
|
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
|
14542
15064
|
i,
|
14543
15065
|
node->ne[0], node->ne[1],
|
14544
|
-
|
15066
|
+
GGML_OP_NAME[node->op]);
|
14545
15067
|
}
|
14546
15068
|
|
14547
15069
|
for (int i = 0; i < GGML_OP_COUNT; i++) {
|
@@ -14549,7 +15071,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|
14549
15071
|
continue;
|
14550
15072
|
}
|
14551
15073
|
|
14552
|
-
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n",
|
15074
|
+
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
|
14553
15075
|
}
|
14554
15076
|
|
14555
15077
|
GGML_PRINT("========================================\n");
|
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -198,6 +198,7 @@
|
|
198
198
|
#define GGML_MAX_PARAMS 256
|
199
199
|
#define GGML_MAX_CONTEXTS 64
|
200
200
|
#define GGML_MAX_OPT 4
|
201
|
+
#define GGML_MAX_NAME 32
|
201
202
|
#define GGML_DEFAULT_N_THREADS 4
|
202
203
|
|
203
204
|
#define GGML_ASSERT(x) \
|
@@ -372,11 +373,13 @@ extern "C" {
|
|
372
373
|
|
373
374
|
void * data;
|
374
375
|
|
375
|
-
char name[
|
376
|
+
char name[GGML_MAX_NAME];
|
376
377
|
|
377
378
|
char padding[16];
|
378
379
|
};
|
379
380
|
|
381
|
+
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
382
|
+
|
380
383
|
// computation graph
|
381
384
|
struct ggml_cgraph {
|
382
385
|
int n_nodes;
|
@@ -429,6 +432,7 @@ extern "C" {
|
|
429
432
|
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
430
433
|
|
431
434
|
GGML_API const char * ggml_type_name(enum ggml_type type);
|
435
|
+
GGML_API const char * ggml_op_name (enum ggml_op op);
|
432
436
|
|
433
437
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
434
438
|
|
@@ -437,6 +441,9 @@ extern "C" {
|
|
437
441
|
// TODO: temporary until model loading of ggml examples is refactored
|
438
442
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
439
443
|
|
444
|
+
// use this to compute the memory overhead of a tensor
|
445
|
+
GGML_API size_t ggml_tensor_overhead(void);
|
446
|
+
|
440
447
|
// main
|
441
448
|
|
442
449
|
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
@@ -444,7 +451,11 @@ extern "C" {
|
|
444
451
|
|
445
452
|
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
446
453
|
|
447
|
-
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
454
|
+
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
455
|
+
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
456
|
+
|
457
|
+
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
|
458
|
+
GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
|
448
459
|
|
449
460
|
GGML_API struct ggml_tensor * ggml_new_tensor(
|
450
461
|
struct ggml_context * ctx,
|
@@ -484,6 +495,8 @@ extern "C" {
|
|
484
495
|
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
485
496
|
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
486
497
|
|
498
|
+
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
499
|
+
|
487
500
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
488
501
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
489
502
|
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
@@ -970,6 +983,11 @@ extern "C" {
|
|
970
983
|
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
971
984
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
972
985
|
|
986
|
+
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
987
|
+
|
988
|
+
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
989
|
+
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
990
|
+
|
973
991
|
// print info and performance information for the graph
|
974
992
|
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
975
993
|
|
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -42,6 +42,7 @@
|
|
42
42
|
// available llama models
|
43
43
|
enum e_model {
|
44
44
|
MODEL_UNKNOWN,
|
45
|
+
MODEL_3B,
|
45
46
|
MODEL_7B,
|
46
47
|
MODEL_13B,
|
47
48
|
MODEL_30B,
|
@@ -58,6 +59,7 @@ static const size_t MB = 1024*1024;
|
|
58
59
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
59
60
|
{
|
60
61
|
static std::map<e_model, size_t> k_sizes = {
|
62
|
+
{ MODEL_3B, 128ull * MB },
|
61
63
|
{ MODEL_7B, 512ull * MB },
|
62
64
|
{ MODEL_13B, 512ull * MB },
|
63
65
|
{ MODEL_30B, 512ull * MB },
|
@@ -69,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|
69
71
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
70
72
|
{
|
71
73
|
static std::map<e_model, size_t> k_sizes = {
|
74
|
+
{ MODEL_3B, 128ull * MB },
|
72
75
|
{ MODEL_7B, 512ull * MB },
|
73
76
|
{ MODEL_13B, 512ull * MB },
|
74
77
|
{ MODEL_30B, 512ull * MB },
|
@@ -81,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
|
81
84
|
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
82
85
|
{
|
83
86
|
static std::map<e_model, size_t> k_sizes = {
|
87
|
+
{ MODEL_3B, 682ull * MB },
|
84
88
|
{ MODEL_7B, 1026ull * MB },
|
85
89
|
{ MODEL_13B, 1608ull * MB },
|
86
90
|
{ MODEL_30B, 3124ull * MB },
|
@@ -94,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
|
94
98
|
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
95
99
|
{
|
96
100
|
static std::map<e_model, size_t> k_sizes = {
|
101
|
+
{ MODEL_3B, 512ull * MB },
|
97
102
|
{ MODEL_7B, 768ull * MB },
|
98
103
|
{ MODEL_13B, 1024ull * MB },
|
99
104
|
{ MODEL_30B, 1280ull * MB },
|
@@ -899,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
|
899
904
|
|
900
905
|
static const char *llama_model_type_name(e_model type) {
|
901
906
|
switch (type) {
|
907
|
+
case MODEL_3B: return "3B";
|
902
908
|
case MODEL_7B: return "7B";
|
903
909
|
case MODEL_13B: return "13B";
|
904
910
|
case MODEL_30B: return "30B";
|
@@ -932,6 +938,7 @@ static void llama_model_load_internal(
|
|
932
938
|
|
933
939
|
{
|
934
940
|
switch (hparams.n_layer) {
|
941
|
+
case 26: model.type = e_model::MODEL_3B; break;
|
935
942
|
case 32: model.type = e_model::MODEL_7B; break;
|
936
943
|
case 40: model.type = e_model::MODEL_13B; break;
|
937
944
|
case 60: model.type = e_model::MODEL_30B; break;
|
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -31,6 +31,11 @@
|
|
31
31
|
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
32
32
|
#define LLAMA_SESSION_VERSION 1
|
33
33
|
|
34
|
+
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
35
|
+
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
36
|
+
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
37
|
+
#endif
|
38
|
+
|
34
39
|
#ifdef __cplusplus
|
35
40
|
extern "C" {
|
36
41
|
#endif
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.1.3'
|
6
|
+
VERSION = '0.1.4'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'master-66874d4'
|
9
|
+
LLAMA_CPP_VERSION = 'master-ffb06a3'
|
10
10
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05-27 00:00:00.000000000 Z
|
11
|
+
date: 2023-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|