llama_cpp 0.1.3 → 0.1.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -8
- data/ext/llama_cpp/src/ggml-opencl.cpp +8 -14
- data/ext/llama_cpp/src/ggml.c +532 -10
- data/ext/llama_cpp/src/ggml.h +20 -2
- data/ext/llama_cpp/src/llama.cpp +7 -0
- data/ext/llama_cpp/src/llama.h +5 -0
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f08992d10701b3ac0ab87c32d8a28d7f81101a4896e3300c461d7015f6486814
+  data.tar.gz: 38fd72fe9cdd596f7878ef902ddf8ec8e36954bbac50388fe8ef8437a93bfe29
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e3d024db508be6cbe7e644c4a9295da97742b45f921c9c5d64a7f4b4eb6be624e79f4c63d39c226566dbb9676215ae3b986828095a185cb2069547a12cf651a0
+  data.tar.gz: d884334f2d77a7204f0bc96c037fa86ee6fdf3f2879c5c5bd721be336dc743a0034733fc566c114bc6f22e620e5d79ccd4c67b6ded7d929d1949315b31445701
data/CHANGELOG.md CHANGED

@@ -1,25 +1,27 @@
-## [
+## [[0.1.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.3...v0.1.4)] - 2023-06-03
+
+- Bump bundled llama.cpp from master-66874d4 to master-ffb06a3.
 
 ## [[0.1.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.2...v0.1.3)] - 2023-05-27
 
-- Bump bundled llama.cpp from master-265db98 to master-66874d4
+- Bump bundled llama.cpp from master-265db98 to master-66874d4.
 
 ## [[0.1.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.1...v0.1.2)] - 2023-05-22
 
 **Breaking Changes**
 
-- Bump bundled llama.cpp from master-6986c78 to master-265db98
-- bump LLAMA_FILE_VERSION to 3
+- Bump bundled llama.cpp from master-6986c78 to master-265db98.
+- bump LLAMA_FILE_VERSION to 3.
 
 ## [[0.1.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.1.0...v0.1.1)] - 2023-05-21
 
-- Add load_session_file method to Context
-- Add save_session_file method to Context
+- Add load_session_file method to Context.
+- Add save_session_file method to Context.
 
 **Breaking Changes**
 
-- Bump bundled llama.cpp from master-173d0e6 to master-6986c78
-- bump LLAMA_FILE_VERSION to 2
+- Bump bundled llama.cpp from master-173d0e6 to master-6986c78.
+- bump LLAMA_FILE_VERSION to 2.
 
 ## [[0.1.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.0.7...v0.1.0)] - 2023-05-20
 
data/ext/llama_cpp/src/ggml-opencl.cpp CHANGED

@@ -469,16 +469,11 @@ void ggml_cl_init(void) {
 
     size_t ext_str_size;
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
-    char*
+    char *ext_buffer = (char *)alloca(ext_str_size + 1);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
     // Check if ext_buffer contains cl_khr_fp16
-
-        if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
-            fp16_support = true;
-            break;
-        }
-    }
-    free(ext_buffer);
+    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
     fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
 
     cl_context_properties properties[] = {

@@ -672,7 +667,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_CL) {
-        d_X =
+        d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
     }

@@ -748,7 +743,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_CL) {
-        d_X =
+        d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
     }

@@ -873,7 +868,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         if (src0->backend == GGML_BACKEND_CPU) {
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
         } else if (src0->backend == GGML_BACKEND_CL) {
-            d_Q =
+            d_Q = (cl_mem) src0->data;
         } else {
             GGML_ASSERT(false);
         }

@@ -1016,14 +1011,13 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
 
     size_t q_size;
-    cl_mem
-    *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
 
     // copy tensor to device
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
             int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue,
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
         }
     }
 
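The ggml_cl_init hunk above replaces a manual memcmp scan over the OpenCL extension string with a single strstr call; the added `ext_buffer[ext_str_size] = '\0';` line is what makes that safe, since strstr requires a null-terminated string while clGetDeviceInfo does not guarantee one. A minimal standalone sketch of the same pattern (the raw extension bytes here are hard-coded stand-ins, not queried from a real device):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    // Stand-in for the buffer filled by clGetDeviceInfo(CL_DEVICE_EXTENSIONS):
    // a space-separated extension list that is NOT null terminated.
    const char raw[] = {'c','l','_','k','h','r','_','f','p','1','6',' ',
                        'c','l','_','k','h','r','_','i','c','d'};
    size_t raw_size = sizeof(raw);

    // Copy into a buffer one byte larger and terminate it, mirroring the patch.
    char ext_buffer[64];
    memcpy(ext_buffer, raw, raw_size);
    ext_buffer[raw_size] = '\0'; // ensure it is null terminated before strstr

    bool fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
    printf("FP16 support: %s\n", fp16_support ? "true" : "false");
    return 0;
}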
data/ext/llama_cpp/src/ggml.c CHANGED

@@ -186,10 +186,12 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __F16C__
 

@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
 
-static const char *
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "NONE",
 
     "DUP",

@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
     return GGML_TYPE_NAME[type];
 }
 
+const char * ggml_op_name(enum ggml_op op) {
+    return GGML_OP_NAME[op];
+}
 
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];

@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
     return wtype;
 }
 
+size_t ggml_tensor_overhead(void) {
+    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }

@@ -4017,6 +4026,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+    ctx->no_alloc = no_alloc;
+}
+
+void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+    return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(struct ggml_context * ctx) {
+    return ctx->mem_size;
+}
+
 // IMPORTANT:
 // when creating "opt" tensors, always save and load the scratch buffer
 // this is an error prone process, but it is necessary to support inplace

@@ -4061,7 +4082,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
     struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
 
     if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed +=
+        size_needed += GGML_TENSOR_SIZE;
 
         if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",

@@ -4077,14 +4098,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         };
     } else {
         if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n",
+            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+                    __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
             assert(false);
             return NULL;
         }
 
-        if (cur_end +
+        if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
             GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end +
+                    __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
             assert(false);
             return NULL;
         }

@@ -4093,7 +4115,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *obj_new = (struct ggml_object) {
         .offs = cur_end + GGML_OBJECT_SIZE,
-        .size =
+        .size = GGML_TENSOR_SIZE,
         .next = NULL,
     };
 

@@ -4509,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor(
     return result;
 }
 
+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+    struct ggml_object * obj = ctx->objects_begin;
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    while (obj != NULL) {
+        struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+        if (strcmp(cur->name, name) == 0) {
+            return cur;
+        }
+
+        obj = obj->next;
+    }
+
+    return NULL;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 // ggml_dup

@@ -6303,7 +6342,7 @@ struct ggml_tensor * ggml_alibi(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32,
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
 
     ((int32_t *) b->data)[0] = n_past;
     ((int32_t *) b->data)[1] = n_head;

@@ -13792,11 +13831,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         // reached a leaf node, not part of the gradient graph (e.g. a constant)
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+        }
+
         cgraph->leafs[cgraph->n_leafs] = node;
         cgraph->n_leafs++;
     } else {
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
+        if (strlen(node->name) == 0) {
+            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+        }
+
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->grads[cgraph->n_nodes] = node->grad;
         cgraph->n_nodes++;

@@ -14510,6 +14557,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     }
 }
 
+struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
+    for (int i = 0; i < cgraph->n_leafs; i++) {
+        struct ggml_tensor * leaf = cgraph->leafs[i];
+
+        if (strcmp(leaf->name, name) == 0) {
+            return leaf;
+        }
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        struct ggml_tensor * node = cgraph->nodes[i];
+
+        if (strcmp(node->name, name) == 0) {
+            return node;
+        }
+    }
+
+    return NULL;
+}
+
+static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+            ggml_type_name(tensor->type),
+            ggml_op_name (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->data,
+            tensor->name);
+}
+
+static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+    const int64_t * ne = tensor->ne;
+    const size_t * nb = tensor->nb;
+
+    fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+            arg,
+            ggml_type_name(tensor->type),
+            ggml_op_name (tensor->op),
+            tensor->n_dims,
+            ne[0], ne[1], ne[2], ne[3],
+            nb[0], nb[1], nb[2], nb[3],
+            tensor->n_tasks,
+            tensor->data,
+            tensor->name);
+}
+
+void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+    assert(cgraph->work == NULL);
+    assert(cgraph->work_size == 0);
+
+    uint64_t size_eval = 0;
+
+    // compute size of intermediate results
+    // TODO: does not take into account scratch buffers !!!!
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        size_eval += ggml_nbytes(cgraph->nodes[i]);
+    }
+
+    // print
+    {
+        FILE * fout = stdout;
+
+        fprintf(fout, "\n");
+        fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
+        fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_leafs; ++i) {
+            ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+            GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
+            GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+            GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+        }
+
+        // header
+        fprintf(fout, "\n");
+        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+            if (cgraph->nodes[i]->src0) {
+                ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+            }
+
+            if (cgraph->nodes[i]->src1) {
+                ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+            }
+
+            for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                if (cgraph->nodes[i]->opt[j]) {
+                    ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+                }
+            }
+
+            fprintf(fout, "\n");
+        }
+
+        fprintf(fout, "\n");
+    }
+
+    // write binary data
+    {
+        FILE * fout = fopen(fname, "wb");
+
+        if (!fout) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return;
+        }
+
+        // header
+        {
+            const uint32_t magic = GGML_FILE_MAGIC;
+            const uint32_t version = GGML_FILE_VERSION;
+            const uint32_t n_leafs = cgraph->n_leafs;
+            const uint32_t nodes = cgraph->n_nodes;
+
+            fwrite(&magic, sizeof(uint32_t), 1, fout);
+            fwrite(&version, sizeof(uint32_t), 1, fout);
+            fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
+            fwrite(&nodes, sizeof(uint32_t), 1, fout);
+            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+        }
+
+        // leafs
+        {
+            for (int i = 0; i < cgraph->n_leafs; ++i) {
+                const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+                const uint32_t type = tensor->type;
+                const uint32_t op = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type, sizeof(uint32_t), 1, fout);
+                fwrite(&op, sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // dump the data
+                // TODO: pad this to 32 byte boundary
+                {
+                    const size_t size = ggml_nbytes(tensor);
+
+                    fwrite(tensor->data, sizeof(char), size, fout);
+                }
+            }
+        }
+
+        // nodes
+        {
+            for (int i = 0; i < cgraph->n_nodes; ++i) {
+                const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+                const uint32_t type = tensor->type;
+                const uint32_t op = tensor->op;
+                const uint32_t n_dims = tensor->n_dims;
+
+                fwrite(&type, sizeof(uint32_t), 1, fout);
+                fwrite(&op, sizeof(uint32_t), 1, fout);
+                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    const uint64_t ne = tensor->ne[j];
+                    const uint64_t nb = tensor->nb[j];
+
+                    fwrite(&ne, sizeof(uint64_t), 1, fout);
+                    fwrite(&nb, sizeof(uint64_t), 1, fout);
+                }
+
+                // store the pointer address
+                {
+                    const uint64_t ptr = (uint64_t) tensor->data;
+
+                    fwrite(&ptr, sizeof(uint64_t), 1, fout);
+                }
+
+                fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+                // output the op arguments
+                {
+                    struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+                    args[0] = tensor->src0;
+                    args[1] = tensor->src1;
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        if (args[j]) {
+                            int32_t idx = -1;
+
+                            // check if leaf
+                            {
+                                for (int k = 0; k < cgraph->n_leafs; ++k) {
+                                    if (args[j] == cgraph->leafs[k]) {
+                                        idx = k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            // check if node
+                            if (idx == -1) {
+                                for (int k = 0; k < cgraph->n_nodes; ++k) {
+                                    if (args[j] == cgraph->nodes[k]) {
+                                        idx = GGML_MAX_NODES + k;
+                                        break;
+                                    }
+                                }
+                            }
+
+                            if (idx == -1) {
+                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+                                return;
+                            }
+
+                            fwrite(&idx, sizeof(int32_t), 1, fout);
+                        } else {
+                            const int32_t nul = -1;
+
+                            fwrite(&nul, sizeof(int32_t), 1, fout);
+                        }
+                    }
+                }
+            }
+        }
+
+        fclose(fout);
+    }
+}
+
+struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+    assert(*ctx_data == NULL);
+    assert(*ctx_eval == NULL);
+
+    struct ggml_cgraph result = { 0 };
+
+    struct ggml_tensor * data = NULL;
+
+    // read file into data
+    {
+        FILE * fin = fopen(fname, "rb");
+
+        if (!fin) {
+            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+            return result;
+        }
+
+        size_t fsize = 0;
+
+        fseek(fin, 0, SEEK_END);
+        fsize = ftell(fin);
+        fseek(fin, 0, SEEK_SET);
+
+        // create the data context
+        {
+            const size_t overhead = 1*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = fsize + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = false,
+            };
+
+            *ctx_data = ggml_init(params);
+
+            if (!*ctx_data) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+        fread(data->data, sizeof(char), fsize, fin);
+
+        fclose(fin);
+    }
+
+    // populate result
+    {
+        char * ptr = (char *) data->data;
+
+        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+        if (magic != GGML_FILE_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+            return result;
+        }
+
+        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+        if (version != GGML_FILE_VERSION) {
+            fprintf(stderr, "%s: invalid version number\n", __func__);
+            return result;
+        }
+
+        const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+        const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+        result.n_leafs = n_leafs;
+        result.n_nodes = n_nodes;
+
+        // create the data context
+        {
+            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+            struct ggml_init_params params = {
+                .mem_size   = size_eval + overhead,
+                .mem_buffer = NULL,
+                .no_alloc   = true,
+            };
+
+            *ctx_eval = ggml_init(params);
+
+            if (!*ctx_eval) {
+                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+                return result;
+            }
+        }
+
+        // leafs
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_leafs; ++i) {
+                type = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                tensor->data = (void *) ptr;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                result.leafs[i] = tensor;
+
+                ptr += ggml_nbytes(tensor);
+
+                fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+
+        ggml_set_no_alloc(*ctx_eval, false);
+
+        // nodes
+        {
+            uint32_t type;
+            uint32_t op;
+            uint32_t n_dims;
+
+            for (uint32_t i = 0; i < n_nodes; ++i) {
+                type = *(const uint32_t *) ptr; ptr += sizeof(type);
+                op = *(const uint32_t *) ptr; ptr += sizeof(op);
+                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+                int64_t ne[GGML_MAX_DIMS];
+                size_t nb[GGML_MAX_DIMS];
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    uint64_t ne_cur;
+                    uint64_t nb_cur;
+
+                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+                    ne[j] = ne_cur;
+                    nb[j] = nb_cur;
+                }
+
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+                tensor->op = (enum ggml_op) op;
+
+                uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+                memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    tensor->nb[j] = nb[j];
+                }
+
+                // parse args
+                {
+                    struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
+                        &tensor->src0,
+                        &tensor->src1,
+                    };
+
+                    for (int j = 0; j < GGML_MAX_OPT; ++j) {
+                        args[2 + j] = &tensor->opt[j];
+                    }
+
+                    for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+                        const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+
+                        if (arg_idx == -1) {
+                            continue;
+                        }
+
+                        if (arg_idx < GGML_MAX_NODES) {
+                            *args[j] = result.leafs[arg_idx];
+                        } else {
+                            *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+                        }
+                    }
+                }
+
+                result.nodes[i] = tensor;
+
+                fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+            }
+        }
+    }
+
+    return result;
+}
+
 void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
 

@@ -14527,7 +15049,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
-
+                GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms(),
                 (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
                 (double) node->perf_time_us / 1000.0,

@@ -14541,7 +15063,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
-
+                GGML_OP_NAME[node->op]);
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {

@@ -14549,7 +15071,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
             continue;
         }
 
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n",
+        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
     }
 
     GGML_PRINT("========================================\n");
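The bulk of this bump is the new graph serialization API: ggml_graph_export writes a computation graph (header, leafs with their data, nodes with their op arguments) to a file, ggml_graph_import reads it back into two fresh contexts, and ggml_graph_get_tensor looks tensors up by the names auto-assigned in ggml_visit_parents. A minimal round-trip sketch under stated assumptions (it targets the ggml API of this vintage — ggml_build_forward returning a cgraph by value — and the file name "addition.ggml" and tensor name "sum" are illustrative, not part of the library):

// build a tiny graph, export it, and re-import it by name
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024, // 16 MB arena is plenty for this toy graph
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);

    struct ggml_tensor * sum = ggml_add(ctx, a, b);
    // unnamed tensors get "leaf_%d"/"node_%d" during ggml_build_forward;
    // give the output an explicit name so we can find it after import
    snprintf(sum->name, sizeof(sum->name), "sum");

    struct ggml_cgraph gf = ggml_build_forward(sum);
    ggml_graph_export(&gf, "addition.ggml"); // prints a summary, then writes the file

    // re-load: ctx_data holds the raw file bytes, ctx_eval the reconstructed tensors
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;
    struct ggml_cgraph gf2 = ggml_graph_import("addition.ggml", &ctx_data, &ctx_eval);

    struct ggml_tensor * sum2 = ggml_graph_get_tensor(&gf2, "sum");
    if (sum2) {
        printf("found '%s': %lld elements\n", sum2->name, (long long) ggml_nelements(sum2));
    }

    ggml_free(ctx_eval);
    ggml_free(ctx_data);
    ggml_free(ctx);
    return 0;
}

Note that ggml_graph_export asserts the graph has not been computed yet (cgraph->work must still be NULL), which is why the sketch exports before calling ggml_graph_compute.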
data/ext/llama_cpp/src/ggml.h CHANGED

@@ -198,6 +198,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4
 
 #define GGML_ASSERT(x) \

@@ -372,11 +373,13 @@ extern "C" {
 
         void * data;
 
-        char name[
+        char name[GGML_MAX_NAME];
 
         char padding[16];
     };
 
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;

@@ -429,6 +432,7 @@ extern "C" {
     GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
     GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 

@@ -437,6 +441,9 @@ extern "C" {
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);

@@ -444,7 +451,11 @@ extern "C" {
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t ggml_get_mem_size  (struct ggml_context * ctx);
 
     GGML_API struct ggml_tensor * ggml_new_tensor(
             struct ggml_context * ctx,

@@ -484,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

@@ -970,6 +983,11 @@ extern "C" {
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
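The new ggml_tensor_overhead helper declared above makes context sizing explicit: every tensor costs a fixed amount of bookkeeping (object header, tensor struct, alignment padding) on top of its data, which is exactly how ggml_graph_import sizes its eval context. A small sketch of how it might be used (make_ctx is a hypothetical helper, not part of the library):

#include "ggml.h"

// Size a context for n tensors of a known data size without guessing:
// bookkeeping = n * ggml_tensor_overhead(), data = n * data_bytes_per_tensor.
static struct ggml_context * make_ctx(int n_tensors, size_t data_bytes_per_tensor) {
    struct ggml_init_params params = {
        .mem_size   = n_tensors * (ggml_tensor_overhead() + data_bytes_per_tensor),
        .mem_buffer = NULL,
        .no_alloc   = false, // set true (or call ggml_set_no_alloc later) to
                             // create tensor metadata only, as the importer does
    };
    return ggml_init(params);
}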
data/ext/llama_cpp/src/llama.cpp CHANGED

@@ -42,6 +42,7 @@
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_3B,
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,

@@ -58,6 +59,7 @@ static const size_t MB = 1024*1024;
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },

@@ -69,6 +71,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull * MB },
         { MODEL_7B, 512ull * MB },
         { MODEL_13B, 512ull * MB },
         { MODEL_30B, 512ull * MB },

@@ -81,6 +84,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 682ull * MB },
         { MODEL_7B, 1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },

@@ -94,6 +98,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 512ull * MB },
         { MODEL_7B, 768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },

@@ -899,6 +904,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
 
 static const char *llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";

@@ -932,6 +938,7 @@ static void llama_model_load_internal(
 
     {
         switch (hparams.n_layer) {
+            case 26: model.type = e_model::MODEL_3B; break;
            case 32: model.type = e_model::MODEL_7B; break;
            case 40: model.type = e_model::MODEL_13B; break;
            case 60: model.type = e_model::MODEL_30B; break;
data/ext/llama_cpp/src/llama.h CHANGED

@@ -31,6 +31,11 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
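The new LLAMA_SUPPORTS_GPU_OFFLOAD macro lets callers detect at compile time whether the bundled llama.cpp was built with cuBLAS or CLBlast. A plausible use, sketched under the assumption that this release's llama_context_params carries the n_gpu_layers field (as upstream llama.cpp did in this period); make_params is a hypothetical helper:

#include "llama.h"

struct llama_context_params make_params(int requested_gpu_layers) {
    struct llama_context_params params = llama_context_default_params();
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    // GPU offload compiled in: ask for the requested number of layers in VRAM
    params.n_gpu_layers = requested_gpu_layers;
#else
    (void) requested_gpu_layers; // CPU-only build: the request is ignored
#endif
    return params;
}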
data/lib/llama_cpp/version.rb CHANGED

@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.
+  VERSION = '0.1.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-
+  LLAMA_CPP_VERSION = 'master-ffb06a3'
 end
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-06-03 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: