llama_cpp 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +44 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1398 -702
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +112 -146
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/ggml.c +396 -150
- data/ext/llama_cpp/src/ggml.h +113 -32
- data/ext/llama_cpp/src/llama-util.h +51 -9
- data/ext/llama_cpp/src/llama.cpp +390 -210
- data/ext/llama_cpp/src/llama.h +20 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8a6623a24970936369944231171226dda1ce579bf29fc3711f8923c8d2d22cba
+  data.tar.gz: dbff8f38ea54195b05fc0acbaf8fceb7fd6bfdc329100a18665ef2cba2fd5d81
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 710ab86cfea7b5f91a386bdf87872c1d19ba49057bc02aa11a4f0198aee404a2d5b931965fdeba40aa1353269f95a451090e261305931e31a182a078827ace80
+  data.tar.gz: ec4d956b5ab5ad665a0e99489b81b364b79ed39e74146629e4140240b5e176f4ef9dbf3d1c11acdb4098398114fbf055a2ad4f8251ed98ec42471a478f6dcaa2
data/CHANGELOG.md
CHANGED

@@ -1,3 +1,14 @@
+## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19
+
+- Bump bundled llama.cpp from master-9ca4abe to master-097e121.
+- Add `type` method to Model.
+- Revert pull request #2592 in llama.cpp.
+  It seems that PWIN32_MEMORY_RANGE_ENTRY and WIN32_MEMORY_RANGE_ENTRY do not exist in mingw.
+
+## [[0.3.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.6...v0.3.7)] - 2023-08-12
+
+- Bump bundled llama.cpp from master-468ea24 to master-9ca4abe.
+
 ## [[0.3.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.5...v0.3.6)] - 2023-08-04
 
 - Bump bundled llama.cpp from master-1a94186 to master-468ea24.
data/ext/llama_cpp/extconf.rb
CHANGED

@@ -52,8 +52,8 @@ end
 if with_config('metal')
   $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
   $CXXFLAGS << ' -DGGML_USE_METAL'
-  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
-  $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
+  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
+  $objs = %w[ggml.o ggml-alloc.o llama.o llama_cpp.o ggml-metal.o]
   $objs << 'k_quants.o' unless with_config('no_k_quants')
 end
 
data/ext/llama_cpp/llama_cpp.cpp
CHANGED

@@ -814,6 +814,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
     rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
+    rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
   }
 
 private:
@@ -1061,6 +1062,13 @@ private:
     RB_GC_GUARD(text_);
     return ret;
   }
+
+  static VALUE _llama_model_get_model_type(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    char buf[128];
+    ::llama_model_type(ptr->model, buf, sizeof(buf));
+    return rb_str_new_cstr(buf);
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
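For reference, a minimal sketch in C of what the new `type` binding (exposed to Ruby as `LLaMACpp::Model#type`) does under the hood: it forwards to `llama_model_type()` from the bundled llama.h, which writes a short human-readable model description into a caller-supplied buffer. The loading boilerplate below is an assumption based on the llama.cpp API bundled at master-097e121, not code from this gem:

```c
// Hedged sketch; assumes the llama.h API bundled at master-097e121.
#include <stdio.h>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s MODEL_PATH\n", argv[0]);
        return 1;
    }
    llama_backend_init(false); // false: skip NUMA initialization

    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file(argv[1], params);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    // The same call the binding makes in _llama_model_get_model_type.
    char buf[128];
    llama_model_type(model, buf, sizeof(buf));
    printf("%s\n", buf);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```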
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED

@@ -67,6 +67,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -111,10 +113,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     size_t max_avail = 0;
 
-    // find the best fitting free block
+    // find the best fitting free block besides the last block
     int best_fit_block = -1;
     size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         max_avail = MAX(max_avail, block->size);
         if (block->size >= size && block->size <= best_fit_size) {
@@ -126,10 +128,17 @@
     AT_PRINTF("block %d\n", best_fit_block);
 
     if (best_fit_block == -1) {
-        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
             return;
+        }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
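Taken together, the two hunks above change the allocation policy: the best-fit scan now skips the final free block, which is consumed only as a last resort, so the large block at the end of the buffer is preserved for as long as possible. A standalone toy of the same selection policy, using a hypothetical simplified struct rather than ggml's real `free_block`:

```c
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical stand-in for ggml's free_block (addr field omitted).
struct free_block { size_t size; };

// Pick a block for an allocation of `size` bytes: best fit among
// blocks[0..n-2]; blocks[n-1] is used only as a last resort, mirroring
// the new ggml-alloc behaviour.
static int pick_block(const struct free_block * blocks, int n, size_t size) {
    int best = -1;
    size_t best_size = SIZE_MAX;
    for (int i = 0; i < n - 1; i++) {
        if (blocks[i].size >= size && blocks[i].size <= best_size) {
            best = i;
            best_size = blocks[i].size;
        }
    }
    if (best == -1 && blocks[n - 1].size >= size) {
        best = n - 1; // the last block is our last resort
    }
    return best; // -1 means "not enough space in the buffer"
}

int main(void) {
    struct free_block blocks[] = {{64}, {256}, {1 << 20}};
    printf("%d\n", pick_block(blocks, 3, 128));              // 1: best fit
    printf("%d\n", pick_block(blocks, 3, 4096));             // 2: last resort
    printf("%d\n", pick_block(blocks, 3, (size_t)1 << 21));  // -1: no space
    return 0;
}
```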
@@ -229,6 +238,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     alloc->n_free_blocks++;
 }
 
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
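Note that the new setter compacts its input while copying: `-1` entries are dropped, so callers can pad a schedule with `-1` sentinels. A standalone toy of the same compaction loop (hypothetical names, not the ggml structs):

```c
#include <stdio.h>

// Toy version of the copy loop in ggml_allocr_set_parse_seq: copy list[]
// into seq[], skipping -1 sentinel entries.
static int compact_parse_seq(int * seq, const int * list, int n) {
    int pos = 0;
    for (int i = 0; i < n; i++) {
        if (list[i] != -1) {
            seq[pos] = list[i];
            pos++;
        }
    }
    return pos; // number of entries actually stored
}

int main(void) {
    int seq[16];
    int list[] = {2, -1, 0, 1, -1, 3}; // -1 marks entries to skip
    int n = compact_parse_seq(seq, list, 6);
    for (int i = 0; i < n; i++) {
        printf("%d ", seq[i]); // prints: 2 0 1 3
    }
    printf("\n");
    return 0;
}
```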
@@ -248,6 +268,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     /*.hash_table    = */ {{0}},
     /*.max_size      = */ 0,
     /*.measure       = */ false,
+    /*.parse_seq     = */ {0},
+    /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
     /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +297,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     /*.hash_table    = */ {{0}},
     /*.max_size      = */ 0,
     /*.measure       = */ true,
+    /*.parse_seq     = */ {0},
+    /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
     /*.allocated_tensors = */ = {0},
 #endif
@@ -394,6 +418,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
         if (parent == NULL) {
             break;
         }
+
+        // if the node's data is external, then we cannot re-use it
+        if ((char *) parent->data < (char *) alloc->data ||
+            (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+            AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+            continue;
+        }
+
         struct hash_node * p_hn = hash_get(ht, parent);
         if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
             if (ggml_is_view(parent)) {
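The added guard treats a parent tensor as external when its data pointer lies outside the allocator's own buffer, i.e. outside `[alloc->data, alloc->data + alloc->size)`; such memory cannot be reused for the current node. A toy version of that range test:

```c
#include <stdbool.h>
#include <stdio.h>
#include <stddef.h>

// Toy version of the new check in allocate_node(): data is "external"
// when it falls outside the half-open range [base, base + size).
static bool is_external(const void * p, const void * base, size_t size) {
    return (const char *) p < (const char *) base ||
           (const char *) p >= (const char *) base + size;
}

int main(void) {
    char buffer[256];
    printf("%d\n", is_external(buffer + 16, buffer, sizeof(buffer)));  // 0: inside
    printf("%d\n", is_external(buffer + 256, buffer, sizeof(buffer))); // 1: past the end
    return 0;
}
```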
@@ -465,7 +497,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             allocate_node(alloc, input);
         }
     }
-    for (int i = 0; i < gf->n_nodes; i++) {
+    for (int ind = 0; ind < gf->n_nodes; ind++) {
+        int i;
+        if (alloc->has_parse_seq) {
+            i = alloc->parse_seq[ind];
+        } else {
+            i = ind;
+        }
         struct ggml_tensor * node = gf->nodes[i];
 
         // allocate parents (leafs)
data/ext/llama_cpp/src/ggml-alloc.h
CHANGED

@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
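Finally, a hedged usage sketch for the new entry point, combined with the measuring allocator declared just above it. The node order below is invented for illustration (a real caller would derive it from its backend's execution schedule), and `ggml_allocr_alloc_graph` is assumed from the same header:

```c
#include "ggml.h"
#include "ggml-alloc.h"

// Sketch: size a graph with a measure allocator while walking its nodes
// in a custom order.
static size_t measure_with_custom_order(struct ggml_cgraph * gf) {
    struct ggml_allocr * alloc = ggml_allocr_new_measure(32 /* alignment */);

    // -1 entries are dropped by ggml_allocr_set_parse_seq (see ggml-alloc.c).
    int order[3] = {1, -1, 0}; // hypothetical: visit node 1 before node 0
    ggml_allocr_set_parse_seq(alloc, order, 3);

    size_t needed = ggml_allocr_alloc_graph(alloc, gf); // required buffer size
    ggml_allocr_free(alloc);
    return needed;
}
```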