llama_cpp 0.3.7 → 0.3.8
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +36 -6
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +680 -428
- data/ext/llama_cpp/src/ggml-cuda.h +19 -23
- data/ext/llama_cpp/src/ggml-metal.h +6 -3
- data/ext/llama_cpp/src/ggml-metal.m +73 -128
- data/ext/llama_cpp/src/ggml-metal.metal +471 -498
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +176 -64
- data/ext/llama_cpp/src/llama.h +3 -1
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8a6623a24970936369944231171226dda1ce579bf29fc3711f8923c8d2d22cba
+  data.tar.gz: dbff8f38ea54195b05fc0acbaf8fceb7fd6bfdc329100a18665ef2cba2fd5d81
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 710ab86cfea7b5f91a386bdf87872c1d19ba49057bc02aa11a4f0198aee404a2d5b931965fdeba40aa1353269f95a451090e261305931e31a182a078827ace80
+  data.tar.gz: ec4d956b5ab5ad665a0e99489b81b364b79ed39e74146629e4140240b5e176f4ef9dbf3d1c11acdb4098398114fbf055a2ad4f8251ed98ec42471a478f6dcaa2
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
+## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19
+
+- Bump bundled llama.cpp from master-9ca4abe to master-097e121.
+- Add `type` method to Model.
+- Revert pull request #2592 in llama.cpp.
+  It seems that PWIN32_MEMORY_RANGE_ENTRY and WIN32_MEMORY_RANGE_ENTRY do not exist in mingw.
+
 ## [[0.3.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.6...v0.3.7)] - 2023-08-12

 - Bump bundled llama.cpp from master-468ea24 to master-9ca4abe.
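For background on the reverted change, here is a hedged reconstruction in C (not the exact patch) of the PrefetchVirtualMemory call that llama.cpp's PR #2592 added to the mmap helper in llama-util.h. WIN32_MEMORY_RANGE_ENTRY is only declared by Windows 8+ SDK headers, which mingw may not provide, hence the revert:

#include <stdio.h>
#if defined(_WIN32)
#include <windows.h>
#endif

// Advise the kernel to preload a mapping. The guarded types and
// PrefetchVirtualMemory itself require Windows 8+ headers, which is
// exactly what broke mingw builds.
static void prefetch_mapping(void * addr, size_t size) {
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0602)
    WIN32_MEMORY_RANGE_ENTRY range;
    range.VirtualAddress = addr;
    range.NumberOfBytes  = (SIZE_T)size;
    if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
        fprintf(stderr, "warning: PrefetchVirtualMemory failed\n");
    }
#else
    (void)addr; (void)size; // no-op on other platforms
#endif
}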
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -52,8 +52,8 @@ end
 if with_config('metal')
   $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
   $CXXFLAGS << ' -DGGML_USE_METAL'
-  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit
-  $objs = %w[ggml.o llama.o llama_cpp.o ggml-metal.o]
+  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
+  $objs = %w[ggml.o ggml-alloc.o llama.o llama_cpp.o ggml-metal.o]
   $objs << 'k_quants.o' unless with_config('no_k_quants')
 end
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -814,6 +814,7 @@ public:
       rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
       rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
       rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
+      rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
     }

   private:
@@ -1061,6 +1062,13 @@ private:
     RB_GC_GUARD(text_);
     return ret;
   }
+
+  static VALUE _llama_model_get_model_type(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    char buf[128];
+    ::llama_model_type(ptr->model, buf, sizeof(buf));
+    return rb_str_new_cstr(buf);
+  }
 };

 const rb_data_type_t RbLLaMAModel::llama_model_type = {
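For context, a minimal C sketch of what the new Model#type method does under the hood. The llama_model_type signature is taken from the binding above; the model path and the loading calls around it are illustrative:

#include <stdio.h>
#include "llama.h"

int main(void) {
    struct llama_context_params params = llama_context_default_params();
    // hypothetical model path; any model file the library accepts would do
    struct llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        return 1;
    }

    char buf[128];
    llama_model_type(model, buf, sizeof(buf)); // fills buf with a short model description
    printf("model type: %s\n", buf);

    llama_free_model(model);
    return 0;
}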
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -67,6 +67,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;

 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -111,10 +113,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)

     size_t max_avail = 0;

-    // find the best fitting free block
+    // find the best fitting free block besides the last block
     int best_fit_block = -1;
     size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         max_avail = MAX(max_avail, block->size);
         if (block->size >= size && block->size <= best_fit_size) {
@@ -126,10 +128,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     AT_PRINTF("block %d\n", best_fit_block);

     if (best_fit_block == -1) {
-        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
             return;
+        }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
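To make the revised strategy concrete, here is a small illustrative sketch (not the library's code) of a best-fit search that skips the last free block and falls back to it only when nothing else fits, so the large tail block is not fragmented by small allocations:

#include <stddef.h>

struct toy_block { size_t size; };

// Returns the index of the chosen block, or -1 if nothing fits.
static int pick_block(const struct toy_block * blocks, int n, size_t size) {
    int best = -1;
    size_t best_size = (size_t)-1;
    for (int i = 0; i < n - 1; i++) {             // best fit, excluding the last block
        if (blocks[i].size >= size && blocks[i].size <= best_size) {
            best = i;
            best_size = blocks[i].size;
        }
    }
    if (best == -1 && n > 0 && blocks[n - 1].size >= size) {
        best = n - 1;                             // the last block is the last resort
    }
    return best;
}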
@@ -229,6 +238,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }

+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
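As the loop above shows, -1 entries in the list are treated as separators and compacted out. A short hedged usage sketch, with a made-up node order:

#include "ggml-alloc.h"

// Hand the allocator a hypothetical evaluation order. The -1 entries
// (e.g. barriers between graph splits) are skipped, so parse_seq ends up
// densely packed as {0, 2, 1, 4, 3} and has_parse_seq becomes true.
static void set_example_order(struct ggml_allocr * alloc) {
    int order[] = { 0, 2, 1, -1, 4, 3, -1 };
    ggml_allocr_set_parse_seq(alloc, order, 7);
}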
@@ -248,6 +268,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     /*.hash_table = */ {{0}},
     /*.max_size = */ 0,
     /*.measure = */ false,
+    /*.parse_seq = */ {0},
+    /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
     /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +297,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     /*.hash_table = */ {{0}},
     /*.max_size = */ 0,
     /*.measure = */ true,
+    /*.parse_seq = */ {0},
+    /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
     /*.allocated_tensors = */ = {0},
 #endif
@@ -473,7 +497,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             allocate_node(alloc, input);
         }
     }
-    for (int i = 0; i < gf->n_nodes; i++) {
+    for (int ind = 0; ind < gf->n_nodes; ind++) {
+        int i;
+        if (alloc->has_parse_seq) {
+            i = alloc->parse_seq[ind];
+        } else {
+            i = ind;
+        }
         struct ggml_tensor * node = gf->nodes[i];

         // allocate parents (leafs)
data/ext/llama_cpp/src/ggml-alloc.h
CHANGED
@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
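Putting the new API together, a hedged sketch of how a caller might wire a custom evaluation order into graph allocation. The buffer size, alignment value, and order array are illustrative, and the sketch assumes the allocator's existing ggml_allocr_alloc_graph entry point:

#include <stdint.h>
#include "ggml.h"
#include "ggml-alloc.h"

static uint8_t buffer[16 * 1024 * 1024]; // illustrative scratch buffer

// Allocate the tensors of gf from `buffer`, honoring a custom
// evaluation order (with -1 separators) when one is supplied.
static void allocate_graph(struct ggml_cgraph * gf, int * order, int n) {
    struct ggml_allocr * alloc = ggml_allocr_new(buffer, sizeof(buffer), 32);
    if (order != NULL) {
        ggml_allocr_set_parse_seq(alloc, order, n);
    }
    ggml_allocr_alloc_graph(alloc, gf); // assigns tensor->data within buffer
    ggml_allocr_free(alloc);            // tensor data stays valid in `buffer`
}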