llama_cpp 0.4.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
|
4
|
+
data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
|
7
|
+
data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
|
2
|
+
|
3
|
+
- Bump bundled llama.cpp from master-b1140 to master-b1198.
|
4
|
+
|
5
|
+
## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
|
6
|
+
|
7
|
+
**Breaking Changes**
|
8
|
+
- Bump bundled llama.cpp from master-b1060 to master-b1140.
|
9
|
+
- Rename `token_to_str` method on Context to `token_to_piece` method.
|
10
|
+
- Rename `token_to_str` method on Model to `token_to_piece` method.
|
11
|
+
- Rename `type` method on Model to `desc` method.
|
12
|
+
- Add `size` and `n_params` methods to Model.
|
13
|
+
|
1
14
|
## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
|
2
15
|
|
3
16
|
**Breaking Changes**
|
data/examples/chat.rb
CHANGED
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
|
|
122
122
|
|
123
123
|
if input_echo
|
124
124
|
output = []
|
125
|
-
embd.each { |token| output << context.token_to_str(token) }
|
125
|
+
embd.each { |token| output << context.token_to_piece(token) }
|
126
126
|
output_str = output.join
|
127
127
|
output_str.chomp!(antiprompt) if first_input
|
128
128
|
print(output_str)
|
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
|
|
131
131
|
if embd_input.size <= n_consumed
|
132
132
|
if antiprompt.size.positive?
|
133
133
|
last_output = []
|
134
|
-
last_n_tokens.each { |token| last_output << context.token_to_str(token) }
|
134
|
+
last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
|
135
135
|
last_output_str = last_output.join
|
136
136
|
|
137
137
|
search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -50,7 +50,7 @@ if with_config('accelerate')
|
|
50
50
|
end
|
51
51
|
|
52
52
|
if with_config('metal')
|
53
|
-
$CFLAGS << ' -DGGML_USE_METAL
|
53
|
+
$CFLAGS << ' -DGGML_USE_METAL'
|
54
54
|
$CXXFLAGS << ' -DGGML_USE_METAL'
|
55
55
|
$LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
|
56
56
|
$objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -811,9 +811,11 @@ public:
|
|
811
811
|
rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
|
812
812
|
rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
|
813
813
|
rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
|
814
|
-
rb_define_method(rb_cLLaMAModel, "
|
814
|
+
rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
|
815
815
|
rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
|
816
|
-
rb_define_method(rb_cLLaMAModel, "
|
816
|
+
rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
|
817
|
+
rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
|
818
|
+
rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
|
817
819
|
}
|
818
820
|
|
819
821
|
private:
|
@@ -974,7 +976,7 @@ private:
|
|
974
976
|
return INT2NUM(llama_model_n_embd(ptr->model));
|
975
977
|
}
|
976
978
|
|
977
|
-
static VALUE
|
979
|
+
static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
|
978
980
|
if (!RB_INTEGER_TYPE_P(token_)) {
|
979
981
|
rb_raise(rb_eArgError, "token must be an integer");
|
980
982
|
return Qnil;
|
@@ -982,10 +984,10 @@ private:
|
|
982
984
|
const llama_token token = NUM2INT(token_);
|
983
985
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
984
986
|
std::vector<char> result(8, 0);
|
985
|
-
const int n_tokens =
|
987
|
+
const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
|
986
988
|
if (n_tokens < 0) {
|
987
989
|
result.resize(-n_tokens);
|
988
|
-
const int check =
|
990
|
+
const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
|
989
991
|
if (check != -n_tokens) {
|
990
992
|
rb_raise(rb_eRuntimeError, "failed to convert");
|
991
993
|
return Qnil;
|
@@ -1040,12 +1042,22 @@ private:
|
|
1040
1042
|
return ret;
|
1041
1043
|
}
|
1042
1044
|
|
1043
|
-
static VALUE
|
1045
|
+
static VALUE _llama_model_get_model_desc(VALUE self) {
|
1044
1046
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1045
1047
|
char buf[128];
|
1046
|
-
|
1048
|
+
llama_model_desc(ptr->model, buf, sizeof(buf));
|
1047
1049
|
return rb_str_new_cstr(buf);
|
1048
1050
|
}
|
1051
|
+
|
1052
|
+
static VALUE _llama_model_get_model_size(VALUE self) {
|
1053
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1054
|
+
return UINT2NUM(llama_model_size(ptr->model));
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
static VALUE _llama_model_get_model_n_params(VALUE self) {
|
1058
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1059
|
+
return UINT2NUM(llama_model_n_params(ptr->model));
|
1060
|
+
}
|
1049
1061
|
};
|
1050
1062
|
|
1051
1063
|
const rb_data_type_t RbLLaMAModel::llama_model_type = {
|
@@ -1326,7 +1338,7 @@ public:
|
|
1326
1338
|
rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
|
1327
1339
|
rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
|
1328
1340
|
rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
|
1329
|
-
rb_define_method(rb_cLLaMAContext, "
|
1341
|
+
rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
|
1330
1342
|
rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
|
1331
1343
|
rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
|
1332
1344
|
rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
|
@@ -1567,7 +1579,7 @@ private:
|
|
1567
1579
|
return output;
|
1568
1580
|
}
|
1569
1581
|
|
1570
|
-
static VALUE
|
1582
|
+
static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
|
1571
1583
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
1572
1584
|
if (ptr->ctx == NULL) {
|
1573
1585
|
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
@@ -1575,10 +1587,10 @@ private:
|
|
1575
1587
|
}
|
1576
1588
|
const llama_token token = NUM2INT(token_);
|
1577
1589
|
std::vector<char> result(8, 0);
|
1578
|
-
const int n_tokens =
|
1590
|
+
const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
|
1579
1591
|
if (n_tokens < 0) {
|
1580
1592
|
result.resize(-n_tokens);
|
1581
|
-
const int check =
|
1593
|
+
const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
|
1582
1594
|
if (check != -n_tokens) {
|
1583
1595
|
rb_raise(rb_eRuntimeError, "failed to convert");
|
1584
1596
|
return Qnil;
|
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
// defines MAP_ANONYMOUS
|
2
|
+
#ifndef _GNU_SOURCE
|
3
|
+
#define _GNU_SOURCE
|
4
|
+
#endif
|
5
|
+
|
1
6
|
#include "ggml-alloc.h"
|
2
7
|
#include "ggml.h"
|
3
8
|
#include <assert.h>
|
@@ -6,6 +11,26 @@
|
|
6
11
|
#include <stdlib.h>
|
7
12
|
#include <string.h>
|
8
13
|
|
14
|
+
#ifdef __has_include
|
15
|
+
#if __has_include(<unistd.h>)
|
16
|
+
#include <unistd.h>
|
17
|
+
#if defined(_POSIX_MAPPED_FILES)
|
18
|
+
#include <sys/types.h>
|
19
|
+
#include <sys/mman.h>
|
20
|
+
#endif
|
21
|
+
#endif
|
22
|
+
#endif
|
23
|
+
|
24
|
+
#if defined(_WIN32)
|
25
|
+
#define WIN32_LEAN_AND_MEAN
|
26
|
+
#ifndef NOMINMAX
|
27
|
+
#define NOMINMAX
|
28
|
+
#endif
|
29
|
+
#include <windows.h>
|
30
|
+
#include <memoryapi.h>
|
31
|
+
#endif
|
32
|
+
|
33
|
+
|
9
34
|
#define UNUSED(x) (void)(x)
|
10
35
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
11
36
|
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
@@ -99,15 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
|
99
124
|
}
|
100
125
|
#endif
|
101
126
|
|
102
|
-
|
103
|
-
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
127
|
+
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
104
128
|
return ggml_nbytes(tensor);
|
105
129
|
|
106
130
|
UNUSED(alloc);
|
107
131
|
}
|
108
132
|
|
133
|
+
// check if a tensor is allocated by this buffer
|
134
|
+
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
135
|
+
void * ptr = tensor->data;
|
136
|
+
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
137
|
+
}
|
138
|
+
|
109
139
|
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
110
|
-
|
140
|
+
#ifdef GGML_ALLOCATOR_DEBUG
|
141
|
+
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
142
|
+
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
143
|
+
#endif
|
144
|
+
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
111
145
|
size = aligned_offset(NULL, size, alloc->alignment);
|
112
146
|
|
113
147
|
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
@@ -131,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
|
131
165
|
if (best_fit_block == -1) {
|
132
166
|
// the last block is our last resort
|
133
167
|
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
|
168
|
+
max_avail = MAX(max_avail, block->size);
|
134
169
|
if (block->size >= size) {
|
135
170
|
best_fit_block = alloc->n_free_blocks - 1;
|
136
|
-
max_avail = MAX(max_avail, block->size);
|
137
171
|
} else {
|
138
172
|
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
139
173
|
__func__, size, max_avail);
|
140
174
|
GGML_ASSERT(!"not enough space in the buffer");
|
141
|
-
|
175
|
+
return;
|
142
176
|
}
|
143
177
|
}
|
144
178
|
struct free_block * block = &alloc->free_blocks[best_fit_block];
|
@@ -173,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
|
173
207
|
}
|
174
208
|
|
175
209
|
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
176
|
-
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
210
|
+
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
177
211
|
void * ptr = tensor->data;
|
178
212
|
|
179
|
-
if (
|
213
|
+
if (ggml_allocr_is_own(alloc, tensor) == false) {
|
180
214
|
// the tensor was not allocated in this buffer
|
181
215
|
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
182
216
|
// the easiest way to deal with this is just to ignore it
|
183
217
|
return;
|
184
218
|
}
|
185
219
|
|
186
|
-
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
220
|
+
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
187
221
|
size = aligned_offset(NULL, size, alloc->alignment);
|
188
222
|
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
189
223
|
|
@@ -268,7 +302,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
|
268
302
|
/*.parse_seq = */ {0},
|
269
303
|
/*.parse_seq_len = */ 0,
|
270
304
|
#ifdef GGML_ALLOCATOR_DEBUG
|
271
|
-
/*.allocated_tensors = */
|
305
|
+
/*.allocated_tensors = */ {0},
|
272
306
|
#endif
|
273
307
|
};
|
274
308
|
|
@@ -277,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
|
277
311
|
return alloc;
|
278
312
|
}
|
279
313
|
|
280
|
-
//
|
281
|
-
|
282
|
-
|
283
|
-
|
314
|
+
// OS specific functions to allocate and free uncommitted virtual memory
|
315
|
+
static void * alloc_vmem(size_t size) {
|
316
|
+
#if defined(_WIN32)
|
317
|
+
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
|
318
|
+
#elif defined(_POSIX_MAPPED_FILES)
|
319
|
+
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
|
320
|
+
if (ptr == MAP_FAILED) {
|
321
|
+
return NULL;
|
322
|
+
}
|
323
|
+
return ptr;
|
324
|
+
#else
|
325
|
+
// use a fixed address for other platforms
|
326
|
+
uintptr_t base_addr = (uintptr_t)-size - 0x100;
|
327
|
+
return (void *)base_addr;
|
328
|
+
#endif
|
329
|
+
}
|
330
|
+
|
331
|
+
static void free_vmem(void * base_addr, size_t size) {
|
332
|
+
#if defined(_WIN32)
|
333
|
+
VirtualFree(base_addr, 0, MEM_RELEASE);
|
334
|
+
UNUSED(size);
|
335
|
+
#elif defined(_POSIX_MAPPED_FILES)
|
336
|
+
munmap(base_addr, size);
|
337
|
+
#else
|
338
|
+
// nothing to do
|
339
|
+
UNUSED(base_addr);
|
340
|
+
UNUSED(size);
|
341
|
+
#endif
|
342
|
+
}
|
343
|
+
|
344
|
+
// allocate uncommitted virtual memory to measure the size of the graph
|
345
|
+
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
346
|
+
// 1TB for 64-bit, 1GB for 32-bit
|
347
|
+
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
|
348
|
+
do {
|
349
|
+
*base_addr = alloc_vmem(*size);
|
350
|
+
if (*base_addr != NULL) {
|
351
|
+
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
|
352
|
+
return;
|
353
|
+
}
|
354
|
+
// try again with half the size
|
355
|
+
*size /= 2;
|
356
|
+
} while (*size > 0);
|
357
|
+
|
358
|
+
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
|
359
|
+
}
|
360
|
+
|
361
|
+
static void free_measure_vmem(void * base_addr, size_t size) {
|
362
|
+
free_vmem(base_addr, size);
|
363
|
+
}
|
284
364
|
|
285
365
|
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
286
366
|
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
287
367
|
|
368
|
+
void * base_addr;
|
369
|
+
size_t size;
|
370
|
+
|
371
|
+
alloc_measure_vmem(&base_addr, &size);
|
372
|
+
|
288
373
|
*alloc = (struct ggml_allocr){
|
289
|
-
/*.data = */
|
290
|
-
/*.size = */
|
374
|
+
/*.data = */ base_addr,
|
375
|
+
/*.size = */ size,
|
291
376
|
/*.alignment = */ alignment,
|
292
377
|
/*.n_free_blocks = */ 0,
|
293
378
|
/*.free_blocks = */ {{0}},
|
@@ -297,7 +382,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
|
297
382
|
/*.parse_seq = */ {0},
|
298
383
|
/*.parse_seq_len = */ 0,
|
299
384
|
#ifdef GGML_ALLOCATOR_DEBUG
|
300
|
-
/*.allocated_tensors = */
|
385
|
+
/*.allocated_tensors = */ {0},
|
301
386
|
#endif
|
302
387
|
};
|
303
388
|
|
@@ -307,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
|
307
392
|
}
|
308
393
|
|
309
394
|
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
395
|
+
if (alloc->measure) {
|
396
|
+
free_measure_vmem(alloc->data, alloc->size);
|
397
|
+
}
|
310
398
|
free(alloc);
|
311
399
|
}
|
312
400
|
|
@@ -317,8 +405,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
|
317
405
|
//////////// compute graph allocator
|
318
406
|
|
319
407
|
static bool ggml_is_view(struct ggml_tensor * t) {
|
320
|
-
return t->
|
321
|
-
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
408
|
+
return t->view_src != NULL;
|
322
409
|
}
|
323
410
|
|
324
411
|
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
@@ -336,28 +423,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
|
|
336
423
|
return true;
|
337
424
|
}
|
338
425
|
|
339
|
-
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
|
340
|
-
switch (t->op) {
|
341
|
-
case GGML_OP_PERMUTE:
|
342
|
-
case GGML_OP_RESHAPE:
|
343
|
-
case GGML_OP_TRANSPOSE:
|
344
|
-
case GGML_OP_VIEW:
|
345
|
-
return t->src[0];
|
346
|
-
case GGML_OP_CPY:
|
347
|
-
return t->src[1];
|
348
|
-
default:
|
349
|
-
return NULL;
|
350
|
-
}
|
351
|
-
}
|
352
|
-
|
353
|
-
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
|
354
|
-
struct ggml_tensor * parent = t;
|
355
|
-
do {
|
356
|
-
parent = get_view_parent(parent);
|
357
|
-
} while (ggml_is_view(parent));
|
358
|
-
return parent;
|
359
|
-
}
|
360
|
-
|
361
426
|
static bool ggml_op_can_inplace(enum ggml_op op) {
|
362
427
|
switch (op) {
|
363
428
|
case GGML_OP_SCALE:
|
@@ -365,7 +430,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
|
365
430
|
case GGML_OP_DIAG_MASK_INF:
|
366
431
|
case GGML_OP_ADD:
|
367
432
|
case GGML_OP_ADD1:
|
368
|
-
case GGML_OP_ACC:
|
369
433
|
case GGML_OP_SUB:
|
370
434
|
case GGML_OP_MUL:
|
371
435
|
case GGML_OP_DIV:
|
@@ -375,7 +439,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
|
375
439
|
case GGML_OP_UNARY:
|
376
440
|
case GGML_OP_ROPE:
|
377
441
|
case GGML_OP_RMS_NORM:
|
378
|
-
case GGML_OP_SET:
|
379
442
|
case GGML_OP_SOFT_MAX:
|
380
443
|
case GGML_OP_CONT:
|
381
444
|
return true;
|
@@ -389,24 +452,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
389
452
|
struct hash_node * ht = alloc->hash_table;
|
390
453
|
if (node->data == NULL) {
|
391
454
|
if (ggml_is_view(node)) {
|
392
|
-
|
393
|
-
|
394
|
-
case GGML_OP_VIEW:
|
395
|
-
memcpy(&offset, node->op_params, sizeof(size_t));
|
396
|
-
node->data = (char *) node->src[0]->data + offset;
|
397
|
-
break;
|
398
|
-
case GGML_OP_PERMUTE:
|
399
|
-
case GGML_OP_RESHAPE:
|
400
|
-
case GGML_OP_TRANSPOSE:
|
401
|
-
node->data = node->src[0]->data;
|
402
|
-
break;
|
403
|
-
case GGML_OP_CPY:
|
404
|
-
node->data = node->src[1]->data;
|
405
|
-
break;
|
406
|
-
default:
|
407
|
-
GGML_ASSERT(!"unknown view op");
|
408
|
-
break;
|
409
|
-
}
|
455
|
+
assert(node->view_src->data != NULL);
|
456
|
+
node->data = (char *)node->view_src->data + node->view_offs;
|
410
457
|
} else {
|
411
458
|
// see if we can reuse a parent's buffer (inplace)
|
412
459
|
if (ggml_op_can_inplace(node->op)) {
|
@@ -417,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
417
464
|
}
|
418
465
|
|
419
466
|
// if the node's data is external, then we cannot re-use it
|
420
|
-
if ((
|
421
|
-
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
|
467
|
+
if (ggml_allocr_is_own(alloc, parent) == false) {
|
422
468
|
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
|
423
469
|
continue;
|
424
470
|
}
|
@@ -426,7 +472,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
426
472
|
struct hash_node * p_hn = hash_get(ht, parent);
|
427
473
|
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
|
428
474
|
if (ggml_is_view(parent)) {
|
429
|
-
struct ggml_tensor * view_src = get_view_source(parent);
|
475
|
+
struct ggml_tensor * view_src = parent->view_src;
|
430
476
|
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
431
477
|
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
432
478
|
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
@@ -452,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
452
498
|
}
|
453
499
|
}
|
454
500
|
|
455
|
-
static size_t ggml_allocator_alloc_graph_tensors_n(
|
501
|
+
static size_t ggml_allocr_alloc_graph_tensors_n(
|
456
502
|
struct ggml_allocr * alloc,
|
457
503
|
struct ggml_cgraph ** graphs, int n_graphs,
|
458
504
|
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
@@ -468,7 +514,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|
468
514
|
struct ggml_tensor * node = gf->nodes[i];
|
469
515
|
|
470
516
|
if (ggml_is_view(node)) {
|
471
|
-
struct ggml_tensor * view_src = get_view_source(node);
|
517
|
+
struct ggml_tensor * view_src = node->view_src;
|
472
518
|
hash_get(ht, view_src)->n_views += 1;
|
473
519
|
}
|
474
520
|
|
@@ -530,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|
530
576
|
AT_PRINTF("\n");
|
531
577
|
}
|
532
578
|
|
533
|
-
|
534
579
|
// update parents
|
535
580
|
// update immediately if there is no parse_seq
|
536
581
|
// update only at barriers if there is parse_seq
|
537
|
-
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
|
582
|
+
if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
|
538
583
|
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
|
539
584
|
int update_end = alloc->parse_seq_len ? ind : ind + 1;
|
540
585
|
for (int i = update_start; i < update_end; i++) {
|
@@ -553,17 +598,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|
553
598
|
|
554
599
|
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
555
600
|
if (ggml_is_view(parent)) {
|
556
|
-
struct ggml_tensor * view_src = get_view_source(parent);
|
601
|
+
struct ggml_tensor * view_src = parent->view_src;
|
557
602
|
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
558
603
|
view_src_hn->n_views -= 1;
|
559
|
-
AT_PRINTF("view_src %s\n", view_src->name);
|
604
|
+
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
560
605
|
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
561
|
-
|
606
|
+
ggml_allocr_free_tensor(alloc, view_src);
|
562
607
|
}
|
563
608
|
}
|
564
609
|
else {
|
565
610
|
if (parent->data != node->data) {
|
566
|
-
|
611
|
+
ggml_allocr_free_tensor(alloc, parent);
|
567
612
|
}
|
568
613
|
}
|
569
614
|
}
|
@@ -580,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|
580
625
|
for (int i = 0; outputs[g][i] != NULL; i++) {
|
581
626
|
struct ggml_tensor * output = outputs[g][i];
|
582
627
|
AT_PRINTF("output: %s\n", output->name);
|
583
|
-
|
628
|
+
ggml_allocr_free_tensor(alloc, output);
|
584
629
|
}
|
585
630
|
}
|
586
631
|
}
|
@@ -589,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
|
589
634
|
}
|
590
635
|
|
591
636
|
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
592
|
-
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
637
|
+
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
593
638
|
}
|