llama_cpp 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-metadata.gz:
-data.tar.gz:
+metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
+data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
 SHA512:
-metadata.gz:
-data.tar.gz:
+metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
+data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
+## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+- Bump bundled llama.cpp from master-b1140 to master-b1198.
+
+## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
+
+**Breaking Changes**
+- Bump bundled llama.cpp from master-b1060 to master-b1140.
+- Rename `token_to_str` method on Context to `token_to_piece` method.
+- Rename `token_to_str` method on Model to `token_to_piece` method.
+- Rename `type` method on Model to `desc` method.
+- Add `size` and `n_params` methods to Model.
+
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
data/examples/chat.rb
CHANGED
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
 if input_echo
 output = []
-embd.each { |token| output << context.token_to_str(token) }
+embd.each { |token| output << context.token_to_piece(token) }
 output_str = output.join
 output_str.chomp!(antiprompt) if first_input
 print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 if embd_input.size <= n_consumed
 if antiprompt.size.positive?
 last_output = []
-last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
 last_output_str = last_output.join
 
 search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
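Note: this change tracks the 0.5.0 rename of `Context#token_to_str` to `Context#token_to_piece`. A minimal detokenization sketch with the renamed method follows; the model path, the constructor keywords (`model_path:`, `params:`, `model:`), and the `tokenize` keywords are assumptions taken from the gem's bundled example, not guaranteed by this diff.

require 'llama_cpp'

params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: params) # hypothetical model path
context = LLaMACpp::Context.new(model: model)

tokens = context.tokenize(text: 'Hello', add_bos: true) # keyword arguments assumed from examples/chat.rb
# 0.5.x: Context#token_to_piece replaces the former Context#token_to_str
pieces = tokens.map { |token| context.token_to_piece(token) }
puts pieces.join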
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -50,7 +50,7 @@ if with_config('accelerate')
 end
 
 if with_config('metal')
-$CFLAGS << ' -DGGML_USE_METAL
+$CFLAGS << ' -DGGML_USE_METAL'
 $CXXFLAGS << ' -DGGML_USE_METAL'
 $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
 $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -811,9 +811,11 @@ public:
 rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
 rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
 rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
-rb_define_method(rb_cLLaMAModel, "
+rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
 rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
-rb_define_method(rb_cLLaMAModel, "
+rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
+rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
+rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
 }
 
 private:
@@ -974,7 +976,7 @@ private:
 return INT2NUM(llama_model_n_embd(ptr->model));
 }
 
-static VALUE
+static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
 if (!RB_INTEGER_TYPE_P(token_)) {
 rb_raise(rb_eArgError, "token must be an integer");
 return Qnil;
@@ -982,10 +984,10 @@ private:
 const llama_token token = NUM2INT(token_);
 LLaMAModelWrapper* ptr = get_llama_model(self);
 std::vector<char> result(8, 0);
-const int n_tokens =
+const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
 if (n_tokens < 0) {
 result.resize(-n_tokens);
-const int check =
+const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
 if (check != -n_tokens) {
 rb_raise(rb_eRuntimeError, "failed to convert");
 return Qnil;
@@ -1040,12 +1042,22 @@ private:
 return ret;
 }
 
-static VALUE
+static VALUE _llama_model_get_model_desc(VALUE self) {
 LLaMAModelWrapper* ptr = get_llama_model(self);
 char buf[128];
-
+llama_model_desc(ptr->model, buf, sizeof(buf));
 return rb_str_new_cstr(buf);
 }
+
+static VALUE _llama_model_get_model_size(VALUE self) {
+LLaMAModelWrapper* ptr = get_llama_model(self);
+return UINT2NUM(llama_model_size(ptr->model));
+}
+
+static VALUE _llama_model_get_model_n_params(VALUE self) {
+LLaMAModelWrapper* ptr = get_llama_model(self);
+return UINT2NUM(llama_model_n_params(ptr->model));
+}
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1326,7 +1338,7 @@ public:
 rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
 rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
 rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
-rb_define_method(rb_cLLaMAContext, "
+rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
 rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
 rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
 rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
@@ -1567,7 +1579,7 @@ private:
 return output;
 }
 
-static VALUE
+static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
 LLaMAContextWrapper* ptr = get_llama_context(self);
 if (ptr->ctx == NULL) {
 rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
@@ -1575,10 +1587,10 @@ private:
 }
 const llama_token token = NUM2INT(token_);
 std::vector<char> result(8, 0);
-const int n_tokens =
+const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
 if (n_tokens < 0) {
 result.resize(-n_tokens);
-const int check =
+const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
 if (check != -n_tokens) {
 rb_raise(rb_eRuntimeError, "failed to convert");
 return Qnil;
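The hunks above register the renamed `token_to_piece` binding and the new `desc`, `size`, and `n_params` bindings on Model, backed by `llama_token_to_piece_with_model`, `llama_model_desc`, `llama_model_size`, and `llama_model_n_params`. A hedged Ruby sketch of calling them follows; the constructor keywords and the model path are assumptions, not taken from this diff.

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model  = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: params) # hypothetical model path

puts model.desc      # model description string (renamed from Model#type in 0.5.0)
puts model.size      # model size in bytes, as reported by llama_model_size
puts model.n_params  # parameter count, as reported by llama_model_n_params
puts model.token_to_piece(0) # piece for an arbitrary token id (renamed from Model#token_to_str)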
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,3 +1,8 @@
+// defines MAP_ANONYMOUS
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -6,6 +11,26 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/types.h>
+#include <sys/mman.h>
+#endif
+#endif
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,15 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 return ggml_nbytes(tensor);
 
 UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+void * ptr = tensor->data;
+return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-
+#ifdef GGML_ALLOCATOR_DEBUG
+GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
+GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+#endif
+size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
 size = aligned_offset(NULL, size, alloc->alignment);
 
 AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -131,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 if (best_fit_block == -1) {
 // the last block is our last resort
 struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+max_avail = MAX(max_avail, block->size);
 if (block->size >= size) {
 best_fit_block = alloc->n_free_blocks - 1;
-max_avail = MAX(max_avail, block->size);
 } else {
 fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
 __func__, size, max_avail);
 GGML_ASSERT(!"not enough space in the buffer");
-
+return;
 }
 }
 struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -173,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 void * ptr = tensor->data;
 
-if (
+if (ggml_allocr_is_own(alloc, tensor) == false) {
 // the tensor was not allocated in this buffer
 // this can happen because the graph allocator will try to free weights and other tensors from different buffers
 // the easiest way to deal with this is just to ignore it
 return;
 }
 
-size_t size =
+size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
 size = aligned_offset(NULL, size, alloc->alignment);
 AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
 
@@ -268,7 +302,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
 /*.parse_seq = */ {0},
 /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-/*.allocated_tensors = */
+/*.allocated_tensors = */ {0},
 #endif
 };
 
@@ -277,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
 return alloc;
 }
 
-//
-
-
-
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+if (ptr == MAP_FAILED) {
+return NULL;
+}
+return ptr;
+#else
+// use a fixed address for other platforms
+uintptr_t base_addr = (uintptr_t)-size - 0x100;
+return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+VirtualFree(base_addr, 0, MEM_RELEASE);
+UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+munmap(base_addr, size);
+#else
+// nothing to do
+UNUSED(base_addr);
+UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+// 1TB for 64-bit, 1GB for 32-bit
+*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+do {
+*base_addr = alloc_vmem(*size);
+if (*base_addr != NULL) {
+AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+return;
+}
+// try again with half the size
+*size /= 2;
+} while (*size > 0);
+
+GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+void * base_addr;
+size_t size;
+
+alloc_measure_vmem(&base_addr, &size);
+
 *alloc = (struct ggml_allocr){
-/*.data = */
-/*.size = */
+/*.data = */ base_addr,
+/*.size = */ size,
 /*.alignment = */ alignment,
 /*.n_free_blocks = */ 0,
 /*.free_blocks = */ {{0}},
@@ -297,7 +382,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 /*.parse_seq = */ {0},
 /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-/*.allocated_tensors = */
+/*.allocated_tensors = */ {0},
 #endif
 };
 
@@ -307,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+if (alloc->measure) {
+free_measure_vmem(alloc->data, alloc->size);
+}
 free(alloc);
 }
 
@@ -317,8 +405,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 //////////// compute graph allocator
 
 static bool ggml_is_view(struct ggml_tensor * t) {
-return t->
-t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+return t->view_src != NULL;
 }
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -336,28 +423,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
 return true;
 }
 
-static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-switch (t->op) {
-case GGML_OP_PERMUTE:
-case GGML_OP_RESHAPE:
-case GGML_OP_TRANSPOSE:
-case GGML_OP_VIEW:
-return t->src[0];
-case GGML_OP_CPY:
-return t->src[1];
-default:
-return NULL;
-}
-}
-
-static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-struct ggml_tensor * parent = t;
-do {
-parent = get_view_parent(parent);
-} while (ggml_is_view(parent));
-return parent;
-}
-
 static bool ggml_op_can_inplace(enum ggml_op op) {
 switch (op) {
 case GGML_OP_SCALE:
@@ -365,7 +430,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
 case GGML_OP_DIAG_MASK_INF:
 case GGML_OP_ADD:
 case GGML_OP_ADD1:
-case GGML_OP_ACC:
 case GGML_OP_SUB:
 case GGML_OP_MUL:
 case GGML_OP_DIV:
@@ -375,7 +439,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
 case GGML_OP_UNARY:
 case GGML_OP_ROPE:
 case GGML_OP_RMS_NORM:
-case GGML_OP_SET:
 case GGML_OP_SOFT_MAX:
 case GGML_OP_CONT:
 return true;
@@ -389,24 +452,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
 struct hash_node * ht = alloc->hash_table;
 if (node->data == NULL) {
 if (ggml_is_view(node)) {
-
-
-case GGML_OP_VIEW:
-memcpy(&offset, node->op_params, sizeof(size_t));
-node->data = (char *) node->src[0]->data + offset;
-break;
-case GGML_OP_PERMUTE:
-case GGML_OP_RESHAPE:
-case GGML_OP_TRANSPOSE:
-node->data = node->src[0]->data;
-break;
-case GGML_OP_CPY:
-node->data = node->src[1]->data;
-break;
-default:
-GGML_ASSERT(!"unknown view op");
-break;
-}
+assert(node->view_src->data != NULL);
+node->data = (char *)node->view_src->data + node->view_offs;
 } else {
 // see if we can reuse a parent's buffer (inplace)
 if (ggml_op_can_inplace(node->op)) {
@@ -417,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
 }
 
 // if the node's data is external, then we cannot re-use it
-if ((
-(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+if (ggml_allocr_is_own(alloc, parent) == false) {
 AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
 continue;
 }
@@ -426,7 +472,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
 struct hash_node * p_hn = hash_get(ht, parent);
 if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
 if (ggml_is_view(parent)) {
-struct ggml_tensor * view_src =
+struct ggml_tensor * view_src = parent->view_src;
 struct hash_node * view_src_hn = hash_get(ht, view_src);
 if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
 // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -452,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
 }
 }
 
-static size_t
+static size_t ggml_allocr_alloc_graph_tensors_n(
 struct ggml_allocr * alloc,
 struct ggml_cgraph ** graphs, int n_graphs,
 struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -468,7 +514,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 struct ggml_tensor * node = gf->nodes[i];
 
 if (ggml_is_view(node)) {
-struct ggml_tensor * view_src =
+struct ggml_tensor * view_src = node->view_src;
 hash_get(ht, view_src)->n_views += 1;
 }
 
@@ -530,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 AT_PRINTF("\n");
 }
 
-
 // update parents
 // update immediately if there is no parse_seq
 // update only at barriers if there is parse_seq
-if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
 int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
 int update_end = alloc->parse_seq_len ? ind : ind + 1;
 for (int i = update_start; i < update_end; i++) {
@@ -553,17 +598,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 
 if (p_hn->n_children == 0 && p_hn->n_views == 0) {
 if (ggml_is_view(parent)) {
-struct ggml_tensor * view_src =
+struct ggml_tensor * view_src = parent->view_src;
 struct hash_node * view_src_hn = hash_get(ht, view_src);
 view_src_hn->n_views -= 1;
-AT_PRINTF("view_src %s\n", view_src->name);
+AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-
+ggml_allocr_free_tensor(alloc, view_src);
 }
 }
 else {
 if (parent->data != node->data) {
-
+ggml_allocr_free_tensor(alloc, parent);
 }
 }
 }
@@ -580,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 for (int i = 0; outputs[g][i] != NULL; i++) {
 struct ggml_tensor * output = outputs[g][i];
 AT_PRINTF("output: %s\n", output->name);
-
+ggml_allocr_free_tensor(alloc, output);
 }
 }
 }
@@ -589,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-return
+return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }