llama_cpp 0.9.2 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +194 -8
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +826 -1482
- data/ext/llama_cpp/src/ggml.h +63 -45
- data/ext/llama_cpp/src/llama.cpp +364 -38
- data/ext/llama_cpp/src/llama.h +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
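The headline change in `data/ext/llama_cpp/src/ggml-alloc.c` is upstream llama.cpp's allocator refactor: the monolithic `ggml_allocr` is split into a tensor allocator (`ggml_tallocr_t`) that places tensors inside one backend buffer, and a graph allocator (`ggml_gallocr_t`) that walks a compute graph deciding allocation and reuse order, with the old `ggml_allocr_*` entry points kept as a thin legacy wrapper. As orientation before the diff, here is a hedged sketch of the classic measure-then-allocate flow using only the legacy entry points visible in this diff; `build_graph()` and `backend` are hypothetical stand-ins for caller code, not part of this package:

```c
// Hedged sketch of the measure-then-allocate flow via the legacy ggml_allocr_*
// API that survives this refactor as a wrapper. Illustrative only.
#include "ggml.h"
#include "ggml-alloc.h"

extern struct ggml_cgraph * build_graph(void); // hypothetical caller helper

void plan_and_allocate(struct ggml_backend * backend) {
    // pass 1: a measure allocator computes offsets without touching real memory
    ggml_allocr_t measure = ggml_allocr_new_measure_from_backend(backend);
    size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph());
    ggml_allocr_free(measure);

    // pass 2: allocate a real backend buffer of the measured size and place
    // the tensors of a freshly rebuilt graph inside it
    ggml_allocr_t alloc = ggml_allocr_new_from_backend(backend, mem_size);
    ggml_allocr_alloc_graph(alloc, build_graph());
    // note: ggml_allocr_free() here would also release the owned buffer;
    // a real caller keeps it alive while the graph runs
    ggml_allocr_free(alloc);
}
```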
--- data/ext/llama_cpp/src/ggml-alloc.c (0.9.2)
+++ data/ext/llama_cpp/src/ggml-alloc.c (0.9.3)
@@ -1,51 +1,21 @@
 #include "ggml-alloc.h"
-#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
 #include "ggml.h"
+#include "ggml-impl.h"
 #include <assert.h>
+#include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-
-#define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+#define MAX_FREE_BLOCKS 256
 
 //#define GGML_ALLOCATOR_DEBUG
 
-//#define AT_PRINTF printf
-#define AT_PRINTF(...)
-
-struct hash_node {
-    struct ggml_tensor * t;
-    int n_children;
-    int n_views;
-};
-
-static size_t hash(void * p) {
-    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
-}
-
-static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
-    size_t h = hash(t);
-
-    // linear probing
-    size_t i = h;
-    while (hash_table[i].t != NULL) {
-        if (hash_table[i].t == t) {
-            return &hash_table[i];
-        }
-        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
-        if (i == h) {
-            // hash table is full
-            GGML_ASSERT(false);
-        }
-    }
-
-    hash_table[i].t = t;
-    return &hash_table[i];
-}
+//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+#define AT_PRINTF(...)
 
 // TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@@ -59,20 +29,18 @@ struct free_block {
     size_t size;
 };
 
-
-
-struct ggml_allocr {
+struct ggml_tallocr {
     struct ggml_backend_buffer * buffer;
     bool buffer_owned;
-    void * data;
+    void * base;
     size_t alignment;
+
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
-    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
     size_t max_size;
+
     bool measure;
-    int parse_seq[GGML_MAX_CONCUR];
-    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -80,7 +48,7 @@ struct ggml_allocr {
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -89,7 +57,7 @@ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -103,7 +71,7 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 #endif
 
 // check if a tensor is allocated by this buffer
-static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
     return tensor->buffer == alloc->buffer;
 }
 
@@ -111,7 +79,7 @@ static bool ggml_is_view(struct ggml_tensor * t) {
     return t->view_src != NULL;
 }
 
-void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 
@@ -162,9 +130,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     }
 
     tensor->data = addr;
-    AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
     tensor->buffer = alloc->buffer;
-    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
+    if (!alloc->measure) {
+        ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
+    }
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -180,16 +149,16 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     }
 #endif
 
-    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
+    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    if (ggml_allocr_is_own(alloc, tensor) == false) {
+static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+    if (ggml_tallocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
-        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
+        // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
@@ -199,7 +168,9 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
 
-    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
+    if (!alloc->measure) {
+        ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
+    }
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -253,91 +224,180 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
     alloc->n_free_blocks++;
 }
 
-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
-    for (int i = 0; i < n; i++) {
-        alloc->parse_seq[i] = list[i];
-    }
-    alloc->parse_seq_len = n;
-}
-
-void ggml_allocr_reset(struct ggml_allocr * alloc) {
+void ggml_tallocr_reset(ggml_tallocr_t alloc) {
     alloc->n_free_blocks = 1;
-    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
-    alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+    size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
+    alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
+
+    if (alloc->measure) {
+        alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+    } else {
+        alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+    }
 }
 
-struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
     struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
 
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
+    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
-    *alloc = (struct ggml_allocr) {
+    *alloc = (struct ggml_tallocr) {
         /*.buffer = */ buffer,
         /*.buffer_owned = */ true,
         /*.base = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
-        /*.hash_table = */ {{0}},
         /*.max_size = */ 0,
         /*.measure = */ false,
-        /*.parse_seq = */ {0},
-        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {0},
 #endif
     };
 
-    ggml_allocr_reset(alloc);
+    ggml_tallocr_reset(alloc);
+
+    return alloc;
+}
+
+ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
+    ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
+    alloc->measure = true;
 
     return alloc;
 }
 
-struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, SIZE_MAX/2, alignment);
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
+
+    // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
+    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
+    alloc->buffer_owned = true;
     alloc->measure = true;
+    ggml_tallocr_reset(alloc);
+    return alloc;
+}
 
+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
+    alloc->buffer_owned = true;
     return alloc;
 }
 
-struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
+ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
-    *alloc = (struct ggml_allocr) {
+    *alloc = (struct ggml_tallocr) {
         /*.buffer = */ buffer,
         /*.buffer_owned = */ false,
         /*.base = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
-        /*.hash_table = */ {{0}},
         /*.max_size = */ 0,
         /*.measure = */ false,
-        /*.parse_seq = */ {0},
-        /*.parse_seq_len = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {0},
 #endif
     };
 
-    ggml_allocr_reset(alloc);
+    ggml_tallocr_reset(alloc);
 
     return alloc;
 }
 
-void ggml_allocr_free(struct ggml_allocr * alloc) {
+struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
+    return alloc->buffer;
+}
+
+void ggml_tallocr_free(ggml_tallocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     if (alloc->buffer_owned) {
         ggml_backend_buffer_free(alloc->buffer);
     }
     free(alloc);
 }
 
-bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
+bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
     return alloc->measure;
 }
 
-size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
+size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
+    return alloc->max_size;
+}
+
+// graph allocator
+
+struct hash_node {
+    int n_children;
+    int n_views;
+};
+
+struct ggml_gallocr {
+    ggml_tallocr_t talloc;
+    struct ggml_hash_set hash_set;
+    struct hash_node * hash_values;
+    size_t hash_values_size;
+    ggml_tallocr_t * hash_allocs;
+    int * parse_seq;
+    int parse_seq_len;
+};
+
+ggml_gallocr_t ggml_gallocr_new(void) {
+    ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
+
+    *galloc = (struct ggml_gallocr) {
+        /*.talloc = */ NULL,
+        /*.hash_set = */ {0},
+        /*.hash_values = */ NULL,
+        /*.hash_values_size = */ 0,
+        /*.hash_allocs = */ NULL,
+        /*.parse_seq = */ NULL,
+        /*.parse_seq_len = */ 0,
+    };
+
+    return galloc;
+}
+
+void ggml_gallocr_free(ggml_gallocr_t galloc) {
+    if (galloc == NULL) {
+        return;
+    }
+
+    if (galloc->hash_set.keys != NULL) {
+        free(galloc->hash_set.keys);
+    }
+    if (galloc->hash_values != NULL) {
+        free(galloc->hash_values);
+    }
+    if (galloc->hash_allocs != NULL) {
+        free(galloc->hash_allocs);
+    }
+    if (galloc->parse_seq != NULL) {
+        free(galloc->parse_seq);
+    }
+    free(galloc);
+}
+
+void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
+    free(galloc->parse_seq);
+    galloc->parse_seq = malloc(sizeof(int) * n);
+
+    for (int i = 0; i < n; i++) {
+        galloc->parse_seq[i] = list[i];
+    }
+    galloc->parse_seq_len = n;
+}
+
+static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    return &galloc->hash_values[i];
+}
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
@@ -378,27 +438,40 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view, bool update_backend) {
-    assert(view->view_src != NULL && view->view_src->data != NULL);
+static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+    if (galloc->talloc != NULL) {
+        return galloc->talloc;
+    }
+
+    return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
+}
+
+static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
+    ggml_tallocr_t alloc = node_tallocr(galloc, view);
 
+    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
+    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-
     view->buffer = view->view_src->buffer;
     view->data = (char *)view->view_src->data + view->view_offs;
 
     // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
     // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
-    ggml_backend_buffer_init_tensor(alloc->buffer, view);
+    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+
+    if (!alloc->measure) {
+        ggml_backend_buffer_init_tensor(alloc->buffer, view);
+    }
 }
 
-static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
-    struct hash_node * ht = alloc->hash_table;
+static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+    ggml_tallocr_t alloc = node_tallocr(galloc, node);
+
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            init_view(alloc, node, true);
+            init_view(galloc, node, true);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -409,16 +482,16 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                    }
 
                    // if the node's data is external, then we cannot re-use it
-                    if (ggml_allocr_is_own(alloc, parent) == false) {
+                    if (ggml_tallocr_is_own(alloc, parent) == false) {
                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                        continue;
                    }
 
-                    struct hash_node * p_hn = hash_get(ht, parent);
+                    struct hash_node * p_hn = hash_get(galloc, parent);
                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                        if (ggml_is_view(parent)) {
                            struct ggml_tensor * view_src = parent->view_src;
-                            struct hash_node * view_src_hn = hash_get(ht, view_src);
+                            struct hash_node * view_src_hn = hash_get(galloc, view_src);
                            if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                                // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
                                // the parent's data that it will need later (same layout requirement). the problem is that then
@@ -428,170 +501,267 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                                node->view_src = view_src;
                                view_src_hn->n_views += 1;
-                                init_view(alloc, node, false);
+                                init_view(galloc, node, false);
                                return;
                            }
                        } else {
                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                            node->view_src = parent;
                            p_hn->n_views += 1;
-                            init_view(alloc, node, false);
+                            init_view(galloc, node, false);
                            return;
                        }
                    }
                }
            }
-            ggml_allocr_alloc(alloc, node);
+            ggml_tallocr_alloc(alloc, node);
        }
    }
 }
 
-static size_t ggml_allocr_alloc_graph_tensors_n(
-    struct ggml_allocr * alloc,
-    struct ggml_cgraph ** graphs, int n_graphs,
-    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
+static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+    ggml_tallocr_t alloc = node_tallocr(galloc, node);
 
-    // reset hash table
-    struct hash_node * ht = alloc->hash_table;
-    memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
+    ggml_tallocr_free_tensor(alloc, node);
+}
+
+static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
+    const int * parse_seq = galloc->parse_seq;
+    int parse_seq_len = galloc->parse_seq_len;
 
     // count number of children and views
-    for (int g = 0; g < n_graphs; g++) {
-        struct ggml_cgraph * gf = graphs[g];
-        for (int i = 0; i < gf->n_nodes; i++) {
+    for (int i = 0; i < gf->n_nodes; i++) {
+        struct ggml_tensor * node = gf->nodes[i];
+
+        if (ggml_is_view(node)) {
+            struct ggml_tensor * view_src = node->view_src;
+            hash_get(galloc, view_src)->n_views += 1;
+            if (node->buffer == NULL && node->data != NULL) {
+                // view of a pre-allocated tensor, didn't call init_view() yet
+                init_view(galloc, node, true);
+            }
+        }
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                break;
+            }
+            hash_get(galloc, parent)->n_children += 1;
+            if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+                init_view(galloc, parent, true);
+            }
+        }
+    }
+
+    // allocate tensors
+    // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+    int last_barrier_pos = 0;
+    int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
+
+    for (int ind = 0; ind < n_nodes; ind++) {
+        // allocate a node if there is no parse_seq or this is not a barrier
+        if (parse_seq_len == 0 || parse_seq[ind] != -1) {
+            int i = parse_seq_len ? parse_seq[ind] : ind;
            struct ggml_tensor * node = gf->nodes[i];
 
-            if (ggml_is_view(node)) {
-                struct ggml_tensor * view_src = node->view_src;
-                hash_get(ht, view_src)->n_views += 1;
-                if (node->buffer == NULL && node->data != NULL) {
-                    // view of a pre-allocated tensor, didn't call init_view() yet
-                    init_view(alloc, node, true);
+            // allocate parents (leafs)
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
                }
+                allocate_node(galloc, parent);
            }
 
+            // allocate node
+            allocate_node(galloc, node);
+
+            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
-                hash_get(ht, parent)->n_children += 1;
-                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                    init_view(alloc, parent, true);
+                AT_PRINTF("%s", parent->name);
+                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                    AT_PRINTF(", ");
                }
            }
+            AT_PRINTF("\n");
        }
-    }
-
-    // allocate tensors
-    for (int g = 0; g < n_graphs; g++) {
-        struct ggml_cgraph * gf = graphs[g];
-        AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
-        // graph inputs are allocated first to ensure that they are not overwritten by each other
-        if (inputs != NULL && inputs[g] != NULL) {
-            for (int i = 0; inputs[g][i] != NULL; i++) {
-                struct ggml_tensor * input = inputs[g][i];
-                AT_PRINTF("input: %s\n", input->name);
-                allocate_node(alloc, input);
-            }
-        }
-        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
-        int last_barrier_pos = 0;
-        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
 
-        for (int ind = 0; ind < n_nodes; ind++) {
-            // allocate a node if there is no parse_seq or this is not a barrier
-            if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] != -1) {
-                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
-                struct ggml_tensor * node = gf->nodes[i];
+        // update parents
+        // update immediately if there is no parse_seq
+        // update only at barriers if there is parse_seq
+        if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
+            int update_start = parse_seq_len ? last_barrier_pos : ind;
+            int update_end = parse_seq_len ? ind : ind + 1;
+            for (int i = update_start; i < update_end; i++) {
+                int node_i = parse_seq_len ? parse_seq[i] : i;
+                struct ggml_tensor * node = gf->nodes[node_i];
 
-                // allocate parents (leafs)
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * parent = node->src[j];
                    if (parent == NULL) {
                        break;
                    }
-                    allocate_node(alloc, parent);
-                }
+                    struct hash_node * p_hn = hash_get(galloc, parent);
+                    p_hn->n_children -= 1;
 
-                // allocate node
-                allocate_node(alloc, node);
+                    //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
 
-                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-                for (int j = 0; j < GGML_MAX_SRC; j++) {
-                    struct ggml_tensor * parent = node->src[j];
-                    if (parent == NULL) {
-                        break;
-                    }
-                    AT_PRINTF("%s", parent->name);
-                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                        AT_PRINTF(", ");
-                    }
-                }
-                AT_PRINTF("\n");
-            }
-
-            // update parents
-            // update immediately if there is no parse_seq
-            // update only at barriers if there is parse_seq
-            if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
-                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
-                int update_end = alloc->parse_seq_len ? ind : ind + 1;
-                for (int i = update_start; i < update_end; i++) {
-                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
-                    struct ggml_tensor * node = gf->nodes[node_i];
-
-                    for (int j = 0; j < GGML_MAX_SRC; j++) {
-                        struct ggml_tensor * parent = node->src[j];
-                        if (parent == NULL) {
-                            break;
-                        }
-                        struct hash_node * p_hn = hash_get(ht, parent);
-                        p_hn->n_children -= 1;
-
-                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                            if (ggml_is_view(parent)) {
-                                struct ggml_tensor * view_src = parent->view_src;
-                                struct hash_node * view_src_hn = hash_get(ht, view_src);
-                                view_src_hn->n_views -= 1;
-                                AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
-                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                                    ggml_allocr_free_tensor(alloc, view_src);
-                                }
-                            }
-                            else {
-                                if (parent->data != node->data) {
-                                    ggml_allocr_free_tensor(alloc, parent);
-                                }
+                    if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                        if (ggml_is_view(parent)) {
+                            struct ggml_tensor * view_src = parent->view_src;
+                            struct hash_node * view_src_hn = hash_get(galloc, view_src);
+                            view_src_hn->n_views -= 1;
+                            AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                            if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
+                                free_node(galloc, view_src);
                            }
                        }
+                        else {
+                            free_node(galloc, parent);
+                        }
                    }
                }
-                AT_PRINTF("\n");
-                if (alloc->parse_seq_len) {
-                    last_barrier_pos = ind + 1;
-                }
            }
-        }
-
-        if (outputs != NULL && outputs[g] != NULL) {
-            for (int i = 0; outputs[g][i] != NULL; i++) {
-                struct ggml_tensor * output = outputs[g][i];
-                AT_PRINTF("output: %s\n", output->name);
-                ggml_allocr_free_tensor(alloc, output);
+            AT_PRINTF("\n");
+            if (parse_seq_len) {
+                last_barrier_pos = ind + 1;
            }
        }
    }
+}
 
-    return alloc->max_size;
+size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
+    size_t hash_size = graph->visited_hash_table.size;
+
+    // check if the hash table is initialized and large enough
+    if (galloc->hash_set.size < hash_size) {
+        if (galloc->hash_set.keys != NULL) {
+            free(galloc->hash_set.keys);
+        }
+        if (galloc->hash_values != NULL) {
+            free(galloc->hash_values);
+        }
+        galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+        galloc->hash_set.size = hash_size;
+        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+    }
+
+    // reset hash table
+    memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+
+    galloc->talloc = talloc;
+    ggml_tallocr_alloc_graph_impl(galloc, graph);
+    galloc->talloc = NULL;
+
+    size_t max_size = ggml_tallocr_max_size(talloc);
+
+    return max_size;
 }
 
-size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
+    const size_t hash_size = hash_set.size;
+
+    GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
+
+    galloc->talloc = NULL;
+
+    // alloc hash_values if needed
+    if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+        galloc->hash_values_size = hash_size;
+    }
+
+    // free hash_set.keys if needed
+    if (galloc->hash_set.keys != NULL) {
+        free(galloc->hash_set.keys);
+    }
+    galloc->hash_set = hash_set;
+
+    // reset hash values
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+
+    galloc->hash_allocs = hash_node_talloc;
+
+    ggml_tallocr_alloc_graph_impl(galloc, graph);
+
+    // remove unowned resources
+    galloc->hash_set.keys = NULL;
+    galloc->hash_allocs = NULL;
 }
 
-size_t ggml_allocr_alloc_graph_n(struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, graphs, n_graphs, inputs, outputs);
+// legacy API wrapper
+
+struct ggml_allocr {
+    ggml_tallocr_t talloc;
+    ggml_gallocr_t galloc;
+};
+
+static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
+    ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
+    *alloc = (struct ggml_allocr) {
+        /*.talloc = */ talloc,
+        /*.galloc = */ ggml_gallocr_new(),
+    };
+    return alloc;
+}
+
+ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
+    return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
+}
+
+ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
+    return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
+}
+
+ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
+}
+
+ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
+}
+
+ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
+}
+
+struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
+    return ggml_tallocr_get_buffer(alloc->talloc);
+}
+
+void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
+    ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
+}
+
+void ggml_allocr_free(ggml_allocr_t alloc) {
+    ggml_gallocr_free(alloc->galloc);
+    ggml_tallocr_free(alloc->talloc);
+    free(alloc);
+}
+
+bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
+    return ggml_tallocr_is_measure(alloc->talloc);
+}
+
+void ggml_allocr_reset(ggml_allocr_t alloc) {
+    ggml_tallocr_reset(alloc->talloc);
+}
+
+void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
+    ggml_tallocr_alloc(alloc->talloc, tensor);
+}
+
+size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
+    return ggml_tallocr_max_size(alloc->talloc);
+}
+
+size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
+    return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 }