gpt_neox_client 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,594 @@
+ #include "ggml-alloc.h"
+ #include "ggml.h"
+ #include <assert.h>
+ #include <stdarg.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ #define UNUSED(x) (void)(x)
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
+ //#define GGML_ALLOCATOR_DEBUG
+
+ //#define AT_PRINTF printf
+ #define AT_PRINTF(...) ((void)0)
+
+ struct hash_node {
+     struct ggml_tensor * t;
+     int n_children;
+     int n_views;
+ };
+
+ static size_t hash(void * p) {
+     return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+ }
+
+ static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
+     size_t h = hash(t);
+
+     // linear probing
+     size_t i = h;
+     while (hash_table[i].t != NULL) {
+         if (hash_table[i].t == t) {
+             return &hash_table[i];
+         }
+         i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+         if (i == h) {
+             // hash table is full
+             GGML_ASSERT(false);
+         }
+     }
+
+     hash_table[i].t = t;
+     return &hash_table[i];
+ }
+
+ // TODO: GGML_PAD ?
+ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
+     assert(alignment && !(alignment & (alignment - 1))); // power of 2
+     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
+     return offset + align;
+ }
+
+ struct free_block {
+     void * addr;
+     size_t size;
+ };
+
+ #define MAX_FREE_BLOCKS 128
+
+ struct ggml_allocr {
+     void * data;
+     size_t size;
+     size_t alignment;
+     int n_free_blocks;
+     struct free_block free_blocks[MAX_FREE_BLOCKS];
+     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+     size_t max_size;
+     bool measure;
+     int parse_seq[GGML_MAX_CONCUR];
+     int parse_seq_len;
+
+ #ifdef GGML_ALLOCATOR_DEBUG
+     struct ggml_tensor * allocated_tensors[1024];
+ #endif
+ };
+
+ #ifdef GGML_ALLOCATOR_DEBUG
+ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     for (int i = 0; i < 1024; i++) {
+         if (alloc->allocated_tensors[i] == NULL) {
+             alloc->allocated_tensors[i] = tensor;
+             return;
+         }
+     }
+     GGML_ASSERT(!"out of allocated_tensors");
+ }
+ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     for (int i = 0; i < 1024; i++) {
+         if (alloc->allocated_tensors[i] == tensor ||
+             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
+             alloc->allocated_tensors[i] = NULL;
+             return;
+         }
+     }
+     printf("tried to free tensor %s not found\n", tensor->name);
+     GGML_ASSERT(!"tensor not found");
+ }
+ #endif
+
+
+ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     return ggml_nbytes(tensor);
+
+     UNUSED(alloc);
+ }
+
+ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+     size = aligned_offset(NULL, size, alloc->alignment);
+
+     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+
+     size_t max_avail = 0;
+
+     // find the best fitting free block besides the last block
+     int best_fit_block = -1;
+     size_t best_fit_size = SIZE_MAX;
+     for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
+         struct free_block * block = &alloc->free_blocks[i];
+         max_avail = MAX(max_avail, block->size);
+         if (block->size >= size && block->size <= best_fit_size) {
+             best_fit_block = i;
+             best_fit_size = block->size;
+         }
+     }
+
+     AT_PRINTF("block %d\n", best_fit_block);
+
+     if (best_fit_block == -1) {
+         // the last block is our last resort
+         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+         if (block->size >= size) {
+             best_fit_block = alloc->n_free_blocks - 1;
+             max_avail = MAX(max_avail, block->size);
+         } else {
+             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                     __func__, size, max_avail);
+             GGML_ASSERT(!"not enough space in the buffer");
+             return;
+         }
+     }
+     struct free_block * block = &alloc->free_blocks[best_fit_block];
+     void * addr = block->addr;
+     block->addr = (char*)block->addr + size;
+     block->size -= size;
+     if (block->size == 0) {
+         // remove block if empty
+         alloc->n_free_blocks--;
+         for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
+             alloc->free_blocks[j] = alloc->free_blocks[j+1];
+         }
+     }
+
+     tensor->data = addr;
+
+ #ifdef GGML_ALLOCATOR_DEBUG
+     add_allocated_tensor(alloc, tensor);
+     size_t cur_max = (char*)addr - (char*)alloc->data + size;
+     if (cur_max > alloc->max_size) {
+         printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+         for (int i = 0; i < 1024; i++) {
+             if (alloc->allocated_tensors[i]) {
+                 printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+             }
+         }
+         printf("\n");
+     }
+ #endif
+
+     alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
+ }
+
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
+ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     void * ptr = tensor->data;
+
+     if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+         // the tensor was not allocated in this buffer
+         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
+         // the easiest way to deal with this is just to ignore it
+         return;
+     }
+
+     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+     size = aligned_offset(NULL, size, alloc->alignment);
+     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
+
+ #ifdef GGML_ALLOCATOR_DEBUG
+     remove_allocated_tensor(alloc, tensor);
+ #endif
+
+     // see if we can merge with an existing block
+     for (int i = 0; i < alloc->n_free_blocks; i++) {
+         struct free_block * block = &alloc->free_blocks[i];
+         // check if ptr is at the end of the block
+         if ((char*)block->addr + block->size == ptr) {
+             block->size += size;
+             // check if we can merge with the next block
+             if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+                 block->size += alloc->free_blocks[i+1].size;
+                 alloc->n_free_blocks--;
+                 for (int j = i+1; j < alloc->n_free_blocks; j++) {
+                     alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                 }
+             }
+             return;
+         }
+         // check if ptr is at the beginning of the block
+         if ((char*)ptr + size == block->addr) {
+             block->addr = ptr;
+             block->size += size;
+             // check if we can merge with the previous block
+             if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+                 alloc->free_blocks[i-1].size += block->size;
+                 alloc->n_free_blocks--;
+                 for (int j = i; j < alloc->n_free_blocks; j++) {
+                     alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                 }
+             }
+             return;
+         }
+     }
+     // otherwise, add a new block
+     GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+     int insert_pos = 0;
+     while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+         insert_pos++;
+     }
+     // shift all blocks from insert_pos onward to make room for the new block
+     for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+         alloc->free_blocks[i] = alloc->free_blocks[i-1];
+     }
+     // insert the new block
+     alloc->free_blocks[insert_pos].addr = ptr;
+     alloc->free_blocks[insert_pos].size = size;
+     alloc->n_free_blocks++;
+ }
+
+ void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
+     for (int i = 0; i < n; i++) {
+         alloc->parse_seq[i] = list[i];
+     }
+     alloc->parse_seq_len = n;
+ }
+
+ void ggml_allocr_reset(struct ggml_allocr * alloc) {
+     alloc->n_free_blocks = 1;
+     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
+     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
+     alloc->free_blocks[0].size = alloc->size - align_offset;
+ }
+
+ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+     *alloc = (struct ggml_allocr){
+         /*.data = */ data,
+         /*.size = */ size,
+         /*.alignment = */ alignment,
+         /*.n_free_blocks = */ 0,
+         /*.free_blocks = */ {{0}},
+         /*.hash_table = */ {{0}},
+         /*.max_size = */ 0,
+         /*.measure = */ false,
+         /*.parse_seq = */ {0},
+         /*.parse_seq_len = */ 0,
+ #ifdef GGML_ALLOCATOR_DEBUG
+         /*.allocated_tensors = */ {0},
+ #endif
+     };
+
+     ggml_allocr_reset(alloc);
+
+     return alloc;
+ }
+
+ // address and size of the buffer when measuring
+ // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
+ static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
+ static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+
+ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
+     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+     *alloc = (struct ggml_allocr){
+         /*.data = */ MEASURE_BASE_ADDR,
+         /*.size = */ MEASURE_MAX_SIZE,
+         /*.alignment = */ alignment,
+         /*.n_free_blocks = */ 0,
+         /*.free_blocks = */ {{0}},
+         /*.hash_table = */ {{0}},
+         /*.max_size = */ 0,
+         /*.measure = */ true,
+         /*.parse_seq = */ {0},
+         /*.parse_seq_len = */ 0,
+ #ifdef GGML_ALLOCATOR_DEBUG
+         /*.allocated_tensors = */ {0},
+ #endif
+     };
+
+     ggml_allocr_reset(alloc);
+
+     return alloc;
+ }
+
+ void ggml_allocr_free(struct ggml_allocr * alloc) {
+     free(alloc);
+ }
+
+ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
+     return alloc->measure;
+ }
+
+ //////////// compute graph allocator
+
+ static bool ggml_is_view(struct ggml_tensor * t) {
+     return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
+            t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+ }
+
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+     if (a->type != b->type) {
+         return false;
+     }
+     for (int i = 0; i < GGML_MAX_DIMS; i++) {
+         if (a->ne[i] != b->ne[i]) {
+             return false;
+         }
+         if (a->nb[i] != b->nb[i]) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
+     switch (t->op) {
+         case GGML_OP_PERMUTE:
+         case GGML_OP_RESHAPE:
+         case GGML_OP_TRANSPOSE:
+         case GGML_OP_VIEW:
+             return t->src[0];
+         case GGML_OP_CPY:
+             return t->src[1];
+         default:
+             return NULL;
+     }
+ }
+
+ static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
+     struct ggml_tensor * parent = t;
+     do {
+         parent = get_view_parent(parent);
+     } while (ggml_is_view(parent));
+     return parent;
+ }
+
+ static bool ggml_op_can_inplace(enum ggml_op op) {
+     switch (op) {
+         case GGML_OP_SCALE:
+         case GGML_OP_DIAG_MASK_ZERO:
+         case GGML_OP_DIAG_MASK_INF:
+         case GGML_OP_ADD:
+         case GGML_OP_ADD1:
+         case GGML_OP_ACC:
+         case GGML_OP_SUB:
+         case GGML_OP_MUL:
+         case GGML_OP_DIV:
+         case GGML_OP_SQR:
+         case GGML_OP_SQRT:
+         case GGML_OP_LOG:
+         case GGML_OP_UNARY:
+         case GGML_OP_ROPE:
+         case GGML_OP_RMS_NORM:
+         case GGML_OP_SET:
+         case GGML_OP_SOFT_MAX:
+         case GGML_OP_CONT:
+         case GGML_OP_ADD_REL_POS:
+             return true;
+
+         default:
+             return false;
+     }
+ }
+
+ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
+     struct hash_node * ht = alloc->hash_table;
+     if (node->data == NULL) {
+         if (ggml_is_view(node)) {
+             size_t offset;
+             switch(node->op) {
+                 case GGML_OP_VIEW:
+                     memcpy(&offset, node->op_params, sizeof(size_t));
+                     node->data = (char *) node->src[0]->data + offset;
+                     break;
+                 case GGML_OP_PERMUTE:
+                 case GGML_OP_RESHAPE:
+                 case GGML_OP_TRANSPOSE:
+                     node->data = node->src[0]->data;
+                     break;
+                 case GGML_OP_CPY:
+                     node->data = node->src[1]->data;
+                     break;
+                 default:
+                     GGML_ASSERT(!"unknown view op");
+                     break;
+             }
+         } else {
+             // see if we can reuse a parent's buffer (inplace)
+             if (ggml_op_can_inplace(node->op)) {
+                 for (int i = 0; i < GGML_MAX_SRC; i++) {
+                     struct ggml_tensor * parent = node->src[i];
+                     if (parent == NULL) {
+                         break;
+                     }
+
+                     // if the node's data is external, then we cannot re-use it
+                     if ((char *) parent->data < (char *) alloc->data ||
+                         (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                         AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                         continue;
+                     }
+
+                     struct hash_node * p_hn = hash_get(ht, parent);
+                     if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
+                         if (ggml_is_view(parent)) {
+                             struct ggml_tensor * view_src = get_view_source(parent);
+                             struct hash_node * view_src_hn = hash_get(ht, view_src);
+                             if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                                 // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
+                                 // the parent's data that it will need later (same layout requirement). the problem is that then
+                                 // we cannot free the tensor because the original address of the allocation is lost.
+                                 // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
+                                 // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
+                                 AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                                 node->data = parent->data;
+                                 return;
+                             }
+                         }
+                         else {
+                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                             node->data = parent->data;
+                             return;
+                         }
+                     }
+                 }
+             }
+             ggml_allocr_alloc(alloc, node);
+         }
+     }
+ }
+
+ static size_t ggml_allocator_alloc_graph_tensors_n(
+     struct ggml_allocr * alloc,
+     struct ggml_cgraph ** graphs, int n_graphs,
+     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
+
+     // reset hash table
+     struct hash_node * ht = alloc->hash_table;
+     memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
+
+     // count number of children and views
+     for (int g = 0; g < n_graphs; g++) {
+         struct ggml_cgraph * gf = graphs[g];
+         for (int i = 0; i < gf->n_nodes; i++) {
+             struct ggml_tensor * node = gf->nodes[i];
+
+             if (ggml_is_view(node)) {
+                 struct ggml_tensor * view_src = get_view_source(node);
+                 hash_get(ht, view_src)->n_views += 1;
+             }
+
+             for (int j = 0; j < GGML_MAX_SRC; j++) {
+                 struct ggml_tensor * parent = node->src[j];
+                 if (parent == NULL) {
+                     break;
+                 }
+                 hash_get(ht, parent)->n_children += 1;
+             }
+         }
+     }
+
+     // allocate tensors
+     for (int g = 0; g < n_graphs; g++) {
+         struct ggml_cgraph * gf = graphs[g];
+         AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
+         // graph inputs are allocated first to ensure that they are not overwritten by each other
+         if (inputs != NULL && inputs[g] != NULL) {
+             for (int i = 0; inputs[g][i] != NULL; i++) {
+                 struct ggml_tensor * input = inputs[g][i];
+                 AT_PRINTF("input: %s\n", input->name);
+                 allocate_node(alloc, input);
+             }
+         }
+         // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+         int last_barrier_pos = 0;
+         int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+         for (int ind = 0; ind < n_nodes; ind++) {
+             // allocate a node if there is no parse_seq or this is not a barrier
+             if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                 int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                 struct ggml_tensor * node = gf->nodes[i];
+
+                 // allocate parents (leafs)
+                 for (int j = 0; j < GGML_MAX_SRC; j++) {
+                     struct ggml_tensor * parent = node->src[j];
+                     if (parent == NULL) {
+                         break;
+                     }
+                     allocate_node(alloc, parent);
+                 }
+
+                 // allocate node
+                 allocate_node(alloc, node);
+
+                 AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                 for (int j = 0; j < GGML_MAX_SRC; j++) {
+                     struct ggml_tensor * parent = node->src[j];
+                     if (parent == NULL) {
+                         break;
+                     }
+                     AT_PRINTF("%s", parent->name);
+                     if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                         AT_PRINTF(", ");
+                     }
+                 }
+                 AT_PRINTF("\n");
+             }
+
+
+             // update parents
+             // update immediately if there is no parse_seq
+             // update only at barriers if there is parse_seq
+             if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                 int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                 int update_end = alloc->parse_seq_len ? ind : ind + 1;
+                 for (int i = update_start; i < update_end; i++) {
+                     int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                     struct ggml_tensor * node = gf->nodes[node_i];
+
+                     for (int j = 0; j < GGML_MAX_SRC; j++) {
+                         struct ggml_tensor * parent = node->src[j];
+                         if (parent == NULL) {
+                             break;
+                         }
+                         struct hash_node * p_hn = hash_get(ht, parent);
+                         p_hn->n_children -= 1;
+
+                         //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                         if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                             if (ggml_is_view(parent)) {
+                                 struct ggml_tensor * view_src = get_view_source(parent);
+                                 struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                 view_src_hn->n_views -= 1;
+                                 AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                     ggml_allocator_free_tensor(alloc, view_src);
+                                 }
+                             }
+                             else {
+                                 if (parent->data != node->data) {
+                                     ggml_allocator_free_tensor(alloc, parent);
+                                 }
+                             }
+                         }
+                     }
+                 }
+                 AT_PRINTF("\n");
+                 if (alloc->parse_seq_len) {
+                     last_barrier_pos = ind + 1;
+                 }
+             }
+         }
+         // free graph outputs here that wouldn't be freed otherwise because they have no children
+         if (outputs != NULL && outputs[g] != NULL) {
+             for (int i = 0; outputs[g][i] != NULL; i++) {
+                 struct ggml_tensor * output = outputs[g][i];
+                 AT_PRINTF("output: %s\n", output->name);
+                 ggml_allocator_free_tensor(alloc, output);
+             }
+         }
+     }
+
+     return alloc->max_size;
+ }
+
+ size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
+     return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ }
@@ -0,0 +1,26 @@
+ #pragma once
+
+ #include "ggml.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+
+ GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
+ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+
+ // tell the allocator to parse nodes following the order described in the list
+ // you should call this if your graph is optimized to execute out-of-order
+ GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
+
+ GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
+ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
+ GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
+ GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+
+
+ #ifdef __cplusplus
+ }
+ #endif
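
Usage note (editorial, not part of the package diff above): the header exposes a measure-then-allocate workflow. A graph is first "allocated" against the fake 1 TB measure buffer via ggml_allocr_new_measure to learn the peak memory it needs, then the graph is rebuilt and placed into a real buffer of that size with ggml_allocr_new and ggml_allocr_alloc_graph. The sketch below illustrates that pattern; the context size, tensor shapes, 32-byte alignment, extra alignment headroom, and the build_graph helper are illustrative assumptions, not values taken from this package.

    #include "ggml.h"
    #include "ggml-alloc.h"

    #include <stdio.h>
    #include <stdlib.h>

    // Build a tiny example graph; with a no_alloc context the tensors carry metadata
    // only and their data pointers stay NULL for the allocator to fill in.
    static struct ggml_cgraph build_graph(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        return ggml_build_forward(ggml_add(ctx, a, b));
    }

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   = */ 16*1024*1024, // metadata pool, illustrative size
            /*.mem_buffer = */ NULL,
            /*.no_alloc   = */ true,         // do not allocate tensor data in the context
        };

        // 1) measure pass: the allocator pretends to place the graph in the fake buffer
        //    and ggml_allocr_alloc_graph returns the peak memory required (max_size)
        struct ggml_context * ctx0 = ggml_init(params);
        struct ggml_cgraph gf0 = build_graph(ctx0);
        struct ggml_allocr * measure = ggml_allocr_new_measure(/*alignment =*/ 32);
        size_t mem_size = ggml_allocr_alloc_graph(measure, &gf0) + 32; // headroom for alignment
        ggml_allocr_free(measure);
        ggml_free(ctx0);

        // 2) real pass: rebuild the graph (the measure pass left fake data pointers behind)
        //    and place every tensor inside a buffer of the measured size
        void * buf = malloc(mem_size);
        struct ggml_context * ctx1 = ggml_init(params);
        struct ggml_cgraph gf1 = build_graph(ctx1);
        struct ggml_allocr * alloc = ggml_allocr_new(buf, mem_size, /*alignment =*/ 32);
        ggml_allocr_alloc_graph(alloc, &gf1); // sets tensor->data for every node and leaf

        printf("graph tensors fit in %zu bytes\n", mem_size);

        ggml_allocr_free(alloc);
        ggml_free(ctx1);
        free(buf);
        return 0;
    }

Rebuilding the graph for the second pass matters because allocate_node only places tensors whose data pointer is still NULL. ggml_allocr_set_parse_seq is only needed when nodes execute out of graph order: the list holds node indices, with -1 entries acting as the barriers at which parents are freed.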