gpt_neox_client 0.1.0

@@ -0,0 +1,594 @@
+ #include "ggml-alloc.h"
+ #include "ggml.h"
+ #include <assert.h>
+ #include <stdarg.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ #define UNUSED(x) (void)(x)
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
+ //#define GGML_ALLOCATOR_DEBUG
+
+ //#define AT_PRINTF printf
+ #define AT_PRINTF(...) ((void)0)
+
+ struct hash_node {
+     struct ggml_tensor * t;
+     int n_children;
+     int n_views;
+ };
+
+ static size_t hash(void * p) {
+     return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+ }
+
+ static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
+     size_t h = hash(t);
+
+     // linear probing
+     size_t i = h;
+     while (hash_table[i].t != NULL) {
+         if (hash_table[i].t == t) {
+             return &hash_table[i];
+         }
+         i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+         if (i == h) {
+             // hash table is full
+             GGML_ASSERT(false);
+         }
+     }
+
+     hash_table[i].t = t;
+     return &hash_table[i];
+ }
+
+ // TODO: GGML_PAD ?
+ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
+     assert(alignment && !(alignment & (alignment - 1))); // power of 2
+     size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
+     return offset + align;
+ }
+
+ struct free_block {
+     void * addr;
+     size_t size;
+ };
+
+ #define MAX_FREE_BLOCKS 128
+
+ struct ggml_allocr {
+     void * data;
+     size_t size;
+     size_t alignment;
+     int n_free_blocks;
+     struct free_block free_blocks[MAX_FREE_BLOCKS];
+     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+     size_t max_size;
+     bool measure;
+     int parse_seq[GGML_MAX_CONCUR];
+     int parse_seq_len;
+
+ #ifdef GGML_ALLOCATOR_DEBUG
+     struct ggml_tensor * allocated_tensors[1024];
+ #endif
+ };
+
+ #ifdef GGML_ALLOCATOR_DEBUG
+ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     for (int i = 0; i < 1024; i++) {
+         if (alloc->allocated_tensors[i] == NULL) {
+             alloc->allocated_tensors[i] = tensor;
+             return;
+         }
+     }
+     GGML_ASSERT(!"out of allocated_tensors");
+ }
+ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     for (int i = 0; i < 1024; i++) {
+         if (alloc->allocated_tensors[i] == tensor ||
+             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
+             alloc->allocated_tensors[i] = NULL;
+             return;
+         }
+     }
+     printf("tried to free tensor %s not found\n", tensor->name);
+     GGML_ASSERT(!"tensor not found");
+ }
+ #endif
+
+
+ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     return ggml_nbytes(tensor);
+
+     UNUSED(alloc);
+ }
+
+ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+     size = aligned_offset(NULL, size, alloc->alignment);
+
+     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+
+     size_t max_avail = 0;
+
+     // find the best fitting free block besides the last block
+     int best_fit_block = -1;
+     size_t best_fit_size = SIZE_MAX;
+     for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
+         struct free_block * block = &alloc->free_blocks[i];
+         max_avail = MAX(max_avail, block->size);
+         if (block->size >= size && block->size <= best_fit_size) {
+             best_fit_block = i;
+             best_fit_size = block->size;
+         }
+     }
+
+     AT_PRINTF("block %d\n", best_fit_block);
+
+     if (best_fit_block == -1) {
+         // the last block is our last resort
+         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+         if (block->size >= size) {
+             best_fit_block = alloc->n_free_blocks - 1;
+             max_avail = MAX(max_avail, block->size);
+         } else {
+             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                     __func__, size, max_avail);
+             GGML_ASSERT(!"not enough space in the buffer");
+             return;
+         }
+     }
+     struct free_block * block = &alloc->free_blocks[best_fit_block];
+     void * addr = block->addr;
+     block->addr = (char*)block->addr + size;
+     block->size -= size;
+     if (block->size == 0) {
+         // remove block if empty
+         alloc->n_free_blocks--;
+         for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
+             alloc->free_blocks[j] = alloc->free_blocks[j+1];
+         }
+     }
+
+     tensor->data = addr;
+
+ #ifdef GGML_ALLOCATOR_DEBUG
+     add_allocated_tensor(alloc, tensor);
+     size_t cur_max = (char*)addr - (char*)alloc->data + size;
+     if (cur_max > alloc->max_size) {
+         printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+         for (int i = 0; i < 1024; i++) {
+             if (alloc->allocated_tensors[i]) {
+                 printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+             }
+         }
+         printf("\n");
+     }
+ #endif
+
+     alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
+ }
+
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
+ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+     void * ptr = tensor->data;
+
+     if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+         // the tensor was not allocated in this buffer
+         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
+         // the easiest way to deal with this is just to ignore it
+         return;
+     }
+
+     size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+     size = aligned_offset(NULL, size, alloc->alignment);
+     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
+
+ #ifdef GGML_ALLOCATOR_DEBUG
+     remove_allocated_tensor(alloc, tensor);
+ #endif
+
+     // see if we can merge with an existing block
+     for (int i = 0; i < alloc->n_free_blocks; i++) {
+         struct free_block * block = &alloc->free_blocks[i];
+         // check if ptr is at the end of the block
+         if ((char*)block->addr + block->size == ptr) {
+             block->size += size;
+             // check if we can merge with the next block
+             if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+                 block->size += alloc->free_blocks[i+1].size;
+                 alloc->n_free_blocks--;
+                 for (int j = i+1; j < alloc->n_free_blocks; j++) {
+                     alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                 }
+             }
+             return;
+         }
+         // check if ptr is at the beginning of the block
+         if ((char*)ptr + size == block->addr) {
+             block->addr = ptr;
+             block->size += size;
+             // check if we can merge with the previous block
+             if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+                 alloc->free_blocks[i-1].size += block->size;
+                 alloc->n_free_blocks--;
+                 for (int j = i; j < alloc->n_free_blocks; j++) {
+                     alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                 }
+             }
+             return;
+         }
+     }
+     // otherwise, add a new block
+     GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+     int insert_pos = 0;
+     while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+         insert_pos++;
+     }
+     // shift all blocks from insert_pos onward to make room for the new block
+     for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+         alloc->free_blocks[i] = alloc->free_blocks[i-1];
+     }
+     // insert the new block
+     alloc->free_blocks[insert_pos].addr = ptr;
+     alloc->free_blocks[insert_pos].size = size;
+     alloc->n_free_blocks++;
+ }
+
+ void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
+     for (int i = 0; i < n; i++) {
+         alloc->parse_seq[i] = list[i];
+     }
+     alloc->parse_seq_len = n;
+ }
+
+ void ggml_allocr_reset(struct ggml_allocr * alloc) {
+     alloc->n_free_blocks = 1;
+     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
+     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
+     alloc->free_blocks[0].size = alloc->size - align_offset;
+ }
+
+ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+     *alloc = (struct ggml_allocr){
+         /*.data = */ data,
+         /*.size = */ size,
+         /*.alignment = */ alignment,
+         /*.n_free_blocks = */ 0,
+         /*.free_blocks = */ {{0}},
+         /*.hash_table = */ {{0}},
+         /*.max_size = */ 0,
+         /*.measure = */ false,
+         /*.parse_seq = */ {0},
+         /*.parse_seq_len = */ 0,
+ #ifdef GGML_ALLOCATOR_DEBUG
+         /*.allocated_tensors = */ {0},
+ #endif
+     };
+
+     ggml_allocr_reset(alloc);
+
+     return alloc;
+ }
+
+ // address and size of the buffer when measuring
+ // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
+ static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
+ static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+
+ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
+     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+     *alloc = (struct ggml_allocr){
+         /*.data = */ MEASURE_BASE_ADDR,
+         /*.size = */ MEASURE_MAX_SIZE,
+         /*.alignment = */ alignment,
+         /*.n_free_blocks = */ 0,
+         /*.free_blocks = */ {{0}},
+         /*.hash_table = */ {{0}},
+         /*.max_size = */ 0,
+         /*.measure = */ true,
+         /*.parse_seq = */ {0},
+         /*.parse_seq_len = */ 0,
+ #ifdef GGML_ALLOCATOR_DEBUG
+         /*.allocated_tensors = */ {0},
+ #endif
+     };
+
+     ggml_allocr_reset(alloc);
+
+     return alloc;
+ }
+
+ void ggml_allocr_free(struct ggml_allocr * alloc) {
+     free(alloc);
+ }
+
+ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
+     return alloc->measure;
+ }
+
+ //////////// compute graph allocator
+
+ static bool ggml_is_view(struct ggml_tensor * t) {
+     return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
+            t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+ }
+
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+     if (a->type != b->type) {
+         return false;
+     }
+     for (int i = 0; i < GGML_MAX_DIMS; i++) {
+         if (a->ne[i] != b->ne[i]) {
+             return false;
+         }
+         if (a->nb[i] != b->nb[i]) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
+     switch (t->op) {
+         case GGML_OP_PERMUTE:
+         case GGML_OP_RESHAPE:
+         case GGML_OP_TRANSPOSE:
+         case GGML_OP_VIEW:
+             return t->src[0];
+         case GGML_OP_CPY:
+             return t->src[1];
+         default:
+             return NULL;
+     }
+ }
+
+ static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
+     struct ggml_tensor * parent = t;
+     do {
+         parent = get_view_parent(parent);
+     } while (ggml_is_view(parent));
+     return parent;
+ }
+
+ static bool ggml_op_can_inplace(enum ggml_op op) {
+     switch (op) {
+         case GGML_OP_SCALE:
+         case GGML_OP_DIAG_MASK_ZERO:
+         case GGML_OP_DIAG_MASK_INF:
+         case GGML_OP_ADD:
+         case GGML_OP_ADD1:
+         case GGML_OP_ACC:
+         case GGML_OP_SUB:
+         case GGML_OP_MUL:
+         case GGML_OP_DIV:
+         case GGML_OP_SQR:
+         case GGML_OP_SQRT:
+         case GGML_OP_LOG:
+         case GGML_OP_UNARY:
+         case GGML_OP_ROPE:
+         case GGML_OP_RMS_NORM:
+         case GGML_OP_SET:
+         case GGML_OP_SOFT_MAX:
+         case GGML_OP_CONT:
+         case GGML_OP_ADD_REL_POS:
+             return true;
+
+         default:
+             return false;
+     }
+ }
+
+ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
+     struct hash_node * ht = alloc->hash_table;
+     if (node->data == NULL) {
+         if (ggml_is_view(node)) {
+             size_t offset;
+             switch(node->op) {
+                 case GGML_OP_VIEW:
+                     memcpy(&offset, node->op_params, sizeof(size_t));
+                     node->data = (char *) node->src[0]->data + offset;
+                     break;
+                 case GGML_OP_PERMUTE:
+                 case GGML_OP_RESHAPE:
+                 case GGML_OP_TRANSPOSE:
+                     node->data = node->src[0]->data;
+                     break;
+                 case GGML_OP_CPY:
+                     node->data = node->src[1]->data;
+                     break;
+                 default:
+                     GGML_ASSERT(!"unknown view op");
+                     break;
+             }
+         } else {
+             // see if we can reuse a parent's buffer (inplace)
+             if (ggml_op_can_inplace(node->op)) {
+                 for (int i = 0; i < GGML_MAX_SRC; i++) {
+                     struct ggml_tensor * parent = node->src[i];
+                     if (parent == NULL) {
+                         break;
+                     }
+
+                     // if the node's data is external, then we cannot re-use it
+                     if ((char *) parent->data < (char *) alloc->data ||
+                         (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                         AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                         continue;
+                     }
+
+                     struct hash_node * p_hn = hash_get(ht, parent);
+                     if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
+                         if (ggml_is_view(parent)) {
+                             struct ggml_tensor * view_src = get_view_source(parent);
+                             struct hash_node * view_src_hn = hash_get(ht, view_src);
+                             if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                                 // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
+                                 // the parent's data that it will need later (same layout requirement). the problem is that then
+                                 // we cannot free the tensor because the original address of the allocation is lost.
+                                 // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
+                                 // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
+                                 AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                                 node->data = parent->data;
+                                 return;
+                             }
+                         }
+                         else {
+                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                             node->data = parent->data;
+                             return;
+                         }
+                     }
+                 }
+             }
+             ggml_allocr_alloc(alloc, node);
+         }
+     }
+ }
+
+ static size_t ggml_allocator_alloc_graph_tensors_n(
+     struct ggml_allocr * alloc,
+     struct ggml_cgraph ** graphs, int n_graphs,
+     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
+
+     // reset hash table
+     struct hash_node * ht = alloc->hash_table;
+     memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
+
+     // count number of children and views
+     for (int g = 0; g < n_graphs; g++) {
+         struct ggml_cgraph * gf = graphs[g];
+         for (int i = 0; i < gf->n_nodes; i++) {
+             struct ggml_tensor * node = gf->nodes[i];
+
+             if (ggml_is_view(node)) {
+                 struct ggml_tensor * view_src = get_view_source(node);
+                 hash_get(ht, view_src)->n_views += 1;
+             }
+
+             for (int j = 0; j < GGML_MAX_SRC; j++) {
+                 struct ggml_tensor * parent = node->src[j];
+                 if (parent == NULL) {
+                     break;
+                 }
+                 hash_get(ht, parent)->n_children += 1;
+             }
+         }
+     }
+
+     // allocate tensors
+     for (int g = 0; g < n_graphs; g++) {
+         struct ggml_cgraph * gf = graphs[g];
+         AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
+         // graph inputs are allocated first to ensure that they are not overwritten by each other
+         if (inputs != NULL && inputs[g] != NULL) {
+             for (int i = 0; inputs[g][i] != NULL; i++) {
+                 struct ggml_tensor * input = inputs[g][i];
+                 AT_PRINTF("input: %s\n", input->name);
+                 allocate_node(alloc, input);
+             }
+         }
+         // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+         int last_barrier_pos = 0;
+         int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+         for (int ind = 0; ind < n_nodes; ind++) {
+             // allocate a node if there is no parse_seq or this is not a barrier
+             if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                 int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                 struct ggml_tensor * node = gf->nodes[i];
+
+                 // allocate parents (leafs)
+                 for (int j = 0; j < GGML_MAX_SRC; j++) {
+                     struct ggml_tensor * parent = node->src[j];
+                     if (parent == NULL) {
+                         break;
+                     }
+                     allocate_node(alloc, parent);
+                 }
+
+                 // allocate node
+                 allocate_node(alloc, node);
+
+                 AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                 for (int j = 0; j < GGML_MAX_SRC; j++) {
+                     struct ggml_tensor * parent = node->src[j];
+                     if (parent == NULL) {
+                         break;
+                     }
+                     AT_PRINTF("%s", parent->name);
+                     if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                         AT_PRINTF(", ");
+                     }
+                 }
+                 AT_PRINTF("\n");
+             }
+
+
+             // update parents
+             // update immediately if there is no parse_seq
+             // update only at barriers if there is parse_seq
+             if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                 int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                 int update_end = alloc->parse_seq_len ? ind : ind + 1;
+                 for (int i = update_start; i < update_end; i++) {
+                     int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                     struct ggml_tensor * node = gf->nodes[node_i];
+
+                     for (int j = 0; j < GGML_MAX_SRC; j++) {
+                         struct ggml_tensor * parent = node->src[j];
+                         if (parent == NULL) {
+                             break;
+                         }
+                         struct hash_node * p_hn = hash_get(ht, parent);
+                         p_hn->n_children -= 1;
+
+                         //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                         if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                             if (ggml_is_view(parent)) {
+                                 struct ggml_tensor * view_src = get_view_source(parent);
+                                 struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                 view_src_hn->n_views -= 1;
+                                 AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                     ggml_allocator_free_tensor(alloc, view_src);
+                                 }
+                             }
+                             else {
+                                 if (parent->data != node->data) {
+                                     ggml_allocator_free_tensor(alloc, parent);
+                                 }
+                             }
+                         }
+                     }
+                 }
+                 AT_PRINTF("\n");
+                 if (alloc->parse_seq_len) {
+                     last_barrier_pos = ind + 1;
+                 }
+             }
+         }
+         // free graph outputs here that wouldn't be freed otherwise because they have no children
+         if (outputs != NULL && outputs[g] != NULL) {
+             for (int i = 0; outputs[g][i] != NULL; i++) {
+                 struct ggml_tensor * output = outputs[g][i];
+                 AT_PRINTF("output: %s\n", output->name);
+                 ggml_allocator_free_tensor(alloc, output);
+             }
+         }
+     }
+
+     return alloc->max_size;
+ }
+
+ size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
+     return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ }
@@ -0,0 +1,26 @@
+ #pragma once
+
+ #include "ggml.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+
+ GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
+ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+
+ // tell the allocator to parse nodes following the order described in the list
+ // you should call this if your graph is optimized to execute out-of-order
+ GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
+
+ GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
+ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
+ GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
+ GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+
+
+ #ifdef __cplusplus
+ }
+ #endif
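The header above exposes the intended two-pass workflow: first measure the worst-case buffer size with a measure allocator (which places tensors in a fake 1 TB range starting at 0x1000), then allocate a real buffer of that size and run the graph allocation again. A minimal sketch of this usage, assuming a hypothetical `build_graph()` helper that rebuilds the compute graph on each call and an example alignment of 32 bytes:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include <stdlib.h>

    // hypothetical helper: builds the compute graph (typically in a fresh no_alloc context)
    struct ggml_cgraph * build_graph(void);

    void run_with_allocr(void) {
        const size_t alignment = 32; // example value; use the backend's required tensor alignment

        // pass 1: measure -- no real memory is written, only the peak size is recorded
        struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
        size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph()) + alignment; // headroom in case the buffer is not aligned
        ggml_allocr_free(measure);

        // pass 2: allocate a real buffer of the measured size and place the tensors in it
        void * buf = malloc(mem_size);
        struct ggml_allocr * alloc = ggml_allocr_new(buf, mem_size, alignment);
        ggml_allocr_alloc_graph(alloc, build_graph()); // graph is rebuilt so tensor data pointers start out NULL
        // ... evaluate the graph here ...

        ggml_allocr_free(alloc);
        free(buf);
    }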