llama_cpp 0.9.2 → 0.9.4

@@ -1,51 +1,21 @@
  #include "ggml-alloc.h"
- #include "ggml-backend.h"
+ #include "ggml-backend-impl.h"
  #include "ggml.h"
+ #include "ggml-impl.h"
  #include <assert.h>
+ #include <limits.h>
  #include <stdarg.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

-
- #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
- #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+ #define MAX_FREE_BLOCKS 256

  //#define GGML_ALLOCATOR_DEBUG

- //#define AT_PRINTF printf
- #define AT_PRINTF(...) ((void)0)
-
- struct hash_node {
- struct ggml_tensor * t;
- int n_children;
- int n_views;
- };
-
- static size_t hash(void * p) {
- return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
- }
-
- static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
- size_t h = hash(t);
-
- // linear probing
- size_t i = h;
- while (hash_table[i].t != NULL) {
- if (hash_table[i].t == t) {
- return &hash_table[i];
- }
- i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
- if (i == h) {
- // hash table is full
- GGML_ASSERT(false);
- }
- }
-
- hash_table[i].t = t;
- return &hash_table[i];
- }
+ //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+ #define AT_PRINTF(...)

  // TODO: GGML_PAD ?
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
@@ -59,20 +29,18 @@ struct free_block {
  size_t size;
  };

- #define MAX_FREE_BLOCKS 256
-
- struct ggml_allocr {
+ struct ggml_tallocr {
  struct ggml_backend_buffer * buffer;
  bool buffer_owned;
- void * data;
+ void * base;
  size_t alignment;
+
  int n_free_blocks;
  struct free_block free_blocks[MAX_FREE_BLOCKS];
- struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
  size_t max_size;
+
  bool measure;
- int parse_seq[GGML_MAX_CONCUR];
- int parse_seq_len;

  #ifdef GGML_ALLOCATOR_DEBUG
  struct ggml_tensor * allocated_tensors[1024];
@@ -80,7 +48,7 @@ struct ggml_allocr {
  };

  #ifdef GGML_ALLOCATOR_DEBUG
- static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i] == NULL) {
  alloc->allocated_tensors[i] = tensor;
@@ -89,7 +57,7 @@ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor
  }
  GGML_ASSERT(!"out of allocated_tensors");
  }
- static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i] == tensor ||
  (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -103,7 +71,7 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
  #endif

  // check if a tensor is allocated by this buffer
- static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+ static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
  return tensor->buffer == alloc->buffer;
  }

@@ -111,7 +79,7 @@ static bool ggml_is_view(struct ggml_tensor * t) {
  return t->view_src != NULL;
  }

- void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated

@@ -162,9 +130,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  }

  tensor->data = addr;
- AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
  tensor->buffer = alloc->buffer;
- ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
+ if (!alloc->measure) {
+ ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
+ }

  #ifdef GGML_ALLOCATOR_DEBUG
  add_allocated_tensor(alloc, tensor);
@@ -180,16 +149,16 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  }
  #endif

- alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
+ alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
  }

  // this is a very naive implementation, but for our case the number of free blocks should be very small
- static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- if (ggml_allocr_is_own(alloc, tensor) == false) {
+ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+ if (ggml_tallocr_is_own(alloc, tensor) == false) {
  // the tensor was not allocated in this buffer
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
  // the easiest way to deal with this is just to ignore it
- AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
+ // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
  return;
  }

@@ -199,7 +168,9 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
  size = aligned_offset(NULL, size, alloc->alignment);
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);

- ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
+ if (!alloc->measure) {
+ ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
+ }

  #ifdef GGML_ALLOCATOR_DEBUG
  remove_allocated_tensor(alloc, tensor);
@@ -253,91 +224,180 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
  alloc->n_free_blocks++;
  }

- void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
- for (int i = 0; i < n; i++) {
- alloc->parse_seq[i] = list[i];
- }
- alloc->parse_seq_len = n;
- }
-
- void ggml_allocr_reset(struct ggml_allocr * alloc) {
+ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
  alloc->n_free_blocks = 1;
- size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
- alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
- alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+ size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
+ alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
+
+ if (alloc->measure) {
+ alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+ } else {
+ alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+ }
  }

- struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+ ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
  struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);

- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
+ ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

- *alloc = (struct ggml_allocr){
+ *alloc = (struct ggml_tallocr) {
  /*.buffer = */ buffer,
  /*.buffer_owned = */ true,
  /*.base = */ ggml_backend_buffer_get_base(buffer),
  /*.alignment = */ alignment,
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
- /*.hash_table = */ {{0}},
  /*.max_size = */ 0,
  /*.measure = */ false,
- /*.parse_seq = */ {0},
- /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ {0},
  #endif
  };

- ggml_allocr_reset(alloc);
+ ggml_tallocr_reset(alloc);
+
+ return alloc;
+ }
+
+ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
+ ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
+ alloc->measure = true;

  return alloc;
  }

- struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
- struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+ // create a backend buffer to get the correct tensor allocation sizes
+ ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
+
+ // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
+ ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
+ alloc->buffer_owned = true;
  alloc->measure = true;
+ ggml_tallocr_reset(alloc);
+ return alloc;
+ }

+ ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+ ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+ ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
+ alloc->buffer_owned = true;
  return alloc;
  }

- struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
+ ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+ ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

- *alloc = (struct ggml_allocr){
+ *alloc = (struct ggml_tallocr) {
  /*.buffer = */ buffer,
  /*.buffer_owned = */ false,
  /*.base = */ ggml_backend_buffer_get_base(buffer),
  /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
- /*.hash_table = */ {{0}},
  /*.max_size = */ 0,
  /*.measure = */ false,
- /*.parse_seq = */ {0},
- /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ {0},
  #endif
  };

- ggml_allocr_reset(alloc);
+ ggml_tallocr_reset(alloc);

  return alloc;
  }

- void ggml_allocr_free(struct ggml_allocr * alloc) {
+ struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
+ return alloc->buffer;
+ }
+
+ void ggml_tallocr_free(ggml_tallocr_t alloc) {
+ if (alloc == NULL) {
+ return;
+ }
+
  if (alloc->buffer_owned) {
  ggml_backend_buffer_free(alloc->buffer);
  }
  free(alloc);
  }

- bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
+ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
  return alloc->measure;
  }

- //////////// compute graph allocator
+ size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
+ return alloc->max_size;
+ }
+
+ // graph allocator
+
+ struct hash_node {
+ int n_children;
+ int n_views;
+ };
+
+ struct ggml_gallocr {
+ ggml_tallocr_t talloc;
+ struct ggml_hash_set hash_set;
+ struct hash_node * hash_values;
+ size_t hash_values_size;
+ ggml_tallocr_t * hash_allocs;
+ int * parse_seq;
+ int parse_seq_len;
+ };
+
+ ggml_gallocr_t ggml_gallocr_new(void) {
+ ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
+
+ *galloc = (struct ggml_gallocr) {
+ /*.talloc = */ NULL,
+ /*.hash_set = */ {0},
+ /*.hash_values = */ NULL,
+ /*.hash_values_size = */ 0,
+ /*.hash_allocs = */ NULL,
+ /*.parse_seq = */ NULL,
+ /*.parse_seq_len = */ 0,
+ };
+
+ return galloc;
+ }
+
+ void ggml_gallocr_free(ggml_gallocr_t galloc) {
+ if (galloc == NULL) {
+ return;
+ }
+
+ if (galloc->hash_set.keys != NULL) {
+ free(galloc->hash_set.keys);
+ }
+ if (galloc->hash_values != NULL) {
+ free(galloc->hash_values);
+ }
+ if (galloc->hash_allocs != NULL) {
+ free(galloc->hash_allocs);
+ }
+ if (galloc->parse_seq != NULL) {
+ free(galloc->parse_seq);
+ }
+ free(galloc);
+ }
+
+ void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
+ free(galloc->parse_seq);
+ galloc->parse_seq = malloc(sizeof(int) * n);
+
+ for (int i = 0; i < n; i++) {
+ galloc->parse_seq[i] = list[i];
+ }
+ galloc->parse_seq_len = n;
+ }
+
+ static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+ size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+ return &galloc->hash_values[i];
+ }

  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
  if (a->type != b->type) {
@@ -378,27 +438,40 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  }
  }

- static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view, bool update_backend) {
- assert(view->view_src != NULL && view->view_src->data != NULL);
+ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+ if (galloc->talloc != NULL) {
+ return galloc->talloc;
+ }
+
+ return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
+ }
+
+ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
+ ggml_tallocr_t alloc = node_tallocr(galloc, view);

+ //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
+ GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
  if (update_backend) {
  view->backend = view->view_src->backend;
  }
-
  view->buffer = view->view_src->buffer;
  view->data = (char *)view->view_src->data + view->view_offs;

  // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
  // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
- assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
- ggml_backend_buffer_init_tensor(alloc->buffer, view);
+ assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+
+ if (!alloc->measure) {
+ ggml_backend_buffer_init_tensor(alloc->buffer, view);
+ }
  }

- static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
- struct hash_node * ht = alloc->hash_table;
+ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+ ggml_tallocr_t alloc = node_tallocr(galloc, node);
+
  if (node->data == NULL) {
  if (ggml_is_view(node)) {
- init_view(alloc, node, true);
+ init_view(galloc, node, true);
  } else {
  // see if we can reuse a parent's buffer (inplace)
  if (ggml_op_can_inplace(node->op)) {
@@ -409,16 +482,16 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }

  // if the node's data is external, then we cannot re-use it
- if (ggml_allocr_is_own(alloc, parent) == false) {
+ if (ggml_tallocr_is_own(alloc, parent) == false) {
  AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
  continue;
  }

- struct hash_node * p_hn = hash_get(ht, parent);
+ struct hash_node * p_hn = hash_get(galloc, parent);
  if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
  if (ggml_is_view(parent)) {
  struct ggml_tensor * view_src = parent->view_src;
- struct hash_node * view_src_hn = hash_get(ht, view_src);
+ struct hash_node * view_src_hn = hash_get(galloc, view_src);
  if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
  // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
  // the parent's data that it will need later (same layout requirement). the problem is that then
@@ -428,170 +501,267 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
  node->view_src = view_src;
  view_src_hn->n_views += 1;
- init_view(alloc, node, false);
+ init_view(galloc, node, false);
  return;
  }
  } else {
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
  node->view_src = parent;
  p_hn->n_views += 1;
- init_view(alloc, node, false);
+ init_view(galloc, node, false);
  return;
  }
  }
  }
  }
- ggml_allocr_alloc(alloc, node);
+ ggml_tallocr_alloc(alloc, node);
  }
  }
  }

- size_t ggml_allocr_alloc_graph_n(
- struct ggml_allocr * alloc,
- struct ggml_cgraph ** graphs, int n_graphs,
- struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
+ static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+ ggml_tallocr_t alloc = node_tallocr(galloc, node);

- // reset hash table
- struct hash_node * ht = alloc->hash_table;
- memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
+ ggml_tallocr_free_tensor(alloc, node);
+ }
+
+ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
+ const int * parse_seq = galloc->parse_seq;
+ int parse_seq_len = galloc->parse_seq_len;

  // count number of children and views
- for (int g = 0; g < n_graphs; g++) {
- struct ggml_cgraph * gf = graphs[g];
- for (int i = 0; i < gf->n_nodes; i++) {
+ for (int i = 0; i < gf->n_nodes; i++) {
+ struct ggml_tensor * node = gf->nodes[i];
+
+ if (ggml_is_view(node)) {
+ struct ggml_tensor * view_src = node->view_src;
+ hash_get(galloc, view_src)->n_views += 1;
+ if (node->buffer == NULL && node->data != NULL) {
+ // view of a pre-allocated tensor, didn't call init_view() yet
+ init_view(galloc, node, true);
+ }
+ }
+
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ hash_get(galloc, parent)->n_children += 1;
+ if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+ init_view(galloc, parent, true);
+ }
+ }
+ }
+
+ // allocate tensors
+ // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+ int last_barrier_pos = 0;
+ int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
+
+ for (int ind = 0; ind < n_nodes; ind++) {
+ // allocate a node if there is no parse_seq or this is not a barrier
+ if (parse_seq_len == 0 || parse_seq[ind] != -1) {
+ int i = parse_seq_len ? parse_seq[ind] : ind;
  struct ggml_tensor * node = gf->nodes[i];

- if (ggml_is_view(node)) {
- struct ggml_tensor * view_src = node->view_src;
- hash_get(ht, view_src)->n_views += 1;
- if (node->buffer == NULL && node->data != NULL) {
- // view of a pre-allocated tensor, didn't call init_view() yet
- init_view(alloc, node, true);
+ // allocate parents (leafs)
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
  }
+ allocate_node(galloc, parent);
  }

+ // allocate node
+ allocate_node(galloc, node);
+
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * parent = node->src[j];
  if (parent == NULL) {
  break;
  }
- hash_get(ht, parent)->n_children += 1;
- if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
- init_view(alloc, parent, true);
+ AT_PRINTF("%s", parent->name);
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+ AT_PRINTF(", ");
  }
  }
+ AT_PRINTF("\n");
  }
- }
-
- // allocate tensors
- for (int g = 0; g < n_graphs; g++) {
- struct ggml_cgraph * gf = graphs[g];
- AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
- // graph inputs are allocated first to ensure that they are not overwritten by each other
- if (inputs != NULL && inputs[g] != NULL) {
- for (int i = 0; inputs[g][i] != NULL; i++) {
- struct ggml_tensor * input = inputs[g][i];
- AT_PRINTF("input: %s\n", input->name);
- allocate_node(alloc, input);
- }
- }
- // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
- int last_barrier_pos = 0;
- int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;

- for (int ind = 0; ind < n_nodes; ind++) {
- // allocate a node if there is no parse_seq or this is not a barrier
- if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
- int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
- struct ggml_tensor * node = gf->nodes[i];
+ // update parents
+ // update immediately if there is no parse_seq
+ // update only at barriers if there is parse_seq
+ if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
+ int update_start = parse_seq_len ? last_barrier_pos : ind;
+ int update_end = parse_seq_len ? ind : ind + 1;
+ for (int i = update_start; i < update_end; i++) {
+ int node_i = parse_seq_len ? parse_seq[i] : i;
+ struct ggml_tensor * node = gf->nodes[node_i];

- // allocate parents (leafs)
  for (int j = 0; j < GGML_MAX_SRC; j++) {
  struct ggml_tensor * parent = node->src[j];
  if (parent == NULL) {
  break;
  }
- allocate_node(alloc, parent);
- }
+ struct hash_node * p_hn = hash_get(galloc, parent);
+ p_hn->n_children -= 1;

- // allocate node
- allocate_node(alloc, node);
+ //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);

- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- AT_PRINTF("%s", parent->name);
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
- AT_PRINTF(", ");
- }
- }
- AT_PRINTF("\n");
- }
-
- // update parents
- // update immediately if there is no parse_seq
- // update only at barriers if there is parse_seq
- if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
- int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
- int update_end = alloc->parse_seq_len ? ind : ind + 1;
- for (int i = update_start; i < update_end; i++) {
- int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
- struct ggml_tensor * node = gf->nodes[node_i];
-
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- struct hash_node * p_hn = hash_get(ht, parent);
- p_hn->n_children -= 1;
-
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = parent->view_src;
- struct hash_node * view_src_hn = hash_get(ht, view_src);
- view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
- ggml_allocr_free_tensor(alloc, view_src);
- }
- }
- else {
- if (parent->data != node->data) {
- ggml_allocr_free_tensor(alloc, parent);
- }
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+ if (ggml_is_view(parent)) {
+ struct ggml_tensor * view_src = parent->view_src;
+ struct hash_node * view_src_hn = hash_get(galloc, view_src);
+ view_src_hn->n_views -= 1;
+ AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
+ free_node(galloc, view_src);
  }
  }
+ else {
+ free_node(galloc, parent);
+ }
  }
  }
- AT_PRINTF("\n");
- if (alloc->parse_seq_len) {
- last_barrier_pos = ind + 1;
- }
  }
- }
- // free graph outputs here that wouldn't be freed otherwise because they have no children
- if (outputs != NULL && outputs[g] != NULL) {
- for (int i = 0; outputs[g][i] != NULL; i++) {
- struct ggml_tensor * output = outputs[g][i];
- AT_PRINTF("output: %s\n", output->name);
- ggml_allocr_free_tensor(alloc, output);
+ AT_PRINTF("\n");
+ if (parse_seq_len) {
+ last_barrier_pos = ind + 1;
  }
  }
  }
+ }

- return alloc->max_size;
+ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
+ size_t hash_size = graph->visited_hash_table.size;
+
+ // check if the hash table is initialized and large enough
+ if (galloc->hash_set.size < hash_size) {
+ if (galloc->hash_set.keys != NULL) {
+ free(galloc->hash_set.keys);
+ }
+ if (galloc->hash_values != NULL) {
+ free(galloc->hash_values);
+ }
+ galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+ galloc->hash_set.size = hash_size;
+ galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+ }
+
+ // reset hash table
+ memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+
+ galloc->talloc = talloc;
+ ggml_tallocr_alloc_graph_impl(galloc, graph);
+ galloc->talloc = NULL;
+
+ size_t max_size = ggml_tallocr_max_size(talloc);
+
+ return max_size;
  }

- size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
- return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
+ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
+ const size_t hash_size = hash_set.size;
+
+ GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
+
+ galloc->talloc = NULL;
+
+ // alloc hash_values if needed
+ if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
+ free(galloc->hash_values);
+ galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+ galloc->hash_values_size = hash_size;
+ }
+
+ // free hash_set.keys if needed
+ if (galloc->hash_set.keys != NULL) {
+ free(galloc->hash_set.keys);
+ }
+ galloc->hash_set = hash_set;
+
+ // reset hash values
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+
+ galloc->hash_allocs = hash_node_talloc;
+
+ ggml_tallocr_alloc_graph_impl(galloc, graph);
+
+ // remove unowned resources
+ galloc->hash_set.keys = NULL;
+ galloc->hash_allocs = NULL;
  }

- size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
- return alloc->max_size;
+ // legacy API wrapper
+
+ struct ggml_allocr {
+ ggml_tallocr_t talloc;
+ ggml_gallocr_t galloc;
+ };
+
+ static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
+ ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
+ *alloc = (struct ggml_allocr) {
+ /*.talloc = */ talloc,
+ /*.galloc = */ ggml_gallocr_new(),
+ };
+ return alloc;
+ }
+
+ ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
+ return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
+ }
+
+ ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
+ return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
+ }
+
+ ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+ return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
+ }
+
+ ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+ return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
+ }
+
+ ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
+ return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
+ }
+
+ struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
+ return ggml_tallocr_get_buffer(alloc->talloc);
+ }
+
+ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
+ ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
+ }
+
+ void ggml_allocr_free(ggml_allocr_t alloc) {
+ ggml_gallocr_free(alloc->galloc);
+ ggml_tallocr_free(alloc->talloc);
+ free(alloc);
+ }
+
+ bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
+ return ggml_tallocr_is_measure(alloc->talloc);
+ }
+
+ void ggml_allocr_reset(ggml_allocr_t alloc) {
+ ggml_tallocr_reset(alloc->talloc);
+ }
+
+ void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
+ ggml_tallocr_alloc(alloc->talloc, tensor);
+ }
+
+ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
+ return ggml_tallocr_max_size(alloc->talloc);
+ }
+
+ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
+ return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
  }
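
Note: as the diff above shows, this release splits the bundled ggml-alloc.c allocator into a tensor allocator (ggml_tallocr) and a graph allocator (ggml_gallocr), and keeps struct ggml_allocr only as a thin legacy wrapper over both, so callers of the old API keep working. A minimal sketch of the measure-then-allocate pattern those wrapper functions continue to expose; build_graph() and the alignment value 32 are hypothetical placeholders, not part of this diff, and real callers may need to add alignment padding to the measured size:

    // first pass: a measure allocator records how much memory the graph needs
    ggml_allocr_t measure = ggml_allocr_new_measure(/*alignment=*/32);
    size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph());
    ggml_allocr_free(measure);

    // second pass: allocate the graph for real in a caller-owned buffer
    void * buf = malloc(mem_size);
    ggml_allocr_t alloc = ggml_allocr_new(buf, mem_size, /*alignment=*/32);
    ggml_allocr_alloc_graph(alloc, build_graph()); // internally forwards to ggml_gallocr_alloc_graph(galloc, talloc, graph)
    ggml_allocr_free(alloc);
    free(buf);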