llama_cpp 0.9.2 → 0.9.3

This diff shows the changes between the two published package versions as they appear in their public registry and is provided for informational purposes only. The hunks below are from the crate's bundled ggml allocator source (ggml-alloc.c).
@@ -1,51 +1,21 @@
 #include "ggml-alloc.h"
-#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
 #include "ggml.h"
+#include "ggml-impl.h"
 #include <assert.h>
+#include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-
-#define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+#define MAX_FREE_BLOCKS 256
 
 //#define GGML_ALLOCATOR_DEBUG
 
-//#define AT_PRINTF printf
-#define AT_PRINTF(...) ((void)0)
-
-struct hash_node {
-    struct ggml_tensor * t;
-    int n_children;
-    int n_views;
-};
-
-static size_t hash(void * p) {
-    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
-}
-
-static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
-    size_t h = hash(t);
-
-    // linear probing
-    size_t i = h;
-    while (hash_table[i].t != NULL) {
-        if (hash_table[i].t == t) {
-            return &hash_table[i];
-        }
-        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
-        if (i == h) {
-            // hash table is full
-            GGML_ASSERT(false);
-        }
-    }
-
-    hash_table[i].t = t;
-    return &hash_table[i];
-}
+//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+#define AT_PRINTF(...)
 
 // TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
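The replacement AT_PRINTF pair above compiles the allocator trace out by default. As the commented-out line in the added code suggests, the trace used throughout the rest of this file can be re-enabled by defining the macro as an fprintf forwarder; an illustrative one-liner, not part of the package:

    #define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)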
@@ -59,20 +29,18 @@ struct free_block {
     size_t size;
 };
 
-#define MAX_FREE_BLOCKS 256
-
-struct ggml_allocr {
+struct ggml_tallocr {
     struct ggml_backend_buffer * buffer;
     bool buffer_owned;
-    void * data;
+    void * base;
     size_t alignment;
+
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
-    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
     size_t max_size;
+
     bool measure;
-    int parse_seq[GGML_MAX_CONCUR];
-    int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -80,7 +48,7 @@ struct ggml_allocr {
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == NULL) {
             alloc->allocated_tensors[i] = tensor;
@@ -89,7 +57,7 @@ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i] == tensor ||
             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -103,7 +71,7 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 #endif
 
 // check if a tensor is allocated by this buffer
-static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
     return tensor->buffer == alloc->buffer;
 }
 
@@ -111,7 +79,7 @@ static bool ggml_is_view(struct ggml_tensor * t) {
     return t->view_src != NULL;
 }
 
-void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
 
@@ -162,9 +130,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     }
 
     tensor->data = addr;
-    AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
     tensor->buffer = alloc->buffer;
-    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
+    if (!alloc->measure) {
+        ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
+    }
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -180,16 +149,16 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     }
 #endif
 
-    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
+    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    if (ggml_allocr_is_own(alloc, tensor) == false) {
+static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+    if (ggml_tallocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
-        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
+        // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
@@ -199,7 +168,9 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
 
-    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
+    if (!alloc->measure) {
+        ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
+    }
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -253,91 +224,180 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
     alloc->n_free_blocks++;
 }
 
-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
-    for (int i = 0; i < n; i++) {
-        alloc->parse_seq[i] = list[i];
-    }
-    alloc->parse_seq_len = n;
-}
-
-void ggml_allocr_reset(struct ggml_allocr * alloc) {
+void ggml_tallocr_reset(ggml_tallocr_t alloc) {
     alloc->n_free_blocks = 1;
-    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
-    alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+    size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
+    alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
+
+    if (alloc->measure) {
+        alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+    } else {
+        alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+    }
 }
 
-struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
     struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
 
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
+    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
-    *alloc = (struct ggml_allocr){
+    *alloc = (struct ggml_tallocr) {
         /*.buffer = */ buffer,
         /*.buffer_owned = */ true,
         /*.base = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
-        /*.hash_table = */ {{0}},
         /*.max_size = */ 0,
         /*.measure = */ false,
-        /*.parse_seq = */ {0},
-        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {0},
 #endif
     };
 
-    ggml_allocr_reset(alloc);
+    ggml_tallocr_reset(alloc);
+
+    return alloc;
+}
+
+ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
+    ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
+    alloc->measure = true;
 
     return alloc;
 }
 
-struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
+
+    // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
+    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
+    alloc->buffer_owned = true;
     alloc->measure = true;
+    ggml_tallocr_reset(alloc);
+    return alloc;
+}
 
+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
+    alloc->buffer_owned = true;
     return alloc;
 }
 
-struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
+ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
-    *alloc = (struct ggml_allocr){
+    *alloc = (struct ggml_tallocr) {
         /*.buffer = */ buffer,
         /*.buffer_owned = */ false,
         /*.base = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
-        /*.hash_table = */ {{0}},
        /*.max_size = */ 0,
         /*.measure = */ false,
-        /*.parse_seq = */ {0},
-        /*.parse_seq_len = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {0},
 #endif
     };
 
-    ggml_allocr_reset(alloc);
+    ggml_tallocr_reset(alloc);
 
     return alloc;
 }
 
-void ggml_allocr_free(struct ggml_allocr * alloc) {
+struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
+    return alloc->buffer;
+}
+
+void ggml_tallocr_free(ggml_tallocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     if (alloc->buffer_owned) {
         ggml_backend_buffer_free(alloc->buffer);
     }
     free(alloc);
 }
 
-bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
+bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
     return alloc->measure;
 }
 
-//////////// compute graph allocator
+size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
+    return alloc->max_size;
+}
+
+// graph allocator
+
+struct hash_node {
+    int n_children;
+    int n_views;
+};
+
+struct ggml_gallocr {
+    ggml_tallocr_t talloc;
+    struct ggml_hash_set hash_set;
+    struct hash_node * hash_values;
+    size_t hash_values_size;
+    ggml_tallocr_t * hash_allocs;
+    int * parse_seq;
+    int parse_seq_len;
+};
+
+ggml_gallocr_t ggml_gallocr_new(void) {
+    ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
+
+    *galloc = (struct ggml_gallocr) {
+        /*.talloc = */ NULL,
+        /*.hash_set = */ {0},
+        /*.hash_values = */ NULL,
+        /*.hash_values_size = */ 0,
+        /*.hash_allocs = */ NULL,
+        /*.parse_seq = */ NULL,
+        /*.parse_seq_len = */ 0,
+    };
+
+    return galloc;
+}
+
+void ggml_gallocr_free(ggml_gallocr_t galloc) {
+    if (galloc == NULL) {
+        return;
+    }
+
+    if (galloc->hash_set.keys != NULL) {
+        free(galloc->hash_set.keys);
+    }
+    if (galloc->hash_values != NULL) {
+        free(galloc->hash_values);
+    }
+    if (galloc->hash_allocs != NULL) {
+        free(galloc->hash_allocs);
+    }
+    if (galloc->parse_seq != NULL) {
+        free(galloc->parse_seq);
+    }
+    free(galloc);
+}
+
+void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
+    free(galloc->parse_seq);
+    galloc->parse_seq = malloc(sizeof(int) * n);
+
+    for (int i = 0; i < n; i++) {
+        galloc->parse_seq[i] = list[i];
+    }
+    galloc->parse_seq_len = n;
+}
+
+static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    return &galloc->hash_values[i];
+}
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
     if (a->type != b->type) {
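The hunk above replaces the single ggml_allocr with two cooperating objects: a tensor allocator (ggml_tallocr) that manages one buffer, and a graph allocator (ggml_gallocr) that walks a compute graph and decides when tensors can be placed, reused in-place, or freed. A minimal sketch of the usual measure-then-allocate flow with the new API; the `backend` and `graph` variables are assumed to come from the caller, and ggml_gallocr_alloc_graph is added later in this diff:

    // measure pass: only sizes are tracked, no real memory is written
    ggml_tallocr_t measure = ggml_tallocr_new_measure_from_backend(backend);
    ggml_gallocr_t galloc  = ggml_gallocr_new();
    size_t mem_size = ggml_gallocr_alloc_graph(galloc, measure, graph);
    ggml_tallocr_free(measure);

    // real pass: allocate a backend buffer of the measured size and place the
    // tensors in it (the graph is normally rebuilt between the two passes)
    ggml_tallocr_t talloc = ggml_tallocr_new_from_backend(backend, mem_size);
    ggml_gallocr_alloc_graph(galloc, talloc, graph);

    // ... evaluate the graph, then release both allocators ...
    ggml_tallocr_free(talloc);
    ggml_gallocr_free(galloc);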
@@ -378,27 +438,40 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
-static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view, bool update_backend) {
-    assert(view->view_src != NULL && view->view_src->data != NULL);
+static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+    if (galloc->talloc != NULL) {
+        return galloc->talloc;
+    }
+
+    return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
+}
+
+static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
+    ggml_tallocr_t alloc = node_tallocr(galloc, view);
 
+    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
+    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-
     view->buffer = view->view_src->buffer;
     view->data = (char *)view->view_src->data + view->view_offs;
 
     // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
     // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
-    ggml_backend_buffer_init_tensor(alloc->buffer, view);
+    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+
+    if (!alloc->measure) {
+        ggml_backend_buffer_init_tensor(alloc->buffer, view);
+    }
 }
 
-static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
-    struct hash_node * ht = alloc->hash_table;
+static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+    ggml_tallocr_t alloc = node_tallocr(galloc, node);
+
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            init_view(alloc, node, true);
+            init_view(galloc, node, true);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -409,16 +482,16 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                    }
 
                    // if the node's data is external, then we cannot re-use it
-                    if (ggml_allocr_is_own(alloc, parent) == false) {
+                    if (ggml_tallocr_is_own(alloc, parent) == false) {
                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                        continue;
                    }
 
-                    struct hash_node * p_hn = hash_get(ht, parent);
+                    struct hash_node * p_hn = hash_get(galloc, parent);
                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                        if (ggml_is_view(parent)) {
                            struct ggml_tensor * view_src = parent->view_src;
-                            struct hash_node * view_src_hn = hash_get(ht, view_src);
+                            struct hash_node * view_src_hn = hash_get(galloc, view_src);
                            if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                                // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
                                // the parent's data that it will need later (same layout requirement). the problem is that then
@@ -428,170 +501,267 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                                node->view_src = view_src;
                                view_src_hn->n_views += 1;
-                                init_view(alloc, node, false);
+                                init_view(galloc, node, false);
                                return;
                            }
                        } else {
                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                            node->view_src = parent;
                            p_hn->n_views += 1;
-                            init_view(alloc, node, false);
+                            init_view(galloc, node, false);
                            return;
                        }
                    }
                }
            }
-            ggml_allocr_alloc(alloc, node);
+            ggml_tallocr_alloc(alloc, node);
        }
    }
 }
 
-size_t ggml_allocr_alloc_graph_n(
-        struct ggml_allocr * alloc,
-        struct ggml_cgraph ** graphs, int n_graphs,
-        struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
+static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
+    ggml_tallocr_t alloc = node_tallocr(galloc, node);
 
-    // reset hash table
-    struct hash_node * ht = alloc->hash_table;
-    memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
+    ggml_tallocr_free_tensor(alloc, node);
+}
+
+static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
+    const int * parse_seq = galloc->parse_seq;
+    int parse_seq_len = galloc->parse_seq_len;
 
     // count number of children and views
-    for (int g = 0; g < n_graphs; g++) {
-        struct ggml_cgraph * gf = graphs[g];
-        for (int i = 0; i < gf->n_nodes; i++) {
+    for (int i = 0; i < gf->n_nodes; i++) {
+        struct ggml_tensor * node = gf->nodes[i];
+
+        if (ggml_is_view(node)) {
+            struct ggml_tensor * view_src = node->view_src;
+            hash_get(galloc, view_src)->n_views += 1;
+            if (node->buffer == NULL && node->data != NULL) {
+                // view of a pre-allocated tensor, didn't call init_view() yet
+                init_view(galloc, node, true);
+            }
+        }
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                break;
+            }
+            hash_get(galloc, parent)->n_children += 1;
+            if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+                init_view(galloc, parent, true);
+            }
+        }
+    }
+
+    // allocate tensors
+    // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+    int last_barrier_pos = 0;
+    int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
+
+    for (int ind = 0; ind < n_nodes; ind++) {
+        // allocate a node if there is no parse_seq or this is not a barrier
+        if (parse_seq_len == 0 || parse_seq[ind] != -1) {
+            int i = parse_seq_len ? parse_seq[ind] : ind;
            struct ggml_tensor * node = gf->nodes[i];
 
-            if (ggml_is_view(node)) {
-                struct ggml_tensor * view_src = node->view_src;
-                hash_get(ht, view_src)->n_views += 1;
-                if (node->buffer == NULL && node->data != NULL) {
-                    // view of a pre-allocated tensor, didn't call init_view() yet
-                    init_view(alloc, node, true);
+            // allocate parents (leafs)
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
                }
+                allocate_node(galloc, parent);
            }
 
+            // allocate node
+            allocate_node(galloc, node);
+
+            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * parent = node->src[j];
                if (parent == NULL) {
                    break;
                }
-                hash_get(ht, parent)->n_children += 1;
-                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                    init_view(alloc, parent, true);
+                AT_PRINTF("%s", parent->name);
+                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                    AT_PRINTF(", ");
                }
            }
+            AT_PRINTF("\n");
        }
-    }
-
-    // allocate tensors
-    for (int g = 0; g < n_graphs; g++) {
-        struct ggml_cgraph * gf = graphs[g];
-        AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
-        // graph inputs are allocated first to ensure that they are not overwritten by each other
-        if (inputs != NULL && inputs[g] != NULL) {
-            for (int i = 0; inputs[g][i] != NULL; i++) {
-                struct ggml_tensor * input = inputs[g][i];
-                AT_PRINTF("input: %s\n", input->name);
-                allocate_node(alloc, input);
-            }
-        }
-        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
-        int last_barrier_pos = 0;
-        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
 
-        for (int ind = 0; ind < n_nodes; ind++) {
-            // allocate a node if there is no parse_seq or this is not a barrier
-            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
-                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
-                struct ggml_tensor * node = gf->nodes[i];
+        // update parents
+        // update immediately if there is no parse_seq
+        // update only at barriers if there is parse_seq
+        if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
+            int update_start = parse_seq_len ? last_barrier_pos : ind;
+            int update_end = parse_seq_len ? ind : ind + 1;
+            for (int i = update_start; i < update_end; i++) {
+                int node_i = parse_seq_len ? parse_seq[i] : i;
+                struct ggml_tensor * node = gf->nodes[node_i];
 
-                // allocate parents (leafs)
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * parent = node->src[j];
                    if (parent == NULL) {
                        break;
                    }
-                    allocate_node(alloc, parent);
-                }
+                    struct hash_node * p_hn = hash_get(galloc, parent);
+                    p_hn->n_children -= 1;
 
-                // allocate node
-                allocate_node(alloc, node);
+                    //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
 
-                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-                for (int j = 0; j < GGML_MAX_SRC; j++) {
-                    struct ggml_tensor * parent = node->src[j];
-                    if (parent == NULL) {
-                        break;
-                    }
-                    AT_PRINTF("%s", parent->name);
-                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                        AT_PRINTF(", ");
-                    }
-                }
-                AT_PRINTF("\n");
-            }
-
-            // update parents
-            // update immediately if there is no parse_seq
-            // update only at barriers if there is parse_seq
-            if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
-                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
-                int update_end = alloc->parse_seq_len ? ind : ind + 1;
-                for (int i = update_start; i < update_end; i++) {
-                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
-                    struct ggml_tensor * node = gf->nodes[node_i];
-
-                    for (int j = 0; j < GGML_MAX_SRC; j++) {
-                        struct ggml_tensor * parent = node->src[j];
-                        if (parent == NULL) {
-                            break;
-                        }
-                        struct hash_node * p_hn = hash_get(ht, parent);
-                        p_hn->n_children -= 1;
-
-                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                            if (ggml_is_view(parent)) {
-                                struct ggml_tensor * view_src = parent->view_src;
-                                struct hash_node * view_src_hn = hash_get(ht, view_src);
-                                view_src_hn->n_views -= 1;
-                                AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
-                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                                    ggml_allocr_free_tensor(alloc, view_src);
-                                }
-                            }
-                            else {
-                                if (parent->data != node->data) {
-                                    ggml_allocr_free_tensor(alloc, parent);
-                                }
+                    if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                        if (ggml_is_view(parent)) {
+                            struct ggml_tensor * view_src = parent->view_src;
+                            struct hash_node * view_src_hn = hash_get(galloc, view_src);
+                            view_src_hn->n_views -= 1;
+                            AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                            if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
+                                free_node(galloc, view_src);
                            }
                        }
+                        else {
+                            free_node(galloc, parent);
+                        }
                    }
                }
-                AT_PRINTF("\n");
-                if (alloc->parse_seq_len) {
-                    last_barrier_pos = ind + 1;
-                }
            }
-        }
-        // free graph outputs here that wouldn't be freed otherwise because they have no children
-        if (outputs != NULL && outputs[g] != NULL) {
-            for (int i = 0; outputs[g][i] != NULL; i++) {
-                struct ggml_tensor * output = outputs[g][i];
-                AT_PRINTF("output: %s\n", output->name);
-                ggml_allocr_free_tensor(alloc, output);
+            AT_PRINTF("\n");
+            if (parse_seq_len) {
+                last_barrier_pos = ind + 1;
            }
        }
    }
+}
 
-    return alloc->max_size;
+size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
+    size_t hash_size = graph->visited_hash_table.size;
+
+    // check if the hash table is initialized and large enough
+    if (galloc->hash_set.size < hash_size) {
+        if (galloc->hash_set.keys != NULL) {
+            free(galloc->hash_set.keys);
+        }
+        if (galloc->hash_values != NULL) {
+            free(galloc->hash_values);
+        }
+        galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+        galloc->hash_set.size = hash_size;
+        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+    }
+
+    // reset hash table
+    memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+
+    galloc->talloc = talloc;
+    ggml_tallocr_alloc_graph_impl(galloc, graph);
+    galloc->talloc = NULL;
+
+    size_t max_size = ggml_tallocr_max_size(talloc);
+
+    return max_size;
 }
 
-size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
+void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
+    const size_t hash_size = hash_set.size;
+
+    GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
+
+    galloc->talloc = NULL;
+
+    // alloc hash_values if needed
+    if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+        galloc->hash_values_size = hash_size;
+    }
+
+    // free hash_set.keys if needed
+    if (galloc->hash_set.keys != NULL) {
+        free(galloc->hash_set.keys);
+    }
+    galloc->hash_set = hash_set;
+
+    // reset hash values
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+
+    galloc->hash_allocs = hash_node_talloc;
+
+    ggml_tallocr_alloc_graph_impl(galloc, graph);
+
+    // remove unowned resources
+    galloc->hash_set.keys = NULL;
+    galloc->hash_allocs = NULL;
 }
 
-size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
-    return alloc->max_size;
+// legacy API wrapper
+
+struct ggml_allocr {
+    ggml_tallocr_t talloc;
+    ggml_gallocr_t galloc;
+};
+
+static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
+    ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
+    *alloc = (struct ggml_allocr) {
+        /*.talloc = */ talloc,
+        /*.galloc = */ ggml_gallocr_new(),
+    };
+    return alloc;
+}
+
+ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
+    return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
+}
+
+ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
+    return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
+}
+
+ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
+}
+
+ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
+}
+
+ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
+}
+
+struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
+    return ggml_tallocr_get_buffer(alloc->talloc);
+}
+
+void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
+    ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
+}
+
+void ggml_allocr_free(ggml_allocr_t alloc) {
+    ggml_gallocr_free(alloc->galloc);
+    ggml_tallocr_free(alloc->talloc);
+    free(alloc);
+}
+
+bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
+    return ggml_tallocr_is_measure(alloc->talloc);
+}
+
+void ggml_allocr_reset(ggml_allocr_t alloc) {
+    ggml_tallocr_reset(alloc->talloc);
+}
+
+void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
+    ggml_tallocr_alloc(alloc->talloc, tensor);
+}
+
+size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
+    return ggml_tallocr_max_size(alloc->talloc);
+}
+
+size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
+    return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 }
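The legacy wrappers at the end of the diff keep the 0.9.2-era ggml_allocr_* call sequence working on top of the split allocators. A minimal usage sketch under assumed inputs (a caller-built `graph`, an illustrative 32-byte alignment, and a graph that is rebuilt between the measure pass and the real pass):

    // measure pass with the legacy API
    ggml_allocr_t alloc = ggml_allocr_new_measure(/*alignment =*/ 32);
    size_t mem_size = ggml_allocr_alloc_graph(alloc, graph);
    ggml_allocr_free(alloc);

    // real pass over a caller-owned buffer of the measured size
    void * buf = malloc(mem_size);
    alloc = ggml_allocr_new(buf, mem_size, /*alignment =*/ 32);
    ggml_allocr_alloc_graph(alloc, graph);

    // ... evaluate the graph ...
    ggml_allocr_free(alloc);
    free(buf);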