llama_cpp 0.12.5 → 0.12.7

This diff represents the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -17,6 +17,50 @@
17
17
  //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
18
18
  #define AT_PRINTF(...)
19
19
 
20
+
21
+ static bool ggml_is_view(const struct ggml_tensor * t) {
22
+ return t->view_src != NULL;
23
+ }
24
+
25
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
+ if (a->type != b->type) {
27
+ return false;
28
+ }
29
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
+ if (a->ne[i] != b->ne[i]) {
31
+ return false;
32
+ }
33
+ if (a->nb[i] != b->nb[i]) {
34
+ return false;
35
+ }
36
+ }
37
+ return true;
38
+ }
39
+
40
+ static bool ggml_op_can_inplace(enum ggml_op op) {
41
+ switch (op) {
42
+ case GGML_OP_SCALE:
43
+ case GGML_OP_DIAG_MASK_ZERO:
44
+ case GGML_OP_DIAG_MASK_INF:
45
+ case GGML_OP_ADD:
46
+ case GGML_OP_ADD1:
47
+ case GGML_OP_SUB:
48
+ case GGML_OP_MUL:
49
+ case GGML_OP_DIV:
50
+ case GGML_OP_SQR:
51
+ case GGML_OP_SQRT:
52
+ case GGML_OP_LOG:
53
+ case GGML_OP_UNARY:
54
+ case GGML_OP_ROPE:
55
+ case GGML_OP_RMS_NORM:
56
+ case GGML_OP_SOFT_MAX:
57
+ return true;
58
+
59
+ default:
60
+ return false;
61
+ }
62
+ }
63
+
20
64
  // TODO: GGML_PAD ?
21
65
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
22
66
  assert(alignment && !(alignment & (alignment - 1))); // power of 2
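Context note (not part of the package): the `aligned_offset` helper shown above rounds an offset up so that `buffer + offset` lands on a multiple of a power-of-two alignment. A minimal standalone sketch of that round-up, for illustration only:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the round-up an aligned_offset()-style helper performs:
 * advance `offset` so that `buffer + offset` is a multiple of `alignment`,
 * which must be a power of two. */
static size_t aligned_offset_sketch(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    size_t align = (alignment - ((uintptr_t)buffer + offset) % alignment) % alignment;
    return offset + align;
}

int main(void) {
    printf("%zu\n", aligned_offset_sketch(NULL, 0,  32)); // 0  (already aligned)
    printf("%zu\n", aligned_offset_sketch(NULL, 1,  32)); // 32 (rounded up)
    printf("%zu\n", aligned_offset_sketch(NULL, 33, 32)); // 64
    return 0;
}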
@@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
24
68
  return offset + align;
25
69
  }
26
70
 
71
+ // tallocr
72
+ struct ggml_tallocr {
73
+ ggml_backend_buffer_t buffer;
74
+ void * base;
75
+ size_t alignment;
76
+ size_t offset;
77
+ };
78
+
79
+ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
80
+ ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
81
+ if (talloc == NULL) {
82
+ return NULL;
83
+ }
84
+
85
+ void * base = ggml_backend_buffer_get_base(buffer);
86
+ size_t align = ggml_backend_buffer_get_alignment(buffer);
87
+
88
+ assert(align && !(align & (align - 1))); // power of 2
89
+
90
+ *talloc = (struct ggml_tallocr) {
91
+ /*.buffer = */ buffer,
92
+ /*.base = */ base,
93
+ /*.alignment = */ align,
94
+ /*.offset = */ aligned_offset(base, 0, align),
95
+ };
96
+ return talloc;
97
+ }
98
+
99
+ void ggml_tallocr_free(ggml_tallocr_t talloc) {
100
+ free(talloc);
101
+ }
102
+
103
+ void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
104
+ size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
105
+ size = GGML_PAD(size, talloc->alignment);
106
+
107
+ if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
108
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
109
+ __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
110
+ GGML_ASSERT(!"not enough space in the buffer");
111
+ return;
112
+ }
113
+
114
+ void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
115
+ talloc->offset += size;
116
+
117
+ assert(((uintptr_t)addr % talloc->alignment) == 0);
118
+
119
+ ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
120
+ }
121
+
122
+ // dynamic tensor allocator
123
+
27
124
  struct free_block {
28
- void * addr;
125
+ size_t offset;
29
126
  size_t size;
30
127
  };
31
128
 
32
- struct ggml_tallocr {
33
- struct ggml_backend_buffer * buffer;
34
- bool buffer_owned;
35
- void * base;
129
+ struct ggml_dyn_tallocr {
36
130
  size_t alignment;
37
-
38
131
  int n_free_blocks;
39
132
  struct free_block free_blocks[MAX_FREE_BLOCKS];
40
-
41
133
  size_t max_size;
42
134
 
43
- bool measure;
44
-
45
135
  #ifdef GGML_ALLOCATOR_DEBUG
46
- struct ggml_tensor * allocated_tensors[1024];
136
+ struct {
137
+ const struct ggml_tensor * tensor;
138
+ size_t offset;
139
+ } allocated_tensors[1024];
47
140
  #endif
48
141
  };
49
142
 
50
143
  #ifdef GGML_ALLOCATOR_DEBUG
51
- static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
144
+ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
52
145
  for (int i = 0; i < 1024; i++) {
53
- if (alloc->allocated_tensors[i] == NULL) {
54
- alloc->allocated_tensors[i] = tensor;
146
+ if (alloc->allocated_tensors[i].tensor == NULL) {
147
+ alloc->allocated_tensors[i].tensor = tensor;
148
+ alloc->allocated_tensors[i].offset = offset;
55
149
  return;
56
150
  }
57
151
  }
58
152
  GGML_ASSERT(!"out of allocated_tensors");
59
153
  }
60
- static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
154
+ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
61
155
  for (int i = 0; i < 1024; i++) {
62
- if (alloc->allocated_tensors[i] == tensor ||
63
- (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
64
- alloc->allocated_tensors[i] = NULL;
156
+ if (alloc->allocated_tensors[i].offset == offset) {
157
+ alloc->allocated_tensors[i].tensor = NULL;
65
158
  return;
66
159
  }
67
160
  }
68
- printf("tried to free tensor %s not found\n", tensor->name);
161
+ fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
69
162
  GGML_ASSERT(!"tensor not found");
70
163
  }
71
164
  #endif
72
165
 
73
- // check if a tensor is allocated by this buffer
74
- static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
75
- return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
76
- }
77
-
78
- static bool ggml_is_view(struct ggml_tensor * t) {
79
- return t->view_src != NULL;
80
- }
81
-
82
- void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
83
- GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
84
- GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
85
-
86
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
166
+ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
87
167
  size = aligned_offset(NULL, size, alloc->alignment);
88
168
 
89
169
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
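Context note: the hunk above reworks `ggml_tallocr` into a simple bump allocator over an existing backend buffer (`ggml_tallocr_new(buffer)`, `ggml_tallocr_alloc`, `ggml_tallocr_free`). A minimal usage sketch, not part of the package, assuming the ggml.h / ggml-alloc.h / ggml-backend.h headers shipped with this release and the CPU buffer type API used elsewhere in this diff:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void tallocr_sketch(void) {
    // tensor metadata only; no tensor data is allocated by the context itself
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2 * ggml_tensor_overhead() + 256,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    // one backend buffer large enough for both tensors plus alignment padding
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 2 * 1024 * sizeof(float) + 1024);

    // the reworked tallocr sub-allocates sequentially inside that buffer
    ggml_tallocr_t talloc = ggml_tallocr_new(buffer);
    ggml_tallocr_alloc(talloc, a);   // assigns a->data inside the buffer
    ggml_tallocr_alloc(talloc, b);
    ggml_tallocr_free(talloc);       // frees only the allocator, not the buffer

    ggml_backend_buffer_free(buffer);
    ggml_free(ctx);
}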
@@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
109
189
  if (block->size >= size) {
110
190
  best_fit_block = alloc->n_free_blocks - 1;
111
191
  } else {
112
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
113
- __func__, tensor->name, size, max_avail);
192
+ // this should never happen
193
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
194
+ __func__, size, max_avail);
114
195
  GGML_ASSERT(!"not enough space in the buffer");
115
- return;
196
+ GGML_UNREACHABLE();
116
197
  }
117
198
  }
118
199
 
119
200
  struct free_block * block = &alloc->free_blocks[best_fit_block];
120
- void * addr = block->addr;
121
- block->addr = (char*)block->addr + size;
201
+ size_t offset = block->offset;
202
+ block->offset = offset + size;
122
203
  block->size -= size;
123
204
  if (block->size == 0) {
124
205
  // remove block if empty
@@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
128
209
  }
129
210
  }
130
211
 
131
- AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
132
-
133
- tensor->data = addr;
134
- tensor->buffer = alloc->buffer;
135
- if (!alloc->measure) {
136
- ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
137
- }
212
+ AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
138
213
 
139
214
  #ifdef GGML_ALLOCATOR_DEBUG
140
- add_allocated_tensor(alloc, tensor);
141
- size_t cur_max = (char*)addr - (char*)alloc->base + size;
215
+ add_allocated_tensor(alloc, offset, tensor);
216
+ size_t cur_max = offset + size;
142
217
  if (cur_max > alloc->max_size) {
143
- printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
218
+ // sort allocated_tensors by offset
144
219
  for (int i = 0; i < 1024; i++) {
145
- if (alloc->allocated_tensors[i]) {
146
- printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
220
+ for (int j = i + 1; j < 1024; j++) {
221
+ if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
222
+ const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
223
+ size_t tmp_offset = alloc->allocated_tensors[i].offset;
224
+ alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
225
+ alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
226
+ alloc->allocated_tensors[j].tensor = tmp_tensor;
227
+ alloc->allocated_tensors[j].offset = tmp_offset;
228
+ }
147
229
  }
148
230
  }
149
- printf("\n");
231
+ fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
232
+ for (int i = 0; i < 1024; i++) {
233
+ if (alloc->allocated_tensors[i].tensor) {
234
+ fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
235
+ alloc->allocated_tensors[i].offset,
236
+ alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
237
+ ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
238
+ }
239
+ }
240
+ fprintf(stderr, "\n");
150
241
  }
151
242
  #endif
152
243
 
153
- alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
154
- }
244
+ alloc->max_size = MAX(alloc->max_size, offset + size);
155
245
 
156
- // this is a very naive implementation, but for our case the number of free blocks should be very small
157
- static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
158
- if (ggml_tallocr_is_own(alloc, tensor) == false) {
159
- // the tensor was not allocated in this buffer
160
- // this can happen because the graph allocator will try to free weights and other tensors from different buffers
161
- // the easiest way to deal with this is just to ignore it
162
- // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
163
- return;
164
- }
246
+ return offset;
165
247
 
166
- void * ptr = tensor->data;
248
+ GGML_UNUSED(tensor);
249
+ }
167
250
 
168
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
251
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
252
+ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
169
253
  size = aligned_offset(NULL, size, alloc->alignment);
170
- AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
254
+
255
+ AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
171
256
 
172
257
  #ifdef GGML_ALLOCATOR_DEBUG
173
- remove_allocated_tensor(alloc, tensor);
258
+ remove_allocated_tensor(alloc, offset, tensor);
174
259
  #endif
175
260
 
176
261
  // see if we can merge with an existing block
177
262
  for (int i = 0; i < alloc->n_free_blocks; i++) {
178
263
  struct free_block * block = &alloc->free_blocks[i];
179
264
  // check if ptr is at the end of the block
180
- if ((char*)block->addr + block->size == ptr) {
265
+ if (block->offset + block->size == offset) {
181
266
  block->size += size;
182
267
  // check if we can merge with the next block
183
- if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
268
+ if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
184
269
  block->size += alloc->free_blocks[i+1].size;
185
270
  alloc->n_free_blocks--;
186
271
  for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
190
275
  return;
191
276
  }
192
277
  // check if ptr is at the beginning of the block
193
- if ((char*)ptr + size == block->addr) {
194
- block->addr = ptr;
278
+ if (offset + size == block->offset) {
279
+ block->offset = offset;
195
280
  block->size += size;
196
281
  // check if we can merge with the previous block
197
- if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
282
+ if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
198
283
  alloc->free_blocks[i-1].size += block->size;
199
284
  alloc->n_free_blocks--;
200
285
  for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
208
293
  GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
209
294
  // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
210
295
  int insert_pos = 0;
211
- while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
296
+ while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
212
297
  insert_pos++;
213
298
  }
214
299
  // shift all blocks from insert_pos onward to make room for the new block
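Context note: the hunks above switch the dynamic allocator's free list from raw addresses to buffer offsets, keeping the blocks sorted by offset so adjacent ranges can be merged cheaply. A standalone sketch of that idea (not the package's code): best-fit allocation plus a free that extends a touching neighbour or inserts a new block in sorted order. The real code additionally coalesces both neighbours when possible and drops blocks that shrink to zero.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define SKETCH_MAX_FREE_BLOCKS 64

struct sketch_block { size_t offset; size_t size; };

struct sketch_tallocr {
    int n_free_blocks;
    struct sketch_block free_blocks[SKETCH_MAX_FREE_BLOCKS];
};

// start with one free block covering the whole (virtual) buffer
static void sketch_reset(struct sketch_tallocr * a, size_t capacity) {
    a->n_free_blocks = 1;
    a->free_blocks[0].offset = 0;
    a->free_blocks[0].size   = capacity;
}

// best-fit: pick the smallest free block that can hold `size`
static size_t sketch_alloc(struct sketch_tallocr * a, size_t size) {
    int best = -1;
    size_t best_size = SIZE_MAX;
    for (int i = 0; i < a->n_free_blocks; i++) {
        if (a->free_blocks[i].size >= size && a->free_blocks[i].size < best_size) {
            best = i;
            best_size = a->free_blocks[i].size;
        }
    }
    assert(best >= 0 && "out of space");
    size_t offset = a->free_blocks[best].offset;
    a->free_blocks[best].offset += size;
    a->free_blocks[best].size   -= size;
    return offset;
}

// free: extend an adjacent block if the ranges touch, otherwise insert a new
// block, keeping the array sorted by offset so future merges stay cheap
static void sketch_free(struct sketch_tallocr * a, size_t offset, size_t size) {
    for (int i = 0; i < a->n_free_blocks; i++) {
        struct sketch_block * b = &a->free_blocks[i];
        if (b->offset + b->size == offset) { b->size += size; return; }
        if (offset + size == b->offset)    { b->offset = offset; b->size += size; return; }
    }
    assert(a->n_free_blocks < SKETCH_MAX_FREE_BLOCKS);
    int pos = 0;
    while (pos < a->n_free_blocks && a->free_blocks[pos].offset < offset) pos++;
    for (int i = a->n_free_blocks; i > pos; i--) a->free_blocks[i] = a->free_blocks[i-1];
    a->free_blocks[pos].offset = offset;
    a->free_blocks[pos].size   = size;
    a->n_free_blocks++;
}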
@@ -216,565 +301,585 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
216
301
  alloc->free_blocks[i] = alloc->free_blocks[i-1];
217
302
  }
218
303
  // insert the new block
219
- alloc->free_blocks[insert_pos].addr = ptr;
304
+ alloc->free_blocks[insert_pos].offset = offset;
220
305
  alloc->free_blocks[insert_pos].size = size;
221
306
  alloc->n_free_blocks++;
307
+
308
+ GGML_UNUSED(tensor);
222
309
  }
223
310
 
224
- void ggml_tallocr_reset(ggml_tallocr_t alloc) {
311
+ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
225
312
  alloc->n_free_blocks = 1;
226
- size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
227
- alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
228
-
229
- if (alloc->measure) {
230
- alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
231
- } else {
232
- alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
233
- ggml_backend_buffer_reset(alloc->buffer);
234
- }
313
+ alloc->free_blocks[0].offset = 0;
314
+ alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
315
+ alloc->max_size = 0;
235
316
  }
236
317
 
237
- ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
238
- struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
239
-
240
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
318
+ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
319
+ struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
241
320
 
242
- *alloc = (struct ggml_tallocr) {
243
- /*.buffer = */ buffer,
244
- /*.buffer_owned = */ true,
245
- /*.base = */ ggml_backend_buffer_get_base(buffer),
321
+ *alloc = (struct ggml_dyn_tallocr) {
246
322
  /*.alignment = */ alignment,
247
323
  /*.n_free_blocks = */ 0,
248
324
  /*.free_blocks = */ {{0}},
249
325
  /*.max_size = */ 0,
250
- /*.measure = */ false,
251
326
  #ifdef GGML_ALLOCATOR_DEBUG
252
- /*.allocated_tensors = */ {0},
327
+ /*.allocated_tensors = */ {{0}},
253
328
  #endif
254
329
  };
255
330
 
256
- ggml_tallocr_reset(alloc);
257
-
258
- return alloc;
259
- }
260
-
261
- ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
262
- ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
263
- alloc->measure = true;
331
+ ggml_dyn_tallocr_reset(alloc);
264
332
 
265
333
  return alloc;
266
334
  }
267
335
 
268
- ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
269
- // create a backend buffer to get the correct tensor allocation sizes
270
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
271
-
272
- // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
273
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
274
- alloc->buffer_owned = true;
275
- alloc->measure = true;
276
- ggml_tallocr_reset(alloc);
277
- return alloc;
336
+ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
337
+ free(alloc);
278
338
  }
279
339
 
280
- ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
281
- return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
340
+ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
341
+ return alloc->max_size;
282
342
  }
283
343
 
284
- ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
285
- // create a backend buffer to get the correct tensor allocation sizes
286
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
287
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
288
- alloc->buffer_owned = true;
289
- return alloc;
290
- }
291
344
 
292
- ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
293
- return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
294
- }
345
+ /////////////////////////////////////
295
346
 
296
- ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
297
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
347
+ // graph allocator
298
348
 
299
- *alloc = (struct ggml_tallocr) {
300
- /*.buffer = */ buffer,
301
- /*.buffer_owned = */ false,
302
- /*.base = */ ggml_backend_buffer_get_base(buffer),
303
- /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
304
- /*.n_free_blocks = */ 0,
305
- /*.free_blocks = */ {{0}},
306
- /*.max_size = */ 0,
307
- /*.measure = */ false,
308
- #ifdef GGML_ALLOCATOR_DEBUG
309
- /*.allocated_tensors = */ {0},
310
- #endif
311
- };
349
+ struct hash_node {
350
+ int n_children;
351
+ int n_views;
352
+ int buffer_id;
353
+ size_t offset; // offset within the buffer
354
+ bool allocated;
355
+ };
312
356
 
313
- ggml_tallocr_reset(alloc);
357
+ //
358
+ struct tensor_alloc {
359
+ size_t offset;
360
+ size_t size_max; // 0 = pre-allocated, unused, or view
361
+ };
314
362
 
315
- return alloc;
316
- }
363
+ struct node_alloc {
364
+ int buffer_id;
365
+ struct tensor_alloc dst;
366
+ struct tensor_alloc src[GGML_MAX_SRC];
367
+ };
317
368
 
318
- struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
319
- return alloc->buffer;
320
- }
369
+ struct ggml_gallocr {
370
+ ggml_backend_buffer_type_t * bufts; // [n_buffers]
371
+ ggml_backend_buffer_t * buffers; // [n_buffers]
372
+ struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
373
+ int n_buffers;
321
374
 
322
- void ggml_tallocr_free(ggml_tallocr_t alloc) {
323
- if (alloc == NULL) {
324
- return;
325
- }
375
+ struct ggml_hash_set hash_set;
376
+ struct hash_node * hash_values; // [hash_set.size]
326
377
 
327
- if (alloc->buffer_owned) {
328
- ggml_backend_buffer_free(alloc->buffer);
329
- }
330
- free(alloc);
331
- }
378
+ struct node_alloc * node_allocs; // [n_nodes]
379
+ int n_nodes;
332
380
 
333
- bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
334
- return alloc->measure;
335
- }
381
+ struct tensor_alloc * leaf_allocs; // [n_leafs]
382
+ int n_leafs;
383
+ };
336
384
 
337
- size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
338
- // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
339
- // to avoid this, we add a 10% margin to the buffer size
340
- return alloc->max_size + alloc->max_size/10;
341
- }
385
+ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
386
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
387
+ GGML_ASSERT(galloc != NULL);
342
388
 
343
- // graph allocator
389
+ galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
390
+ GGML_ASSERT(galloc->bufts != NULL);
344
391
 
345
- struct hash_node {
346
- int n_children;
347
- int n_views;
348
- };
392
+ galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
393
+ GGML_ASSERT(galloc->buffers != NULL);
349
394
 
350
- struct ggml_gallocr {
351
- ggml_tallocr_t talloc;
352
- struct ggml_hash_set hash_set;
353
- struct hash_node * hash_values;
354
- size_t hash_values_size;
355
- ggml_tallocr_t * hash_allocs;
356
- int * parse_seq;
357
- int parse_seq_len;
358
- };
395
+ galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
396
+ GGML_ASSERT(galloc->buf_tallocs != NULL);
359
397
 
360
- ggml_gallocr_t ggml_gallocr_new(void) {
361
- ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
362
-
363
- *galloc = (struct ggml_gallocr) {
364
- /*.talloc = */ NULL,
365
- /*.hash_set = */ {0},
366
- /*.hash_values = */ NULL,
367
- /*.hash_values_size = */ 0,
368
- /*.hash_allocs = */ NULL,
369
- /*.parse_seq = */ NULL,
370
- /*.parse_seq_len = */ 0,
371
- };
398
+ for (int i = 0; i < n_bufs; i++) {
399
+ galloc->bufts[i] = bufts[i];
400
+ galloc->buffers[i] = NULL;
401
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
402
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
403
+ }
404
+ galloc->n_buffers = n_bufs;
372
405
 
373
406
  return galloc;
374
407
  }
375
408
 
409
+ ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
410
+ return ggml_gallocr_new_n(&buft, 1);
411
+ }
412
+
376
413
  void ggml_gallocr_free(ggml_gallocr_t galloc) {
377
414
  if (galloc == NULL) {
378
415
  return;
379
416
  }
380
417
 
381
- if (galloc->hash_set.keys != NULL) {
382
- free(galloc->hash_set.keys);
383
- }
384
- if (galloc->hash_values != NULL) {
385
- free(galloc->hash_values);
386
- }
387
- if (galloc->hash_allocs != NULL) {
388
- free(galloc->hash_allocs);
389
- }
390
- if (galloc->parse_seq != NULL) {
391
- free(galloc->parse_seq);
418
+ for (int i = 0; i < galloc->n_buffers; i++) {
419
+ if (galloc->buffers != NULL) {
420
+ ggml_backend_buffer_free(galloc->buffers[i]);
421
+ }
422
+ if (galloc->buf_tallocs != NULL) {
423
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
424
+ }
392
425
  }
426
+
427
+ free(galloc->hash_set.keys);
428
+ free(galloc->hash_values);
429
+ free(galloc->bufts);
430
+ free(galloc->buffers);
431
+ free(galloc->buf_tallocs);
432
+ free(galloc->node_allocs);
433
+ free(galloc->leaf_allocs);
393
434
  free(galloc);
394
435
  }
395
436
 
396
- void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
397
- free(galloc->parse_seq);
398
- galloc->parse_seq = malloc(sizeof(int) * n);
437
+ typedef struct ggml_gallocr * ggml_gallocr_t;
399
438
 
400
- for (int i = 0; i < n; i++) {
401
- galloc->parse_seq[i] = list[i];
402
- }
403
- galloc->parse_seq_len = n;
404
- }
405
-
406
- static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
439
+ static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
407
440
  size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
408
441
  return &galloc->hash_values[i];
409
442
  }
410
443
 
411
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
412
- if (a->type != b->type) {
413
- return false;
414
- }
415
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
416
- if (a->ne[i] != b->ne[i]) {
417
- return false;
418
- }
419
- if (a->nb[i] != b->nb[i]) {
420
- return false;
421
- }
422
- }
423
- return true;
444
+ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
445
+ return ggml_gallocr_hash_get(galloc, t)->allocated;
424
446
  }
425
447
 
426
- static bool ggml_op_can_inplace(enum ggml_op op) {
427
- switch (op) {
428
- case GGML_OP_SCALE:
429
- case GGML_OP_DIAG_MASK_ZERO:
430
- case GGML_OP_DIAG_MASK_INF:
431
- case GGML_OP_ADD:
432
- case GGML_OP_ADD1:
433
- case GGML_OP_SUB:
434
- case GGML_OP_MUL:
435
- case GGML_OP_DIV:
436
- case GGML_OP_SQR:
437
- case GGML_OP_SQRT:
438
- case GGML_OP_LOG:
439
- case GGML_OP_UNARY:
440
- case GGML_OP_ROPE:
441
- case GGML_OP_RMS_NORM:
442
- case GGML_OP_SOFT_MAX:
443
- return true;
444
-
445
- default:
446
- return false;
447
- }
448
+ static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
449
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
450
+ hn->buffer_id = buffer_id;
451
+ hn->offset = offset;
452
+ hn->allocated = true;
448
453
  }
449
454
 
450
- static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
451
- if (galloc->talloc != NULL) {
452
- return galloc->talloc;
453
- }
454
-
455
- return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
455
+ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
456
+ return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
456
457
  }
457
458
 
458
- static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
459
- ggml_tallocr_t alloc = node_tallocr(galloc, view);
459
+ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
460
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
460
461
 
461
- GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
462
- if (update_backend) {
463
- view->backend = view->view_src->backend;
464
- }
465
- // views are initialized in the alloc buffer rather than the view_src buffer
466
- view->buffer = alloc->buffer;
467
- view->data = (char *)view->view_src->data + view->view_offs;
462
+ if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
463
+ hn->allocated = true;
464
+ assert(hn->offset == 0);
468
465
 
469
- assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
470
-
471
- if (!alloc->measure) {
472
- ggml_backend_buffer_init_tensor(alloc->buffer, view);
473
- }
474
- }
466
+ // try to reuse a parent's buffer (inplace)
467
+ if (ggml_op_can_inplace(node->op)) {
468
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
469
+ struct ggml_tensor * parent = node->src[i];
470
+ if (parent == NULL) {
471
+ continue;
472
+ }
475
473
 
476
- static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
477
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
474
+ // if the node's data is external, then we cannot re-use it
475
+ if (!ggml_gallocr_is_own(galloc, parent)) {
476
+ AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
477
+ continue;
478
+ }
478
479
 
479
- if (node->data == NULL) {
480
- if (ggml_is_view(node)) {
481
- init_view(galloc, node, true);
482
- } else {
483
- // see if we can reuse a parent's buffer (inplace)
484
- if (ggml_op_can_inplace(node->op)) {
485
- for (int i = 0; i < GGML_MAX_SRC; i++) {
486
- struct ggml_tensor * parent = node->src[i];
487
- if (parent == NULL) {
488
- break;
489
- }
480
+ // outputs cannot be reused
481
+ if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
482
+ AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
483
+ continue;
484
+ }
490
485
 
491
- // if the node's data is external, then we cannot re-use it
492
- if (ggml_tallocr_is_own(alloc, parent) == false) {
493
- AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
494
- continue;
495
- }
486
+ if (!ggml_are_same_layout(node, parent)) {
487
+ AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
488
+ continue;
489
+ }
496
490
 
497
- struct hash_node * p_hn = hash_get(galloc, parent);
498
- if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
499
- if (ggml_is_view(parent)) {
500
- struct ggml_tensor * view_src = parent->view_src;
501
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
502
- if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
503
- // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
504
- // the parent's data that it will need later (same layout requirement). the problem is that then
505
- // we cannot free the tensor because the original address of the allocation is lost.
506
- // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
507
- // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
508
- AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
509
- node->view_src = view_src;
510
- view_src_hn->n_views += 1;
511
- init_view(galloc, node, false);
512
- return;
513
- }
514
- } else {
515
- AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
516
- node->view_src = parent;
517
- p_hn->n_views += 1;
518
- init_view(galloc, node, false);
491
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
492
+ if (p_hn->n_children == 1 && p_hn->n_views == 0) {
493
+ if (ggml_is_view(parent)) {
494
+ struct ggml_tensor * view_src = parent->view_src;
495
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
496
+ if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
497
+ AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
498
+ assert(view_src_hn->offset == p_hn->offset);
499
+ hn->buffer_id = p_hn->buffer_id;
500
+ hn->offset = p_hn->offset;
501
+ p_hn->allocated = false; // avoid freeing the parent
502
+ view_src_hn->allocated = false;
519
503
  return;
520
504
  }
505
+ } else {
506
+ AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
507
+ hn->buffer_id = p_hn->buffer_id;
508
+ hn->offset = p_hn->offset;
509
+ p_hn->allocated = false; // avoid freeing the parent
510
+ return;
521
511
  }
522
512
  }
523
513
  }
524
- ggml_tallocr_alloc(alloc, node);
525
514
  }
515
+ // allocate tensor from the buffer
516
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
517
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
518
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
519
+ size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
520
+ hn->buffer_id = buffer_id;
521
+ hn->offset = offset;
522
+ return;
526
523
  }
527
524
  }
528
525
 
529
- static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
530
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
526
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
527
+ // graph outputs are never freed
528
+ if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
529
+ AT_PRINTF("not freeing output %s\n", node->name);
530
+ return;
531
+ }
532
+
533
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
534
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
535
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
536
+ size_t offset = hn->offset;
537
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
538
+ ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
539
+ hn->allocated = false;
540
+ }
531
541
 
532
- ggml_tallocr_free_tensor(alloc, node);
542
+ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
543
+ return node_buffer_ids ? node_buffer_ids[i] : 0;
533
544
  }
534
545
 
535
- static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
536
- const int * parse_seq = galloc->parse_seq;
537
- int parse_seq_len = galloc->parse_seq_len;
546
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
547
+ // clear hash tables
548
+ memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
549
+ memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
538
550
 
539
551
  // count number of children and views
540
- for (int i = 0; i < gf->n_nodes; i++) {
541
- struct ggml_tensor * node = gf->nodes[i];
552
+ // allocate all graph inputs and leafs first to avoid overwriting them
553
+ for (int i = 0; i < graph->n_nodes; i++) {
554
+ struct ggml_tensor * node = graph->nodes[i];
542
555
 
543
556
  if (ggml_is_view(node)) {
544
557
  struct ggml_tensor * view_src = node->view_src;
545
- hash_get(galloc, view_src)->n_views += 1;
546
- if (node->buffer == NULL && node->data != NULL) {
547
- // view of a pre-allocated tensor, didn't call init_view() yet
548
- init_view(galloc, node, true);
549
- }
558
+ ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
559
+ }
560
+
561
+ if (node->flags & GGML_TENSOR_FLAG_INPUT) {
562
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
550
563
  }
551
564
 
552
565
  for (int j = 0; j < GGML_MAX_SRC; j++) {
553
- struct ggml_tensor * parent = node->src[j];
554
- if (parent == NULL) {
555
- break;
566
+ struct ggml_tensor * src = node->src[j];
567
+ if (src == NULL) {
568
+ continue;
556
569
  }
557
- hash_get(galloc, parent)->n_children += 1;
558
- if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
559
- init_view(galloc, parent, true);
570
+
571
+ ggml_gallocr_hash_get(galloc, src)->n_children += 1;
572
+
573
+ // allocate explicit inputs and leafs
574
+ if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
575
+ ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
560
576
  }
561
577
  }
562
- }
578
+ }
579
+
580
+ // allocate the remaining leafs that are unused on the graph
581
+ // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
582
+ for (int i = 0; i < graph->n_leafs; i++) {
583
+ struct ggml_tensor * leaf = graph->leafs[i];
584
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
585
+
586
+ if (hn->n_children == 0) {
587
+ assert(!hn->allocated);
588
+ // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
589
+ ggml_gallocr_allocate_node(galloc, leaf, 0);
590
+ }
591
+ }
563
592
 
564
593
  // allocate tensors
565
- // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
566
- int last_barrier_pos = 0;
567
- int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
568
-
569
- for (int ind = 0; ind < n_nodes; ind++) {
570
- // allocate a node if there is no parse_seq or this is not a barrier
571
- if (parse_seq_len == 0 || parse_seq[ind] != -1) {
572
- int i = parse_seq_len ? parse_seq[ind] : ind;
573
- struct ggml_tensor * node = gf->nodes[i];
574
-
575
- // allocate parents (leafs)
576
- for (int j = 0; j < GGML_MAX_SRC; j++) {
577
- struct ggml_tensor * parent = node->src[j];
578
- if (parent == NULL) {
579
- break;
580
- }
581
- allocate_node(galloc, parent);
594
+ for (int i = 0; i < graph->n_nodes; i++) {
595
+ struct ggml_tensor * node = graph->nodes[i];
596
+ int buffer_id = get_node_buffer_id(node_buffer_ids, i);
597
+
598
+ // allocate parents (only leafs need to be allocated at this point)
599
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
600
+ struct ggml_tensor * parent = node->src[j];
601
+ if (parent == NULL) {
602
+ continue;
582
603
  }
604
+ ggml_gallocr_allocate_node(galloc, parent, buffer_id);
605
+ }
583
606
 
584
- // allocate node
585
- allocate_node(galloc, node);
607
+ // allocate node
608
+ ggml_gallocr_allocate_node(galloc, node, buffer_id);
586
609
 
587
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
588
- for (int j = 0; j < GGML_MAX_SRC; j++) {
589
- struct ggml_tensor * parent = node->src[j];
590
- if (parent == NULL) {
591
- break;
592
- }
593
- AT_PRINTF("%s", parent->name);
594
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
595
- AT_PRINTF(", ");
596
- }
610
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
611
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
612
+ struct ggml_tensor * parent = node->src[j];
613
+ if (parent == NULL) {
614
+ continue;
615
+ }
616
+ AT_PRINTF("%s", parent->name);
617
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
618
+ AT_PRINTF(", ");
597
619
  }
598
- AT_PRINTF("\n");
599
620
  }
621
+ AT_PRINTF("\n");
600
622
 
601
623
  // update parents
602
- // update immediately if there is no parse_seq
603
- // update only at barriers if there is parse_seq
604
- if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
605
- int update_start = parse_seq_len ? last_barrier_pos : ind;
606
- int update_end = parse_seq_len ? ind : ind + 1;
607
- for (int i = update_start; i < update_end; i++) {
608
- int node_i = parse_seq_len ? parse_seq[i] : i;
609
- struct ggml_tensor * node = gf->nodes[node_i];
610
-
611
- for (int j = 0; j < GGML_MAX_SRC; j++) {
612
- struct ggml_tensor * parent = node->src[j];
613
- if (parent == NULL) {
614
- break;
615
- }
616
- struct hash_node * p_hn = hash_get(galloc, parent);
617
- p_hn->n_children -= 1;
618
-
619
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
620
-
621
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
622
- if (ggml_is_view(parent)) {
623
- struct ggml_tensor * view_src = parent->view_src;
624
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
625
- view_src_hn->n_views -= 1;
626
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
627
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
628
- free_node(galloc, view_src);
629
- }
630
- }
631
- else {
632
- free_node(galloc, parent);
633
- }
624
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
625
+ struct ggml_tensor * parent = node->src[j];
626
+ if (parent == NULL) {
627
+ continue;
628
+ }
629
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
630
+ p_hn->n_children -= 1;
631
+
632
+ AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
633
+ parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
634
+
635
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
636
+ if (ggml_is_view(parent)) {
637
+ struct ggml_tensor * view_src = parent->view_src;
638
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
639
+ view_src_hn->n_views -= 1;
640
+ AT_PRINTF("view_src %s: %d children, %d views\n",
641
+ view_src->name, view_src_hn->n_children, view_src_hn->n_views);
642
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
643
+ ggml_gallocr_free_node(galloc, view_src, buffer_id);
634
644
  }
635
645
  }
646
+ else if (p_hn->allocated) {
647
+ ggml_gallocr_free_node(galloc, parent, buffer_id);
648
+ }
636
649
  }
637
650
  AT_PRINTF("\n");
638
- if (parse_seq_len) {
639
- last_barrier_pos = ind + 1;
640
- }
641
651
  }
642
652
  }
643
653
  }
644
654
 
645
- size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
655
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
646
656
  size_t hash_size = graph->visited_hash_table.size;
647
657
 
648
- // check if the hash table is initialized and large enough
658
+ // initialize hash table
649
659
  if (galloc->hash_set.size < hash_size) {
650
- if (galloc->hash_set.keys != NULL) {
651
- free(galloc->hash_set.keys);
652
- }
653
- if (galloc->hash_values != NULL) {
654
- free(galloc->hash_values);
655
- }
656
- galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
660
+ free(galloc->hash_set.keys);
661
+ free(galloc->hash_values);
657
662
  galloc->hash_set.size = hash_size;
658
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
663
+ galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
664
+ galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
665
+ GGML_ASSERT(galloc->hash_set.keys != NULL);
666
+ GGML_ASSERT(galloc->hash_values != NULL);
667
+ } else {
668
+ // reset hash table
669
+ memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
670
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
659
671
  }
660
672
 
661
- // reset hash table
662
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
663
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
664
-
665
- galloc->talloc = talloc;
666
- ggml_tallocr_alloc_graph_impl(galloc, graph);
667
- galloc->talloc = NULL;
668
-
669
- size_t max_size = ggml_tallocr_max_size(talloc);
670
-
671
- return max_size;
672
- }
673
-
674
- void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
675
- const size_t hash_size = hash_set.size;
676
-
677
- GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
673
+ // reset allocators
674
+ for (int i = 0; i < galloc->n_buffers; i++) {
675
+ ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
676
+ }
678
677
 
679
- galloc->talloc = NULL;
678
+ // allocate in hash table
679
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
680
680
 
681
- // alloc hash_values if needed
682
- if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
683
- free(galloc->hash_values);
684
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
685
- galloc->hash_values_size = hash_size;
681
+ // set the node_allocs from the hash table
682
+ if (galloc->n_nodes < graph->n_nodes) {
683
+ free(galloc->node_allocs);
684
+ galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
685
+ GGML_ASSERT(galloc->node_allocs != NULL);
686
686
  }
687
-
688
- // free hash_set.keys if needed
689
- if (galloc->hash_set.keys != NULL) {
690
- free(galloc->hash_set.keys);
687
+ galloc->n_nodes = graph->n_nodes;
688
+ for (int i = 0; i < graph->n_nodes; i++) {
689
+ struct ggml_tensor * node = graph->nodes[i];
690
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
691
+ node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
692
+ if (node->view_src || node->data) {
693
+ node_alloc->dst.offset = SIZE_MAX;
694
+ node_alloc->dst.size_max = 0;
695
+ } else {
696
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
697
+ node_alloc->dst.offset = hn->offset;
698
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
699
+ }
700
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
701
+ struct ggml_tensor * src = node->src[j];
702
+ if (!src || src->view_src || src->data) {
703
+ node_alloc->src[j].offset = SIZE_MAX;
704
+ node_alloc->src[j].size_max = 0;
705
+ } else {
706
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
707
+ node_alloc->src[j].offset = hn->offset;
708
+ node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
709
+ }
710
+ }
711
+ }
712
+ if (galloc->n_leafs < graph->n_leafs) {
713
+ free(galloc->leaf_allocs);
714
+ galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
715
+ GGML_ASSERT(galloc->leaf_allocs != NULL);
716
+ }
717
+ galloc->n_leafs = graph->n_leafs;
718
+ for (int i = 0; i < graph->n_leafs; i++) {
719
+ struct ggml_tensor * leaf = graph->leafs[i];
720
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
721
+ galloc->leaf_allocs[i].offset = hn->offset;
722
+ galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
691
723
  }
692
- galloc->hash_set = hash_set;
693
-
694
- // reset hash values
695
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
696
724
 
697
- galloc->hash_allocs = hash_node_talloc;
725
+ // reallocate buffers if needed
726
+ for (int i = 0; i < galloc->n_buffers; i++) {
727
+ size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
728
+ size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
698
729
 
699
- ggml_tallocr_alloc_graph_impl(galloc, graph);
730
+ if (new_size > cur_size) {
731
+ #ifndef NDEBUG
732
+ fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
733
+ #endif
734
+ ggml_backend_buffer_free(galloc->buffers[i]);
735
+ galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
736
+ if (galloc->buffers[i] == NULL) {
737
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
738
+ return false;
739
+ }
740
+ }
741
+ }
700
742
 
701
- // remove unowned resources
702
- galloc->hash_set.keys = NULL;
703
- galloc->hash_allocs = NULL;
743
+ return true;
704
744
  }
705
745
 
706
- // legacy API wrapper
707
-
708
- struct ggml_allocr {
709
- ggml_tallocr_t talloc;
710
- ggml_gallocr_t galloc;
711
- };
712
-
713
- static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
714
- ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
715
- *alloc = (struct ggml_allocr) {
716
- /*.talloc = */ talloc,
717
- /*.galloc = */ ggml_gallocr_new(),
718
- };
719
- return alloc;
746
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
747
+ return ggml_gallocr_reserve_n(galloc, graph, NULL);
720
748
  }
721
749
 
722
- ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
723
- return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
724
- }
750
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
751
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
725
752
 
726
- ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
727
- return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
753
+ if (node->view_src != NULL) {
754
+ if (node->buffer == NULL) {
755
+ assert(tensor_alloc->offset == SIZE_MAX);
756
+ if (node->view_src->buffer == NULL) {
757
+ // this tensor was allocated without ggml-backend
758
+ return;
759
+ }
760
+ ggml_backend_view_init(galloc->buffers[buffer_id], node);
761
+ }
762
+ } else {
763
+ if (node->data == NULL) {
764
+ assert(tensor_alloc->offset != SIZE_MAX);
765
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
766
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
767
+ void * addr = (char *)base + tensor_alloc->offset;
768
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
769
+ } else {
770
+ if (node->buffer == NULL) {
771
+ // this tensor was allocated without ggml-backend
772
+ return;
773
+ }
774
+ }
775
+ }
728
776
  }
729
777
 
730
- ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
731
- return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
778
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
779
+ ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
780
+ size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
781
+ return talloc->size_max >= node_size;
732
782
  }
733
783
 
734
- ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
735
- return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
736
- }
784
+ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
785
+ if (galloc->n_nodes != graph->n_nodes) {
786
+ #ifndef NDEBUG
787
+ fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
788
+ #endif
789
+ return true;
790
+ }
737
791
 
738
- ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
739
- return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
740
- }
792
+ if (galloc->n_leafs != graph->n_leafs) {
793
+ #ifndef NDEBUG
794
+ fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
795
+ #endif
796
+ return true;
797
+ }
741
798
 
742
- struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
743
- return ggml_tallocr_get_buffer(alloc->talloc);
744
- }
799
+ for (int i = 0; i < graph->n_nodes; i++) {
800
+ struct ggml_tensor * node = graph->nodes[i];
801
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
745
802
 
746
- void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
747
- ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
748
- }
803
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
804
+ #ifndef NDEBUG
805
+ fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
806
+ #endif
807
+ return true;
808
+ }
749
809
 
750
- void ggml_allocr_free(ggml_allocr_t alloc) {
751
- if (alloc == NULL) {
752
- return;
810
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
811
+ struct ggml_tensor * src = node->src[j];
812
+ if (src == NULL) {
813
+ continue;
814
+ }
815
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
816
+ #ifndef NDEBUG
817
+ fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
818
+ #endif
819
+ return true;
820
+ }
821
+ }
753
822
  }
754
823
 
755
- ggml_gallocr_free(alloc->galloc);
756
- ggml_tallocr_free(alloc->talloc);
757
- free(alloc);
824
+ return false;
758
825
  }
759
826
 
760
- bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
761
- return ggml_tallocr_is_measure(alloc->talloc);
762
- }
827
+ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
828
+ if (ggml_gallocr_needs_realloc(galloc, graph)) {
829
+ if (galloc->n_buffers == 1) {
830
+ #ifndef NDEBUG
831
+ fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
832
+ #endif
833
+ if (!ggml_gallocr_reserve(galloc, graph)) {
834
+ return false;
835
+ }
836
+ } else {
837
+ #ifndef NDEBUG
838
+ fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
839
+ #endif
840
+ return false;
841
+ }
842
+ }
763
843
 
764
- void ggml_allocr_reset(ggml_allocr_t alloc) {
765
- ggml_tallocr_reset(alloc->talloc);
766
- }
844
+ // reset buffers
845
+ for (int i = 0; i < galloc->n_buffers; i++) {
846
+ // zero size buffers are not allocated
847
+ if (galloc->buffers[i] != NULL) {
848
+ ggml_backend_buffer_reset(galloc->buffers[i]);
849
+ }
850
+ }
767
851
 
768
- void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
769
- ggml_tallocr_alloc(alloc->talloc, tensor);
770
- }
852
+ // allocate the graph tensors from the previous assignments
853
+ // nodes
854
+ for (int i = 0; i < graph->n_nodes; i++) {
855
+ struct ggml_tensor * node = graph->nodes[i];
856
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
857
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
858
+ struct ggml_tensor * src = node->src[j];
859
+ if (src == NULL) {
860
+ continue;
861
+ }
862
+ ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
863
+ }
864
+ ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
865
+ }
866
+ // leafs
867
+ for (int i = 0; i < graph->n_leafs; i++) {
868
+ struct ggml_tensor * leaf = graph->leafs[i];
869
+ struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
870
+ ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
871
+ }
771
872
 
772
- size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
773
- return ggml_tallocr_max_size(alloc->talloc);
873
+ return true;
774
874
  }
775
875
 
776
- size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
777
- return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
876
+ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
877
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
878
+
879
+ if (galloc->buffers[buffer_id] == NULL) {
880
+ return 0;
881
+ }
882
+ return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
778
883
  }
779
884
 
780
885
  // utils
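Context note: the large hunk above removes the measure-based allocator and the legacy ggml_allocr wrapper, and introduces the graph allocator API (`ggml_gallocr_new`, `ggml_gallocr_reserve`, `ggml_gallocr_alloc_graph`, `ggml_gallocr_get_buffer_size`, `ggml_gallocr_free`). A minimal usage sketch, not part of the package, assuming `ggml_set_input()` / `ggml_set_output()` from the same ggml revision that defines the GGML_TENSOR_FLAG_INPUT/OUTPUT flags used above:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <stdio.h>

static void gallocr_sketch(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data lives in the gallocr-managed buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
    ggml_set_input(x);
    struct ggml_tensor * y = ggml_mul(ctx, x, x);
    struct ggml_tensor * z = ggml_scale(ctx, y, 2.0f);
    ggml_set_output(z);

    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, z);

    // one allocator per buffer type; reserve sizes the buffer, alloc_graph assigns addresses
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_reserve(galloc, graph);      // plan the worst-case buffer size
    ggml_gallocr_alloc_graph(galloc, graph);  // assign tensor data pointers for this graph
    fprintf(stderr, "compute buffer: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    ggml_gallocr_free(galloc);
    ggml_free(ctx);
}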
@@ -795,17 +900,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
795
900
  return false;
796
901
  }
797
902
 
798
- ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
903
+ struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
799
904
 
800
905
  for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
801
906
  if (t->data == NULL) {
802
907
  if (t->view_src == NULL) {
803
908
  ggml_tallocr_alloc(tallocr, t);
804
- } else {
909
+ } else if (t->buffer == NULL) {
805
910
  ggml_backend_view_init(buffer, t);
806
911
  }
807
912
  } else {
808
- if (t->view_src != NULL) {
913
+ if (t->view_src != NULL && t->buffer == NULL) {
809
914
  // view of a pre-allocated tensor
810
915
  ggml_backend_view_init(buffer, t);
811
916
  }
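Context note: the hunk above is the tensor-range allocation path that now uses the buffer-backed `ggml_tallocr_new(buffer)` and only re-initializes views whose buffer is still NULL. A minimal sketch, not part of the package, of the public entry point that exercises it, `ggml_backend_alloc_ctx_tensors_from_buft`:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void ctx_tensors_sketch(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 4,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 64, 64);

    // allocates backing memory for every tensor in ctx from the CPU buffer type;
    // internally this walks tensor ranges with a ggml_tallocr as in the hunk above
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    GGML_ASSERT(buf != NULL);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}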
@@ -838,7 +943,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
838
943
  }
839
944
 
840
945
  if (this_size > max_size) {
841
- // tensor is too large to fit in a single buffer
842
946
  fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
843
947
  __func__, t->name,
844
948
  ggml_backend_buft_name(buft),
@@ -870,7 +974,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
870
974
  }
871
975
 
872
976
  if (n_buffers == 0) {
873
- // all the tensors in the context are already allocated
874
977
  #ifndef NDEBUG
875
978
  fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
876
979
  #endif
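Taken together, these hunks replace the measure/compute-buffer workflow with a graph allocator that owns and resizes its own buffers. A hedged migration sketch (not from the package; exact 0.12.5 call sites may differ), mapping the removed ggml_allocr calls onto the added ggml_gallocr calls:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// `backend` and `graph` are assumed to be set up by the caller.
static void graph_alloc_after_0_12_7(ggml_backend_t backend, struct ggml_cgraph * graph) {
    // 0.12.5-era flow (functions removed in this diff):
    //   ggml_allocr_t measure = ggml_allocr_new_measure_from_backend(backend);
    //   size_t mem_size = ggml_allocr_alloc_graph(measure, graph);
    //   ggml_allocr_free(measure);
    //   ggml_allocr_t alloc = ggml_allocr_new_from_backend(backend, mem_size);
    //   ggml_allocr_alloc_graph(alloc, graph);

    // 0.12.7-era flow: the graph allocator owns its buffers and resizes them as needed
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
    ggml_gallocr_reserve(galloc, graph);     // optional: pre-size from a worst-case graph
    ggml_gallocr_alloc_graph(galloc, graph); // assigns tensor addresses for this graph
    // ... run the graph (e.g. ggml_backend_graph_compute) while the buffer is alive ...
    ggml_gallocr_free(galloc);
}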