llama_cpp 0.12.5 → 0.12.6

@@ -17,6 +17,50 @@
  //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
  #define AT_PRINTF(...)
 
+
+ static bool ggml_is_view(const struct ggml_tensor * t) {
+     return t->view_src != NULL;
+ }
+
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+     if (a->type != b->type) {
+         return false;
+     }
+     for (int i = 0; i < GGML_MAX_DIMS; i++) {
+         if (a->ne[i] != b->ne[i]) {
+             return false;
+         }
+         if (a->nb[i] != b->nb[i]) {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ static bool ggml_op_can_inplace(enum ggml_op op) {
+     switch (op) {
+         case GGML_OP_SCALE:
+         case GGML_OP_DIAG_MASK_ZERO:
+         case GGML_OP_DIAG_MASK_INF:
+         case GGML_OP_ADD:
+         case GGML_OP_ADD1:
+         case GGML_OP_SUB:
+         case GGML_OP_MUL:
+         case GGML_OP_DIV:
+         case GGML_OP_SQR:
+         case GGML_OP_SQRT:
+         case GGML_OP_LOG:
+         case GGML_OP_UNARY:
+         case GGML_OP_ROPE:
+         case GGML_OP_RMS_NORM:
+         case GGML_OP_SOFT_MAX:
+             return true;
+
+         default:
+             return false;
+     }
+ }
+
  // TODO: GGML_PAD ?
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
      assert(alignment && !(alignment & (alignment - 1))); // power of 2
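Note on the hunk above: aligned_offset rounds an offset up to the next multiple of a power-of-two alignment, taking the buffer base pointer into account. A minimal standalone sketch of the same rounding, ignoring the base pointer (illustration only; the round_up name is ours, not from the diff):

#include <assert.h>
#include <stddef.h>

// round `offset` up to the next multiple of `alignment` (must be a power of 2)
static size_t round_up(size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    size_t align = (alignment - (offset % alignment)) % alignment;
    return offset + align; // e.g. round_up(100, 32) == 128, round_up(64, 32) == 64
}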
@@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
      return offset + align;
  }
 
+ // tallocr
+ struct ggml_tallocr {
+     ggml_backend_buffer_t buffer;
+     void * base;
+     size_t alignment;
+     size_t offset;
+ };
+
+ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
+     ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
+     if (talloc == NULL) {
+         return NULL;
+     }
+
+     void * base = ggml_backend_buffer_get_base(buffer);
+     size_t align = ggml_backend_buffer_get_alignment(buffer);
+
+     assert(align && !(align & (align - 1))); // power of 2
+
+     *talloc = (struct ggml_tallocr) {
+         /*.buffer    = */ buffer,
+         /*.base      = */ base,
+         /*.alignment = */ align,
+         /*.offset    = */ aligned_offset(base, 0, align),
+     };
+     return talloc;
+ }
+
+ void ggml_tallocr_free(ggml_tallocr_t talloc) {
+     free(talloc);
+ }
+
+ void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+     size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
+     size = GGML_PAD(size, talloc->alignment);
+
+     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
+         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
+         GGML_ASSERT(!"not enough space in the buffer");
+         return;
+     }
+
+     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
+     talloc->offset += size;
+
+     assert(((uintptr_t)addr % talloc->alignment) == 0);
+
+     ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
+ }
+
+ // dynamic tensor allocator
+
  struct free_block {
-     void * addr;
+     size_t offset;
      size_t size;
  };
 
- struct ggml_tallocr {
-     struct ggml_backend_buffer * buffer;
-     bool buffer_owned;
-     void * base;
+ struct ggml_dyn_tallocr {
      size_t alignment;
-
      int n_free_blocks;
      struct free_block free_blocks[MAX_FREE_BLOCKS];
-
      size_t max_size;
 
-     bool measure;
-
  #ifdef GGML_ALLOCATOR_DEBUG
-     struct ggml_tensor * allocated_tensors[1024];
+     struct {
+         const struct ggml_tensor * tensor;
+         size_t offset;
+     } allocated_tensors[1024];
  #endif
  };
 
  #ifdef GGML_ALLOCATOR_DEBUG
- static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
      for (int i = 0; i < 1024; i++) {
-         if (alloc->allocated_tensors[i] == NULL) {
-             alloc->allocated_tensors[i] = tensor;
+         if (alloc->allocated_tensors[i].tensor == NULL) {
+             alloc->allocated_tensors[i].tensor = tensor;
+             alloc->allocated_tensors[i].offset = offset;
              return;
         }
      }
      GGML_ASSERT(!"out of allocated_tensors");
  }
- static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
      for (int i = 0; i < 1024; i++) {
-         if (alloc->allocated_tensors[i] == tensor ||
-             (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
-             alloc->allocated_tensors[i] = NULL;
+         if (alloc->allocated_tensors[i].offset == offset) {
+             alloc->allocated_tensors[i].tensor = NULL;
              return;
         }
      }
-     printf("tried to free tensor %s not found\n", tensor->name);
+     fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
      GGML_ASSERT(!"tensor not found");
  }
  #endif
 
- // check if a tensor is allocated by this buffer
- static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-     return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
- }
-
- static bool ggml_is_view(struct ggml_tensor * t) {
-     return t->view_src != NULL;
- }
-
- void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
-     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
-     size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
      size = aligned_offset(NULL, size, alloc->alignment);
 
      AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
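The hunk above replaces the old pointer-based ggml_tallocr with a minimal linear (bump) allocator over a backend buffer, while the dynamic free-list allocator that follows works purely in offsets. A hedged usage sketch of the new linear API — buft, buf_size and the tensors are placeholders, not from the diff:

// sketch: linear allocation into a backend buffer with the new tallocr API
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
ggml_tallocr_t talloc = ggml_tallocr_new(buf);   // starts at the aligned base offset
ggml_tallocr_alloc(talloc, tensor_a);            // bump-allocates at the current offset
ggml_tallocr_alloc(talloc, tensor_b);            // lands right after tensor_a, padded to alignment
ggml_tallocr_free(talloc);                       // frees the allocator only, not the buffer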
@@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
          if (block->size >= size) {
              best_fit_block = alloc->n_free_blocks - 1;
          } else {
-             fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
-                     __func__, tensor->name, size, max_avail);
+             // this should never happen
+             fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                     __func__, size, max_avail);
              GGML_ASSERT(!"not enough space in the buffer");
-             return;
+             GGML_UNREACHABLE();
          }
      }
 
      struct free_block * block = &alloc->free_blocks[best_fit_block];
-     void * addr = block->addr;
-     block->addr = (char*)block->addr + size;
+     size_t offset = block->offset;
+     block->offset = offset + size;
      block->size -= size;
      if (block->size == 0) {
          // remove block if empty
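Allocation now carves the best-fit free block with offset arithmetic instead of pointer arithmetic. A worked example of the split shown above (values are illustrative): a free block {offset = 256, size = 1024} serving a 128-byte request yields the allocation at offset 256 and shrinks the block to {offset = 384, size = 896}; a block whose size reaches 0 is removed, as the surrounding lines show.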
@@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
          }
      }
 
-     AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
-
-     tensor->data = addr;
-     tensor->buffer = alloc->buffer;
-     if (!alloc->measure) {
-         ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
-     }
+     AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
 
  #ifdef GGML_ALLOCATOR_DEBUG
-     add_allocated_tensor(alloc, tensor);
-     size_t cur_max = (char*)addr - (char*)alloc->base + size;
+     add_allocated_tensor(alloc, offset, tensor);
+     size_t cur_max = offset + size;
      if (cur_max > alloc->max_size) {
-         printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+         // sort allocated_tensors by offset
+         for (int i = 0; i < 1024; i++) {
+             for (int j = i + 1; j < 1024; j++) {
+                 if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+                     const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
+                     size_t tmp_offset = alloc->allocated_tensors[i].offset;
+                     alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
+                     alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+                     alloc->allocated_tensors[j].tensor = tmp_tensor;
+                     alloc->allocated_tensors[j].offset = tmp_offset;
+                 }
+             }
+         }
+         fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
          for (int i = 0; i < 1024; i++) {
-             if (alloc->allocated_tensors[i]) {
-                 printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+             if (alloc->allocated_tensors[i].tensor) {
+                 fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                         alloc->allocated_tensors[i].offset,
+                         alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                         ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
              }
          }
-         printf("\n");
+         fprintf(stderr, "\n");
      }
  #endif
 
-     alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
- }
+     alloc->max_size = MAX(alloc->max_size, offset + size);
 
- // this is a very naive implementation, but for our case the number of free blocks should be very small
- static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-     if (ggml_tallocr_is_own(alloc, tensor) == false) {
-         // the tensor was not allocated in this buffer
-         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
-         // the easiest way to deal with this is just to ignore it
-         // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
-         return;
-     }
+     return offset;
 
-     void * ptr = tensor->data;
+     GGML_UNUSED(tensor);
+ }
 
-     size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
+ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
      size = aligned_offset(NULL, size, alloc->alignment);
-     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
+
+     AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
 
  #ifdef GGML_ALLOCATOR_DEBUG
-     remove_allocated_tensor(alloc, tensor);
+     remove_allocated_tensor(alloc, offset, tensor);
  #endif
 
      // see if we can merge with an existing block
      for (int i = 0; i < alloc->n_free_blocks; i++) {
          struct free_block * block = &alloc->free_blocks[i];
          // check if ptr is at the end of the block
-         if ((char*)block->addr + block->size == ptr) {
+         if (block->offset + block->size == offset) {
              block->size += size;
              // check if we can merge with the next block
-             if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+             if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
                  block->size += alloc->free_blocks[i+1].size;
                  alloc->n_free_blocks--;
                  for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
              return;
          }
          // check if ptr is at the beginning of the block
-         if ((char*)ptr + size == block->addr) {
-             block->addr = ptr;
+         if (offset + size == block->offset) {
+             block->offset = offset;
              block->size += size;
              // check if we can merge with the previous block
-             if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+             if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
                  alloc->free_blocks[i-1].size += block->size;
                  alloc->n_free_blocks--;
                  for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
      GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
      // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
      int insert_pos = 0;
-     while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+     while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
          insert_pos++;
      }
      // shift all blocks from insert_pos onward to make room for the new block
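The free routine keeps the block array sorted by offset so a freed range can be merged with its neighbors in a single pass. A self-contained miniature of the same coalescing idea (our own demo types, not the ggml structs):

#include <stdio.h>
#include <stddef.h>

struct block { size_t offset, size; };

int main(void) {
    // free list after two allocations were returned out of order:
    struct block fb[8] = { {0, 64}, {128, 64} };
    int n = 2;
    // freeing [64, 128) touches the end of fb[0] and the start of fb[1],
    // so all three ranges collapse into one block, as in the diff above
    size_t offset = 64, size = 64;
    if (fb[0].offset + fb[0].size == offset) {
        fb[0].size += size;
        if (fb[0].offset + fb[0].size == fb[1].offset) {
            fb[0].size += fb[1].size;
            n--;
        }
    }
    printf("blocks: %d, first: [%zu, %zu)\n", n, fb[0].offset, fb[0].offset + fb[0].size);
    return 0; // prints: blocks: 1, first: [0, 192)
}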
@@ -216,337 +301,271 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
          alloc->free_blocks[i] = alloc->free_blocks[i-1];
      }
      // insert the new block
-     alloc->free_blocks[insert_pos].addr = ptr;
+     alloc->free_blocks[insert_pos].offset = offset;
      alloc->free_blocks[insert_pos].size = size;
      alloc->n_free_blocks++;
+
+     GGML_UNUSED(tensor);
  }
 
- void ggml_tallocr_reset(ggml_tallocr_t alloc) {
+ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
      alloc->n_free_blocks = 1;
-     size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
-     alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
-
-     if (alloc->measure) {
-         alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-     } else {
-         alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
-         ggml_backend_buffer_reset(alloc->buffer);
-     }
+     alloc->free_blocks[0].offset = 0;
+     alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+     alloc->max_size = 0;
  }
 
- ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-     struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
-
-     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
+ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+     struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
-     *alloc = (struct ggml_tallocr) {
-         /*.buffer       = */ buffer,
-         /*.buffer_owned = */ true,
-         /*.base         = */ ggml_backend_buffer_get_base(buffer),
+     *alloc = (struct ggml_dyn_tallocr) {
          /*.alignment     = */ alignment,
          /*.n_free_blocks = */ 0,
          /*.free_blocks   = */ {{0}},
         /*.max_size      = */ 0,
-         /*.measure       = */ false,
  #ifdef GGML_ALLOCATOR_DEBUG
-         /*.allocated_tensors = */ {0},
+         /*.allocated_tensors = */ {{0}},
  #endif
      };
 
-     ggml_tallocr_reset(alloc);
-
-     return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
-     ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
-     alloc->measure = true;
+     ggml_dyn_tallocr_reset(alloc);
 
      return alloc;
  }
 
- ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
-     // create a backend buffer to get the correct tensor allocation sizes
-     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
-
-     // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
-     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
-     alloc->buffer_owned = true;
-     alloc->measure = true;
-     ggml_tallocr_reset(alloc);
-     return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
-     return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
-     // create a backend buffer to get the correct tensor allocation sizes
-     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
-     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
-     alloc->buffer_owned = true;
-     return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-     return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
-     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
-
-     *alloc = (struct ggml_tallocr) {
-         /*.buffer        = */ buffer,
-         /*.buffer_owned  = */ false,
-         /*.base          = */ ggml_backend_buffer_get_base(buffer),
-         /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
-         /*.n_free_blocks = */ 0,
-         /*.free_blocks   = */ {{0}},
-         /*.max_size      = */ 0,
-         /*.measure       = */ false,
- #ifdef GGML_ALLOCATOR_DEBUG
-         /*.allocated_tensors = */ {0},
- #endif
-     };
-
-     ggml_tallocr_reset(alloc);
-
-     return alloc;
- }
-
- struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
-     return alloc->buffer;
- }
-
- void ggml_tallocr_free(ggml_tallocr_t alloc) {
-     if (alloc == NULL) {
-         return;
-     }
-
-     if (alloc->buffer_owned) {
-         ggml_backend_buffer_free(alloc->buffer);
-     }
+ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
      free(alloc);
  }
 
- bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
-     return alloc->measure;
+ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
+     return alloc->max_size;
  }
 
- size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-     // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
-     // to avoid this, we add a 10% margin to the buffer size
-     return alloc->max_size + alloc->max_size/10;
- }
+
+ /////////////////////////////////////
 
  // graph allocator
 
  struct hash_node {
      int n_children;
      int n_views;
+     int buffer_id;
+     size_t offset; // offset within the buffer
+     bool allocated;
+ };
+
+ //
+ struct tensor_alloc {
+     size_t offset;
+     size_t size_max; // 0 = pre-allocated, unused, or view
+ };
+
+ struct node_alloc {
+     int buffer_id;
+     struct tensor_alloc dst;
+     struct tensor_alloc src[GGML_MAX_SRC];
  };
 
  struct ggml_gallocr {
-     ggml_tallocr_t talloc;
+     ggml_backend_buffer_type_t * bufts; // [n_buffers]
+     ggml_backend_buffer_t * buffers; // [n_buffers]
+     struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
+     int n_buffers;
+
      struct ggml_hash_set hash_set;
-     struct hash_node * hash_values;
-     size_t hash_values_size;
-     ggml_tallocr_t * hash_allocs;
-     int * parse_seq;
-     int parse_seq_len;
+     struct hash_node * hash_values; // [hash_set.size]
+
+     struct node_alloc * node_allocs; // [n_nodes]
+     int n_nodes;
  };
 
- ggml_gallocr_t ggml_gallocr_new(void) {
-     ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
-
-     *galloc = (struct ggml_gallocr) {
-         /*.talloc           = */ NULL,
-         /*.hash_set         = */ {0},
-         /*.hash_values      = */ NULL,
-         /*.hash_values_size = */ 0,
-         /*.hash_allocs      = */ NULL,
-         /*.parse_seq        = */ NULL,
-         /*.parse_seq_len    = */ 0,
-     };
+ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+     ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+     GGML_ASSERT(galloc != NULL);
+
+     galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+     GGML_ASSERT(galloc->bufts != NULL);
+
+     galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+     GGML_ASSERT(galloc->buffers != NULL);
+
+     galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+     GGML_ASSERT(galloc->buf_tallocs != NULL);
+
+     for (int i = 0; i < n_bufs; i++) {
+         galloc->bufts[i] = bufts[i];
+         galloc->buffers[i] = NULL;
+         size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+         galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+     }
+     galloc->n_buffers = n_bufs;
 
      return galloc;
  }
 
+ ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
+     return ggml_gallocr_new_n(&buft, 1);
+ }
+
  void ggml_gallocr_free(ggml_gallocr_t galloc) {
      if (galloc == NULL) {
          return;
      }
 
-     if (galloc->hash_set.keys != NULL) {
-         free(galloc->hash_set.keys);
-     }
-     if (galloc->hash_values != NULL) {
-         free(galloc->hash_values);
-     }
-     if (galloc->hash_allocs != NULL) {
-         free(galloc->hash_allocs);
-     }
-     if (galloc->parse_seq != NULL) {
-         free(galloc->parse_seq);
+     for (int i = 0; i < galloc->n_buffers; i++) {
+         if (galloc->buffers != NULL) {
+             ggml_backend_buffer_free(galloc->buffers[i]);
+         }
+         if (galloc->buf_tallocs != NULL) {
+             ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+         }
      }
+
+     free(galloc->hash_set.keys);
+     free(galloc->hash_values);
+     free(galloc->bufts);
+     free(galloc->buffers);
+     free(galloc->buf_tallocs);
+     free(galloc->node_allocs);
      free(galloc);
  }
 
- void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
-     free(galloc->parse_seq);
-     galloc->parse_seq = malloc(sizeof(int) * n);
+ typedef struct ggml_gallocr * ggml_gallocr_t;
 
-     for (int i = 0; i < n; i++) {
-         galloc->parse_seq[i] = list[i];
-     }
-     galloc->parse_seq_len = n;
- }
-
- static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+ static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
      size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
      return &galloc->hash_values[i];
  }
 
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
-     if (a->type != b->type) {
-         return false;
-     }
-     for (int i = 0; i < GGML_MAX_DIMS; i++) {
-         if (a->ne[i] != b->ne[i]) {
-             return false;
-         }
-         if (a->nb[i] != b->nb[i]) {
-             return false;
-         }
-     }
-     return true;
+ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+     return ggml_gallocr_hash_get(galloc, t)->allocated;
  }
 
- static bool ggml_op_can_inplace(enum ggml_op op) {
-     switch (op) {
-         case GGML_OP_SCALE:
-         case GGML_OP_DIAG_MASK_ZERO:
-         case GGML_OP_DIAG_MASK_INF:
-         case GGML_OP_ADD:
-         case GGML_OP_ADD1:
-         case GGML_OP_SUB:
-         case GGML_OP_MUL:
-         case GGML_OP_DIV:
-         case GGML_OP_SQR:
-         case GGML_OP_SQRT:
-         case GGML_OP_LOG:
-         case GGML_OP_UNARY:
-         case GGML_OP_ROPE:
-         case GGML_OP_RMS_NORM:
-         case GGML_OP_SOFT_MAX:
-             return true;
-
-         default:
-             return false;
-     }
+ static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
+     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+     hn->buffer_id = buffer_id;
+     hn->offset = offset;
+     hn->allocated = true;
  }
 
- static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-     if (galloc->talloc != NULL) {
-         return galloc->talloc;
-     }
-
-     return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
+ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+     return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
  }
 
- static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
-     ggml_tallocr_t alloc = node_tallocr(galloc, view);
-
-     GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-     if (update_backend) {
-         view->backend = view->view_src->backend;
-     }
-     // views are initialized in the alloc buffer rather than the view_src buffer
-     view->buffer = alloc->buffer;
-     view->data = (char *)view->view_src->data + view->view_offs;
+ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 
-     assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
+     if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
+         hn->allocated = true;
+         assert(hn->offset == 0);
 
-     if (!alloc->measure) {
-         ggml_backend_buffer_init_tensor(alloc->buffer, view);
-     }
- }
+         // try to reuse a parent's buffer (inplace)
+         if (ggml_op_can_inplace(node->op)) {
+             for (int i = 0; i < GGML_MAX_SRC; i++) {
+                 struct ggml_tensor * parent = node->src[i];
+                 if (parent == NULL) {
+                     break;
+                 }
 
- static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-     ggml_tallocr_t alloc = node_tallocr(galloc, node);
+                 // if the node's data is external, then we cannot re-use it
+                 if (!ggml_gallocr_is_own(galloc, parent)) {
+                     AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                     continue;
+                 }
 
-     if (node->data == NULL) {
-         if (ggml_is_view(node)) {
-             init_view(galloc, node, true);
-         } else {
-             // see if we can reuse a parent's buffer (inplace)
-             if (ggml_op_can_inplace(node->op)) {
-                 for (int i = 0; i < GGML_MAX_SRC; i++) {
-                     struct ggml_tensor * parent = node->src[i];
-                     if (parent == NULL) {
-                         break;
-                     }
+                 // outputs cannot be reused
+                 if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
+                     AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
+                     continue;
+                 }
 
-                     // if the node's data is external, then we cannot re-use it
-                     if (ggml_tallocr_is_own(alloc, parent) == false) {
-                         AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
-                         continue;
-                     }
+                 if (!ggml_are_same_layout(node, parent)) {
+                     AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
+                     continue;
+                 }
 
-                     struct hash_node * p_hn = hash_get(galloc, parent);
-                     if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
-                         if (ggml_is_view(parent)) {
-                             struct ggml_tensor * view_src = parent->view_src;
-                             struct hash_node * view_src_hn = hash_get(galloc, view_src);
-                             if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
-                                 // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
-                                 // the parent's data that it will need later (same layout requirement). the problem is that then
-                                 // we cannot free the tensor because the original address of the allocation is lost.
-                                 // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
-                                 // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
-                                 AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                                 node->view_src = view_src;
-                                 view_src_hn->n_views += 1;
-                                 init_view(galloc, node, false);
-                                 return;
-                             }
-                         } else {
-                             AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                             node->view_src = parent;
-                             p_hn->n_views += 1;
-                             init_view(galloc, node, false);
+                 struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+                 if (p_hn->n_children == 1 && p_hn->n_views == 0) {
+                     if (ggml_is_view(parent)) {
+                         struct ggml_tensor * view_src = parent->view_src;
+                         struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+                         if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                             AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                             assert(view_src_hn->offset == p_hn->offset);
+                             hn->buffer_id = p_hn->buffer_id;
+                             hn->offset = p_hn->offset;
+                             p_hn->allocated = false; // avoid freeing the parent
+                             view_src_hn->allocated = false;
                              return;
                          }
+                     } else {
+                         AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                         hn->buffer_id = p_hn->buffer_id;
+                         hn->offset = p_hn->offset;
+                         p_hn->allocated = false; // avoid freeing the parent
+                         return;
                      }
                  }
              }
-             ggml_tallocr_alloc(alloc, node);
          }
+         // allocate tensor from the buffer
+         struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+         ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+         size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+         size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
+         hn->buffer_id = buffer_id;
+         hn->offset = offset;
+         return;
      }
  }
 
- static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-     ggml_tallocr_t alloc = node_tallocr(galloc, node);
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+     // graph outputs are never freed
+     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+         AT_PRINTF("not freeing output %s\n", node->name);
+         return;
+     }
 
-     ggml_tallocr_free_tensor(alloc, node);
+     struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+     ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+     size_t offset = hn->offset;
+     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+     hn->allocated = false;
  }
 
- static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
-     const int * parse_seq = galloc->parse_seq;
-     int parse_seq_len = galloc->parse_seq_len;
+ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
+     return node_buffer_ids ? node_buffer_ids[i] : 0;
+ }
+
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+     // clear hash tables
+     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
+     memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
+
+     // allocate all graph inputs first to avoid overwriting them
+     for (int i = 0; i < graph->n_nodes; i++) {
+         if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
+             ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+         }
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             if (graph->nodes[i]->src[j] == NULL) {
+                 break;
+             }
+             if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
+                 ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
+             }
+         }
+     }
 
      // count number of children and views
-     for (int i = 0; i < gf->n_nodes; i++) {
-         struct ggml_tensor * node = gf->nodes[i];
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
 
          if (ggml_is_view(node)) {
              struct ggml_tensor * view_src = node->view_src;
-             hash_get(galloc, view_src)->n_views += 1;
-             if (node->buffer == NULL && node->data != NULL) {
-                 // view of a pre-allocated tensor, didn't call init_view() yet
-                 init_view(galloc, node, true);
-             }
+             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
          }
 
          for (int j = 0; j < GGML_MAX_SRC; j++) {
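Note on the allocator introduced above: the graph allocator is now split into a planning phase (reserve, which sizes the backing buffers with the dynamic per-buffer allocators) and a cheap replay phase (alloc_graph, which assigns the precomputed offsets). A hedged sketch of the intended call sequence — backend and build_graph() are placeholders, not from the diff:

// sketch: plan once with a worst-case graph, then reuse the plan per evaluation
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

struct ggml_cgraph * worst_case = build_graph(/* largest expected shapes */);
ggml_gallocr_reserve(galloc, worst_case);        // allocates/resizes the backing buffers

struct ggml_cgraph * graph = build_graph(/* actual shapes */);
ggml_gallocr_alloc_graph(galloc, graph);         // assigns tensor addresses from saved offsets

// ... evaluate the graph with the backend ...
ggml_gallocr_free(galloc);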
@@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
              if (parent == NULL) {
                  break;
              }
-             hash_get(galloc, parent)->n_children += 1;
-             if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                 init_view(galloc, parent, true);
-             }
+             ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
          }
      }
 
      // allocate tensors
-     // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
-     int last_barrier_pos = 0;
-     int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
-
-     for (int ind = 0; ind < n_nodes; ind++) {
-         // allocate a node if there is no parse_seq or this is not a barrier
-         if (parse_seq_len == 0 || parse_seq[ind] != -1) {
-             int i = parse_seq_len ? parse_seq[ind] : ind;
-             struct ggml_tensor * node = gf->nodes[i];
-
-             // allocate parents (leafs)
-             for (int j = 0; j < GGML_MAX_SRC; j++) {
-                 struct ggml_tensor * parent = node->src[j];
-                 if (parent == NULL) {
-                     break;
-                 }
-                 allocate_node(galloc, parent);
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         int buffer_id = get_node_buffer_id(node_buffer_ids, i);
+
+         // allocate parents (only leafs need to be allocated at this point)
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * parent = node->src[j];
+             if (parent == NULL) {
+                 break;
              }
+             ggml_gallocr_allocate_node(galloc, parent, buffer_id);
+         }
 
-             // allocate node
-             allocate_node(galloc, node);
+         // allocate node
+         ggml_gallocr_allocate_node(galloc, node, buffer_id);
 
-             AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-             for (int j = 0; j < GGML_MAX_SRC; j++) {
-                 struct ggml_tensor * parent = node->src[j];
-                 if (parent == NULL) {
-                     break;
-                 }
-                 AT_PRINTF("%s", parent->name);
-                 if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                     AT_PRINTF(", ");
-                 }
+         AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * parent = node->src[j];
+             if (parent == NULL) {
+                 break;
+             }
+             AT_PRINTF("%s", parent->name);
+             if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                 AT_PRINTF(", ");
              }
-             AT_PRINTF("\n");
          }
+         AT_PRINTF("\n");
 
          // update parents
-         // update immediately if there is no parse_seq
-         // update only at barriers if there is parse_seq
-         if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
-             int update_start = parse_seq_len ? last_barrier_pos : ind;
-             int update_end = parse_seq_len ? ind : ind + 1;
-             for (int i = update_start; i < update_end; i++) {
-                 int node_i = parse_seq_len ? parse_seq[i] : i;
-                 struct ggml_tensor * node = gf->nodes[node_i];
-
-                 for (int j = 0; j < GGML_MAX_SRC; j++) {
-                     struct ggml_tensor * parent = node->src[j];
-                     if (parent == NULL) {
-                         break;
-                     }
-                     struct hash_node * p_hn = hash_get(galloc, parent);
-                     p_hn->n_children -= 1;
-
-                     //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                     if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                         if (ggml_is_view(parent)) {
-                             struct ggml_tensor * view_src = parent->view_src;
-                             struct hash_node * view_src_hn = hash_get(galloc, view_src);
-                             view_src_hn->n_views -= 1;
-                             AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
-                             if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
-                                 free_node(galloc, view_src);
-                             }
-                         }
-                         else {
-                             free_node(galloc, parent);
-                         }
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * parent = node->src[j];
+             if (parent == NULL) {
+                 break;
+             }
+             struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+             p_hn->n_children -= 1;
+
+             AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
+                 parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
+
+             if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                 if (ggml_is_view(parent)) {
+                     struct ggml_tensor * view_src = parent->view_src;
+                     struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+                     view_src_hn->n_views -= 1;
+                     AT_PRINTF("view_src %s: %d children, %d views\n",
+                         view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
+                         ggml_gallocr_free_node(galloc, view_src, buffer_id);
                      }
                  }
+                 else if (p_hn->allocated) {
+                     ggml_gallocr_free_node(galloc, parent, buffer_id);
+                 }
              }
              AT_PRINTF("\n");
-             if (parse_seq_len) {
-                 last_barrier_pos = ind + 1;
-             }
          }
      }
  }
 
- size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
      size_t hash_size = graph->visited_hash_table.size;
 
-     // check if the hash table is initialized and large enough
+     // initialize hash table
      if (galloc->hash_set.size < hash_size) {
-         if (galloc->hash_set.keys != NULL) {
-             free(galloc->hash_set.keys);
-         }
-         if (galloc->hash_values != NULL) {
-             free(galloc->hash_values);
-         }
-         galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+         free(galloc->hash_set.keys);
+         free(galloc->hash_values);
          galloc->hash_set.size = hash_size;
-         galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+         galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
+         galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+         GGML_ASSERT(galloc->hash_set.keys != NULL);
+         GGML_ASSERT(galloc->hash_values != NULL);
+     } else {
+         // reset hash table
+         memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
+         memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
      }
 
-     // reset hash table
-     memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
-     memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
-
-     galloc->talloc = talloc;
-     ggml_tallocr_alloc_graph_impl(galloc, graph);
-     galloc->talloc = NULL;
-
-     size_t max_size = ggml_tallocr_max_size(talloc);
-
-     return max_size;
- }
-
- void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
-     const size_t hash_size = hash_set.size;
-
-     GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
+     // reset allocators
+     for (int i = 0; i < galloc->n_buffers; i++) {
+         ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
+     }
 
-     galloc->talloc = NULL;
+     // allocate in hash table
+     ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
 
-     // alloc hash_values if needed
-     if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
-         free(galloc->hash_values);
-         galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
-         galloc->hash_values_size = hash_size;
+     // set the node_allocs from the hash table
+     if (galloc->n_nodes < graph->n_nodes) {
+         free(galloc->node_allocs);
+         galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+         GGML_ASSERT(galloc->node_allocs != NULL);
     }
-
-     // free hash_set.keys if needed
-     if (galloc->hash_set.keys != NULL) {
-         free(galloc->hash_set.keys);
+     galloc->n_nodes = graph->n_nodes;
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         struct node_alloc * node_alloc = &galloc->node_allocs[i];
+         node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
+         if (node->view_src || node->data) {
+             node_alloc->dst.offset = SIZE_MAX;
+             node_alloc->dst.size_max = 0;
+         } else {
+             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+             node_alloc->dst.offset = hn->offset;
+             node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+         }
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * src = node->src[j];
+             if (!src || src->view_src || src->data) {
+                 node_alloc->src[j].offset = SIZE_MAX;
+                 node_alloc->src[j].size_max = 0;
+             } else {
+                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                 node_alloc->src[j].offset = hn->offset;
+                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
+             }
+         }
      }
-     galloc->hash_set = hash_set;
 
-     // reset hash values
-     memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+     // reallocate buffers if needed
+     for (int i = 0; i < galloc->n_buffers; i++) {
+         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
-     galloc->hash_allocs = hash_node_talloc;
-
-     ggml_tallocr_alloc_graph_impl(galloc, graph);
+         if (new_size > cur_size) {
+ #ifndef NDEBUG
+             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ #endif
+             ggml_backend_buffer_free(galloc->buffers[i]);
+             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+             if (galloc->buffers[i] == NULL) {
+                 fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                 return false;
+             }
+         }
+     }
 
-     // remove unowned resources
-     galloc->hash_set.keys = NULL;
-     galloc->hash_allocs = NULL;
+     return true;
  }
 
- // legacy API wrapper
-
- struct ggml_allocr {
-     ggml_tallocr_t talloc;
-     ggml_gallocr_t galloc;
- };
-
- static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
-     ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
-     *alloc = (struct ggml_allocr) {
-         /*.talloc = */ talloc,
-         /*.galloc = */ ggml_gallocr_new(),
-     };
-     return alloc;
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+     return ggml_gallocr_reserve_n(galloc, graph, NULL);
  }
 
- ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
-     return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
- }
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
+     assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
 
- ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
-     return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
- }
+     if (node->view_src != NULL) {
+         if (node->buffer == NULL) {
+             assert(tensor_alloc->offset == SIZE_MAX);
+             if (node->view_src->buffer == NULL) {
+                 // this tensor was allocated without ggml-backend
+                 return;
+             }
+             ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+         }
+     } else {
+         if (node->data == NULL) {
+             assert(tensor_alloc->offset != SIZE_MAX);
+             assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+             void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+             void * addr = (char *)base + tensor_alloc->offset;
+             ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+         } else {
+             if (node->buffer == NULL) {
+                 // this tensor was allocated without ggml-backend
+                 return;
+             }
 
- ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
-     return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
+ #ifndef NDEBUG
+             size_t offset =
+                 (char *)node->data -
+                 (char *)ggml_backend_buffer_get_base(node->buffer);
+             size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
+             assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
+             assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
+ #endif
+         }
+     }
  }
 
- ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-     return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
+     ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+     return talloc->size_max >= node_size;
  }
 
- ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
-     return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
- }
+ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+     if (galloc->n_nodes != graph->n_nodes) {
+ #ifndef NDEBUG
+         fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+ #endif
+         return true;
+     }
 
- struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
-     return ggml_tallocr_get_buffer(alloc->talloc);
- }
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
- void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
-     ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
- }
+         if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+ #ifndef NDEBUG
+             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+ #endif
+             return true;
+         }
 
- void ggml_allocr_free(ggml_allocr_t alloc) {
-     if (alloc == NULL) {
-         return;
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * src = node->src[j];
+             if (src == NULL) {
+                 break;
+             }
+             if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+ #ifndef NDEBUG
+                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+ #endif
+                 return true;
+             }
+         }
      }
 
-     ggml_gallocr_free(alloc->galloc);
-     ggml_tallocr_free(alloc->talloc);
-     free(alloc);
+     return false;
  }
 
- bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
-     return ggml_tallocr_is_measure(alloc->talloc);
- }
+ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+     if (ggml_gallocr_needs_realloc(galloc, graph)) {
+         if (galloc->n_buffers == 1) {
+ #ifndef NDEBUG
+             fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+ #endif
+             if (!ggml_gallocr_reserve(galloc, graph)) {
+                 return false;
+             }
+         } else {
+ #ifndef NDEBUG
+             fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+ #endif
+             return false;
+         }
+     }
 
- void ggml_allocr_reset(ggml_allocr_t alloc) {
-     ggml_tallocr_reset(alloc->talloc);
- }
+     // reset buffers
+     for (int i = 0; i < galloc->n_buffers; i++) {
+         // zero size buffers are not allocated
+         if (galloc->buffers[i] != NULL) {
+             ggml_backend_buffer_reset(galloc->buffers[i]);
+         }
+     }
 
- void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
-     ggml_tallocr_alloc(alloc->talloc, tensor);
- }
+     // allocate the graph tensors from the previous assignments
+     for (int i = 0; i < graph->n_nodes; i++) {
+         struct ggml_tensor * node = graph->nodes[i];
+         struct node_alloc * node_alloc = &galloc->node_allocs[i];
+         for (int j = 0; j < GGML_MAX_SRC; j++) {
+             struct ggml_tensor * src = node->src[j];
+             if (src == NULL) {
+                 break;
+             }
+             ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+         }
+         ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+     }
 
- size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
-     return ggml_tallocr_max_size(alloc->talloc);
+     return true;
  }
 
- size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
-     return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
+ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+     GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+     if (galloc->buffers[buffer_id] == NULL) {
+         return 0;
+     }
+     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
  }
 
  // utils
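As the hunk above shows, ggml_gallocr_alloc_graph first checks whether the saved per-node assignments still fit; with a single buffer it re-reserves automatically, while multi-buffer graphs must be re-planned with ggml_gallocr_reserve_n explicitly. A condensed sketch of that control flow from the caller's side (illustration only; node_buffer_ids is a placeholder):

// sketch: guard evaluation with the new API
if (!ggml_gallocr_alloc_graph(galloc, graph)) {
    // multi-buffer case: re-plan explicitly, then retry once
    if (!ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids) ||
        !ggml_gallocr_alloc_graph(galloc, graph)) {
        fprintf(stderr, "graph allocation failed\n");
    }
}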
@@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
          return false;
      }
 
-     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+     struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
 
      for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
          if (t->data == NULL) {
              if (t->view_src == NULL) {
                  ggml_tallocr_alloc(tallocr, t);
-             } else {
+             } else if (t->buffer == NULL) {
                  ggml_backend_view_init(buffer, t);
              }
          } else {
-             if (t->view_src != NULL) {
+             if (t->view_src != NULL && t->buffer == NULL) {
                  // view of a pre-allocated tensor
                  ggml_backend_view_init(buffer, t);
              }
@@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
      }
 
      if (this_size > max_size) {
-         // tensor is too large to fit in a single buffer
          fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                  __func__, t->name,
                  ggml_backend_buft_name(buft),
@@ -870,7 +944,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
      }
 
      if (n_buffers == 0) {
-         // all the tensors in the context are already allocated
  #ifndef NDEBUG
          fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
  #endif