llama_cpp 0.12.5 → 0.12.6

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
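The hunks below appear to come from ggml-alloc.c: the pointer-based ggml_tallocr (with its measure mode) and the legacy ggml_allocr wrapper are replaced by a minimal linear ggml_tallocr over a backend buffer, an offset-based dynamic allocator (ggml_dyn_tallocr), and a rewritten graph allocator (ggml_gallocr) that plans per-node offsets and owns its backend buffers. For orientation only, here is a hypothetical usage sketch of the new graph-allocator API; the ggml_gallocr_* calls are the ones shown in the diff, while the backend/graph variables and ggml_backend_get_default_buffer_type()/ggml_backend_graph_compute() are assumed from the existing ggml-backend API.

    // sketch, not part of the diff: driving the new ggml_gallocr API
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
    ggml_gallocr_t galloc = ggml_gallocr_new(buft);   // one buffer type -> one internal buffer

    // plan per-node offsets for a worst-case graph and (re)allocate the backing buffer
    if (!ggml_gallocr_reserve(galloc, graph)) {
        // handle allocation failure
    }

    // bind tensor data pointers for the graph about to run; single-buffer allocators
    // are re-reserved automatically if the graph no longer fits
    if (!ggml_gallocr_alloc_graph(galloc, graph)) {
        // handle allocation failure
    }

    // ... ggml_backend_graph_compute(backend, graph); ...

    ggml_gallocr_free(galloc);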
@@ -17,6 +17,50 @@
  //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
  #define AT_PRINTF(...)

+
+ static bool ggml_is_view(const struct ggml_tensor * t) {
+ return t->view_src != NULL;
+ }
+
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+ if (a->type != b->type) {
+ return false;
+ }
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (a->ne[i] != b->ne[i]) {
+ return false;
+ }
+ if (a->nb[i] != b->nb[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ static bool ggml_op_can_inplace(enum ggml_op op) {
+ switch (op) {
+ case GGML_OP_SCALE:
+ case GGML_OP_DIAG_MASK_ZERO:
+ case GGML_OP_DIAG_MASK_INF:
+ case GGML_OP_ADD:
+ case GGML_OP_ADD1:
+ case GGML_OP_SUB:
+ case GGML_OP_MUL:
+ case GGML_OP_DIV:
+ case GGML_OP_SQR:
+ case GGML_OP_SQRT:
+ case GGML_OP_LOG:
+ case GGML_OP_UNARY:
+ case GGML_OP_ROPE:
+ case GGML_OP_RMS_NORM:
+ case GGML_OP_SOFT_MAX:
+ return true;
+
+ default:
+ return false;
+ }
+ }
+
  // TODO: GGML_PAD ?
  static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
  assert(alignment && !(alignment & (alignment - 1))); // power of 2
@@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
  return offset + align;
  }

+ // tallocr
+ struct ggml_tallocr {
+ ggml_backend_buffer_t buffer;
+ void * base;
+ size_t alignment;
+ size_t offset;
+ };
+
+ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
+ ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
+ if (talloc == NULL) {
+ return NULL;
+ }
+
+ void * base = ggml_backend_buffer_get_base(buffer);
+ size_t align = ggml_backend_buffer_get_alignment(buffer);
+
+ assert(align && !(align & (align - 1))); // power of 2
+
+ *talloc = (struct ggml_tallocr) {
+ /*.buffer = */ buffer,
+ /*.base = */ base,
+ /*.alignment = */ align,
+ /*.offset = */ aligned_offset(base, 0, align),
+ };
+ return talloc;
+ }
+
+ void ggml_tallocr_free(ggml_tallocr_t talloc) {
+ free(talloc);
+ }
+
+ void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+ size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
+ size = GGML_PAD(size, talloc->alignment);
+
+ if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+ __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
+ GGML_ASSERT(!"not enough space in the buffer");
+ return;
+ }
+
+ void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
+ talloc->offset += size;
+
+ assert(((uintptr_t)addr % talloc->alignment) == 0);
+
+ ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
+ }
+
+ // dynamic tensor allocator
+
  struct free_block {
- void * addr;
+ size_t offset;
  size_t size;
  };

- struct ggml_tallocr {
- struct ggml_backend_buffer * buffer;
- bool buffer_owned;
- void * base;
+ struct ggml_dyn_tallocr {
  size_t alignment;
-
  int n_free_blocks;
  struct free_block free_blocks[MAX_FREE_BLOCKS];
-
  size_t max_size;

- bool measure;
-
  #ifdef GGML_ALLOCATOR_DEBUG
- struct ggml_tensor * allocated_tensors[1024];
+ struct {
+ const struct ggml_tensor * tensor;
+ size_t offset;
+ } allocated_tensors[1024];
  #endif
  };

  #ifdef GGML_ALLOCATOR_DEBUG
- static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i] == NULL) {
- alloc->allocated_tensors[i] = tensor;
+ if (alloc->allocated_tensors[i].tensor == NULL) {
+ alloc->allocated_tensors[i].tensor = tensor;
+ alloc->allocated_tensors[i].offset = offset;
  return;
  }
  }
  GGML_ASSERT(!"out of allocated_tensors");
  }
- static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i] == tensor ||
- (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
- alloc->allocated_tensors[i] = NULL;
+ if (alloc->allocated_tensors[i].offset == offset) {
+ alloc->allocated_tensors[i].tensor = NULL;
  return;
  }
  }
- printf("tried to free tensor %s not found\n", tensor->name);
+ fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
  GGML_ASSERT(!"tensor not found");
  }
  #endif

- // check if a tensor is allocated by this buffer
- static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
- return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
- }
-
- static bool ggml_is_view(struct ggml_tensor * t) {
- return t->view_src != NULL;
- }
-
- void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
- GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
- GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
  size = aligned_offset(NULL, size, alloc->alignment);

  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  if (block->size >= size) {
  best_fit_block = alloc->n_free_blocks - 1;
  } else {
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
- __func__, tensor->name, size, max_avail);
+ // this should never happen
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+ __func__, size, max_avail);
  GGML_ASSERT(!"not enough space in the buffer");
- return;
+ GGML_UNREACHABLE();
  }
  }

  struct free_block * block = &alloc->free_blocks[best_fit_block];
- void * addr = block->addr;
- block->addr = (char*)block->addr + size;
+ size_t offset = block->offset;
+ block->offset = offset + size;
  block->size -= size;
  if (block->size == 0) {
  // remove block if empty
@@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
  }
  }

- AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
-
- tensor->data = addr;
- tensor->buffer = alloc->buffer;
- if (!alloc->measure) {
- ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
- }
+ AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);

  #ifdef GGML_ALLOCATOR_DEBUG
- add_allocated_tensor(alloc, tensor);
- size_t cur_max = (char*)addr - (char*)alloc->base + size;
+ add_allocated_tensor(alloc, offset, tensor);
+ size_t cur_max = offset + size;
  if (cur_max > alloc->max_size) {
- printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+ // sort allocated_tensors by offset
+ for (int i = 0; i < 1024; i++) {
+ for (int j = i + 1; j < 1024; j++) {
+ if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+ const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
+ size_t tmp_offset = alloc->allocated_tensors[i].offset;
+ alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
+ alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+ alloc->allocated_tensors[j].tensor = tmp_tensor;
+ alloc->allocated_tensors[j].offset = tmp_offset;
+ }
+ }
+ }
+ fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
  for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i]) {
- printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+ if (alloc->allocated_tensors[i].tensor) {
+ fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+ alloc->allocated_tensors[i].offset,
+ alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+ ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
  }
  }
- printf("\n");
+ fprintf(stderr, "\n");
  }
  #endif

- alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
- }
+ alloc->max_size = MAX(alloc->max_size, offset + size);

- // this is a very naive implementation, but for our case the number of free blocks should be very small
- static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
- if (ggml_tallocr_is_own(alloc, tensor) == false) {
- // the tensor was not allocated in this buffer
- // this can happen because the graph allocator will try to free weights and other tensors from different buffers
- // the easiest way to deal with this is just to ignore it
- // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
- return;
- }
+ return offset;

- void * ptr = tensor->data;
+ GGML_UNUSED(tensor);
+ }

- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
+ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
  size = aligned_offset(NULL, size, alloc->alignment);
- AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
+
+ AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);

  #ifdef GGML_ALLOCATOR_DEBUG
- remove_allocated_tensor(alloc, tensor);
+ remove_allocated_tensor(alloc, offset, tensor);
  #endif

  // see if we can merge with an existing block
  for (int i = 0; i < alloc->n_free_blocks; i++) {
  struct free_block * block = &alloc->free_blocks[i];
  // check if ptr is at the end of the block
- if ((char*)block->addr + block->size == ptr) {
+ if (block->offset + block->size == offset) {
  block->size += size;
  // check if we can merge with the next block
- if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+ if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
  block->size += alloc->free_blocks[i+1].size;
  alloc->n_free_blocks--;
  for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
  return;
  }
  // check if ptr is at the beginning of the block
- if ((char*)ptr + size == block->addr) {
- block->addr = ptr;
+ if (offset + size == block->offset) {
+ block->offset = offset;
  block->size += size;
  // check if we can merge with the previous block
- if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+ if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
  alloc->free_blocks[i-1].size += block->size;
  alloc->n_free_blocks--;
  for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
  GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
  // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
  int insert_pos = 0;
- while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+ while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
  insert_pos++;
  }
  // shift all blocks from insert_pos onward to make room for the new block
@@ -216,337 +301,271 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
  alloc->free_blocks[i] = alloc->free_blocks[i-1];
  }
  // insert the new block
- alloc->free_blocks[insert_pos].addr = ptr;
+ alloc->free_blocks[insert_pos].offset = offset;
  alloc->free_blocks[insert_pos].size = size;
  alloc->n_free_blocks++;
+
+ GGML_UNUSED(tensor);
  }

- void ggml_tallocr_reset(ggml_tallocr_t alloc) {
+ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
  alloc->n_free_blocks = 1;
- size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
- alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
-
- if (alloc->measure) {
- alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
- } else {
- alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
- ggml_backend_buffer_reset(alloc->buffer);
- }
+ alloc->free_blocks[0].offset = 0;
+ alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+ alloc->max_size = 0;
  }

- ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
- struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
-
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
+ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+ struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

- *alloc = (struct ggml_tallocr) {
- /*.buffer = */ buffer,
- /*.buffer_owned = */ true,
- /*.base = */ ggml_backend_buffer_get_base(buffer),
+ *alloc = (struct ggml_dyn_tallocr) {
  /*.alignment = */ alignment,
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
  /*.max_size = */ 0,
- /*.measure = */ false,
  #ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ {0},
+ /*.allocated_tensors = */ {{0}},
  #endif
  };

- ggml_tallocr_reset(alloc);
-
- return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
- ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
- alloc->measure = true;
+ ggml_dyn_tallocr_reset(alloc);

  return alloc;
  }

- ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
- // create a backend buffer to get the correct tensor allocation sizes
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
-
- // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
- alloc->buffer_owned = true;
- alloc->measure = true;
- ggml_tallocr_reset(alloc);
- return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
- return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
- // create a backend buffer to get the correct tensor allocation sizes
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
- alloc->buffer_owned = true;
- return alloc;
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
- return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
- }
-
- ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
-
- *alloc = (struct ggml_tallocr) {
- /*.buffer = */ buffer,
- /*.buffer_owned = */ false,
- /*.base = */ ggml_backend_buffer_get_base(buffer),
- /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
- /*.n_free_blocks = */ 0,
- /*.free_blocks = */ {{0}},
- /*.max_size = */ 0,
- /*.measure = */ false,
- #ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ {0},
- #endif
- };
-
- ggml_tallocr_reset(alloc);
-
- return alloc;
- }
-
- struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
- return alloc->buffer;
- }
-
- void ggml_tallocr_free(ggml_tallocr_t alloc) {
- if (alloc == NULL) {
- return;
- }
-
- if (alloc->buffer_owned) {
- ggml_backend_buffer_free(alloc->buffer);
- }
+ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
  free(alloc);
  }

- bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
- return alloc->measure;
+ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
+ return alloc->max_size;
  }

- size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
- // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
- // to avoid this, we add a 10% margin to the buffer size
- return alloc->max_size + alloc->max_size/10;
- }
+
+ /////////////////////////////////////

  // graph allocator

  struct hash_node {
  int n_children;
  int n_views;
+ int buffer_id;
+ size_t offset; // offset within the buffer
+ bool allocated;
+ };
+
+ //
+ struct tensor_alloc {
+ size_t offset;
+ size_t size_max; // 0 = pre-allocated, unused, or view
+ };
+
+ struct node_alloc {
+ int buffer_id;
+ struct tensor_alloc dst;
+ struct tensor_alloc src[GGML_MAX_SRC];
  };

  struct ggml_gallocr {
- ggml_tallocr_t talloc;
+ ggml_backend_buffer_type_t * bufts; // [n_buffers]
+ ggml_backend_buffer_t * buffers; // [n_buffers]
+ struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
+ int n_buffers;
+
  struct ggml_hash_set hash_set;
- struct hash_node * hash_values;
- size_t hash_values_size;
- ggml_tallocr_t * hash_allocs;
- int * parse_seq;
- int parse_seq_len;
+ struct hash_node * hash_values; // [hash_set.size]
+
+ struct node_alloc * node_allocs; // [n_nodes]
+ int n_nodes;
  };

- ggml_gallocr_t ggml_gallocr_new(void) {
- ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
-
- *galloc = (struct ggml_gallocr) {
- /*.talloc = */ NULL,
- /*.hash_set = */ {0},
- /*.hash_values = */ NULL,
- /*.hash_values_size = */ 0,
- /*.hash_allocs = */ NULL,
- /*.parse_seq = */ NULL,
- /*.parse_seq_len = */ 0,
- };
+ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+ GGML_ASSERT(galloc != NULL);
+
+ galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+ GGML_ASSERT(galloc->bufts != NULL);
+
+ galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+ GGML_ASSERT(galloc->buffers != NULL);
+
+ galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+ GGML_ASSERT(galloc->buf_tallocs != NULL);
+
+ for (int i = 0; i < n_bufs; i++) {
+ galloc->bufts[i] = bufts[i];
+ galloc->buffers[i] = NULL;
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+ }
+ galloc->n_buffers = n_bufs;

  return galloc;
  }

+ ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
+ return ggml_gallocr_new_n(&buft, 1);
+ }
+
  void ggml_gallocr_free(ggml_gallocr_t galloc) {
  if (galloc == NULL) {
  return;
  }

- if (galloc->hash_set.keys != NULL) {
- free(galloc->hash_set.keys);
- }
- if (galloc->hash_values != NULL) {
- free(galloc->hash_values);
- }
- if (galloc->hash_allocs != NULL) {
- free(galloc->hash_allocs);
- }
- if (galloc->parse_seq != NULL) {
- free(galloc->parse_seq);
+ for (int i = 0; i < galloc->n_buffers; i++) {
+ if (galloc->buffers != NULL) {
+ ggml_backend_buffer_free(galloc->buffers[i]);
+ }
+ if (galloc->buf_tallocs != NULL) {
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+ }
  }
+
+ free(galloc->hash_set.keys);
+ free(galloc->hash_values);
+ free(galloc->bufts);
+ free(galloc->buffers);
+ free(galloc->buf_tallocs);
+ free(galloc->node_allocs);
  free(galloc);
  }

- void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
- free(galloc->parse_seq);
- galloc->parse_seq = malloc(sizeof(int) * n);
+ typedef struct ggml_gallocr * ggml_gallocr_t;

- for (int i = 0; i < n; i++) {
- galloc->parse_seq[i] = list[i];
- }
- galloc->parse_seq_len = n;
- }
-
- static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+ static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
  size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
  return &galloc->hash_values[i];
  }

- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
- if (a->type != b->type) {
- return false;
- }
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
- if (a->ne[i] != b->ne[i]) {
- return false;
- }
- if (a->nb[i] != b->nb[i]) {
- return false;
- }
- }
- return true;
+ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+ return ggml_gallocr_hash_get(galloc, t)->allocated;
  }

- static bool ggml_op_can_inplace(enum ggml_op op) {
- switch (op) {
- case GGML_OP_SCALE:
- case GGML_OP_DIAG_MASK_ZERO:
- case GGML_OP_DIAG_MASK_INF:
- case GGML_OP_ADD:
- case GGML_OP_ADD1:
- case GGML_OP_SUB:
- case GGML_OP_MUL:
- case GGML_OP_DIV:
- case GGML_OP_SQR:
- case GGML_OP_SQRT:
- case GGML_OP_LOG:
- case GGML_OP_UNARY:
- case GGML_OP_ROPE:
- case GGML_OP_RMS_NORM:
- case GGML_OP_SOFT_MAX:
- return true;
-
- default:
- return false;
- }
+ static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+ hn->buffer_id = buffer_id;
+ hn->offset = offset;
+ hn->allocated = true;
  }

- static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
- if (galloc->talloc != NULL) {
- return galloc->talloc;
- }
-
- return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
+ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+ return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
  }

- static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
- ggml_tallocr_t alloc = node_tallocr(galloc, view);
-
- GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
- if (update_backend) {
- view->backend = view->view_src->backend;
- }
- // views are initialized in the alloc buffer rather than the view_src buffer
- view->buffer = alloc->buffer;
- view->data = (char *)view->view_src->data + view->view_offs;
+ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);

- assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
+ if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
+ hn->allocated = true;
+ assert(hn->offset == 0);

- if (!alloc->measure) {
- ggml_backend_buffer_init_tensor(alloc->buffer, view);
- }
- }
+ // try to reuse a parent's buffer (inplace)
+ if (ggml_op_can_inplace(node->op)) {
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ struct ggml_tensor * parent = node->src[i];
+ if (parent == NULL) {
+ break;
+ }

- static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
+ // if the node's data is external, then we cannot re-use it
+ if (!ggml_gallocr_is_own(galloc, parent)) {
+ AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+ continue;
+ }

- if (node->data == NULL) {
- if (ggml_is_view(node)) {
- init_view(galloc, node, true);
- } else {
- // see if we can reuse a parent's buffer (inplace)
- if (ggml_op_can_inplace(node->op)) {
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- struct ggml_tensor * parent = node->src[i];
- if (parent == NULL) {
- break;
- }
+ // outputs cannot be reused
+ if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
+ AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
+ continue;
+ }

- // if the node's data is external, then we cannot re-use it
- if (ggml_tallocr_is_own(alloc, parent) == false) {
- AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
- continue;
- }
+ if (!ggml_are_same_layout(node, parent)) {
+ AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
+ continue;
+ }

- struct hash_node * p_hn = hash_get(galloc, parent);
- if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = parent->view_src;
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
- if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
- // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
- // the parent's data that it will need later (same layout requirement). the problem is that then
- // we cannot free the tensor because the original address of the allocation is lost.
- // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
- // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
- AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
- node->view_src = view_src;
- view_src_hn->n_views += 1;
- init_view(galloc, node, false);
- return;
- }
- } else {
- AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
- node->view_src = parent;
- p_hn->n_views += 1;
- init_view(galloc, node, false);
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+ if (p_hn->n_children == 1 && p_hn->n_views == 0) {
+ if (ggml_is_view(parent)) {
+ struct ggml_tensor * view_src = parent->view_src;
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+ if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+ AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+ assert(view_src_hn->offset == p_hn->offset);
+ hn->buffer_id = p_hn->buffer_id;
+ hn->offset = p_hn->offset;
+ p_hn->allocated = false; // avoid freeing the parent
+ view_src_hn->allocated = false;
  return;
  }
+ } else {
+ AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+ hn->buffer_id = p_hn->buffer_id;
+ hn->offset = p_hn->offset;
+ p_hn->allocated = false; // avoid freeing the parent
+ return;
  }
  }
  }
- ggml_tallocr_alloc(alloc, node);
  }
+ // allocate tensor from the buffer
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+ size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
+ hn->buffer_id = buffer_id;
+ hn->offset = offset;
+ return;
  }
  }

- static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+ // graph outputs are never freed
+ if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+ AT_PRINTF("not freeing output %s\n", node->name);
+ return;
+ }

- ggml_tallocr_free_tensor(alloc, node);
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+ size_t offset = hn->offset;
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+ ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+ hn->allocated = false;
  }

- static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
- const int * parse_seq = galloc->parse_seq;
- int parse_seq_len = galloc->parse_seq_len;
+ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
+ return node_buffer_ids ? node_buffer_ids[i] : 0;
+ }
+
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+ // clear hash tables
+ memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
+ memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
+
+ // allocate all graph inputs first to avoid overwriting them
+ for (int i = 0; i < graph->n_nodes; i++) {
+ if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+ }
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (graph->nodes[i]->src[j] == NULL) {
+ break;
+ }
+ if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
+ }
+ }
+ }

  // count number of children and views
- for (int i = 0; i < gf->n_nodes; i++) {
- struct ggml_tensor * node = gf->nodes[i];
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];

  if (ggml_is_view(node)) {
  struct ggml_tensor * view_src = node->view_src;
- hash_get(galloc, view_src)->n_views += 1;
- if (node->buffer == NULL && node->data != NULL) {
- // view of a pre-allocated tensor, didn't call init_view() yet
- init_view(galloc, node, true);
- }
+ ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
  }

  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  if (parent == NULL) {
  break;
  }
- hash_get(galloc, parent)->n_children += 1;
- if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
- init_view(galloc, parent, true);
- }
+ ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
  }
  }

  // allocate tensors
- // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
- int last_barrier_pos = 0;
- int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
-
- for (int ind = 0; ind < n_nodes; ind++) {
- // allocate a node if there is no parse_seq or this is not a barrier
- if (parse_seq_len == 0 || parse_seq[ind] != -1) {
- int i = parse_seq_len ? parse_seq[ind] : ind;
- struct ggml_tensor * node = gf->nodes[i];
-
- // allocate parents (leafs)
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- allocate_node(galloc, parent);
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ int buffer_id = get_node_buffer_id(node_buffer_ids, i);
+
+ // allocate parents (only leafs need to be allocated at this point)
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
  }
+ ggml_gallocr_allocate_node(galloc, parent, buffer_id);
+ }

- // allocate node
- allocate_node(galloc, node);
+ // allocate node
+ ggml_gallocr_allocate_node(galloc, node, buffer_id);

- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- AT_PRINTF("%s", parent->name);
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
- AT_PRINTF(", ");
- }
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ AT_PRINTF("%s", parent->name);
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+ AT_PRINTF(", ");
  }
- AT_PRINTF("\n");
  }
+ AT_PRINTF("\n");

  // update parents
- // update immediately if there is no parse_seq
- // update only at barriers if there is parse_seq
- if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
- int update_start = parse_seq_len ? last_barrier_pos : ind;
- int update_end = parse_seq_len ? ind : ind + 1;
- for (int i = update_start; i < update_end; i++) {
- int node_i = parse_seq_len ? parse_seq[i] : i;
- struct ggml_tensor * node = gf->nodes[node_i];
-
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- struct hash_node * p_hn = hash_get(galloc, parent);
- p_hn->n_children -= 1;
-
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = parent->view_src;
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
- view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
- free_node(galloc, view_src);
- }
- }
- else {
- free_node(galloc, parent);
- }
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+ p_hn->n_children -= 1;
+
+ AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
+ parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
+
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+ if (ggml_is_view(parent)) {
+ struct ggml_tensor * view_src = parent->view_src;
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+ view_src_hn->n_views -= 1;
+ AT_PRINTF("view_src %s: %d children, %d views\n",
+ view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
+ ggml_gallocr_free_node(galloc, view_src, buffer_id);
  }
  }
+ else if (p_hn->allocated) {
+ ggml_gallocr_free_node(galloc, parent, buffer_id);
+ }
  }
  AT_PRINTF("\n");
- if (parse_seq_len) {
- last_barrier_pos = ind + 1;
- }
  }
  }
  }

- size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
  size_t hash_size = graph->visited_hash_table.size;

- // check if the hash table is initialized and large enough
+ // initialize hash table
  if (galloc->hash_set.size < hash_size) {
- if (galloc->hash_set.keys != NULL) {
- free(galloc->hash_set.keys);
- }
- if (galloc->hash_values != NULL) {
- free(galloc->hash_values);
- }
- galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+ free(galloc->hash_set.keys);
+ free(galloc->hash_values);
  galloc->hash_set.size = hash_size;
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+ galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
+ galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+ GGML_ASSERT(galloc->hash_set.keys != NULL);
+ GGML_ASSERT(galloc->hash_values != NULL);
+ } else {
+ // reset hash table
+ memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
  }

- // reset hash table
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
-
- galloc->talloc = talloc;
- ggml_tallocr_alloc_graph_impl(galloc, graph);
- galloc->talloc = NULL;
-
- size_t max_size = ggml_tallocr_max_size(talloc);
-
- return max_size;
- }
-
- void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
- const size_t hash_size = hash_set.size;
-
- GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
+ // reset allocators
+ for (int i = 0; i < galloc->n_buffers; i++) {
+ ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
+ }

- galloc->talloc = NULL;
+ // allocate in hash table
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);

- // alloc hash_values if needed
- if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
- free(galloc->hash_values);
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
- galloc->hash_values_size = hash_size;
+ // set the node_allocs from the hash table
+ if (galloc->n_nodes < graph->n_nodes) {
+ free(galloc->node_allocs);
+ galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+ GGML_ASSERT(galloc->node_allocs != NULL);
  }
-
- // free hash_set.keys if needed
- if (galloc->hash_set.keys != NULL) {
- free(galloc->hash_set.keys);
+ galloc->n_nodes = graph->n_nodes;
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
+ node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
+ if (node->view_src || node->data) {
+ node_alloc->dst.offset = SIZE_MAX;
+ node_alloc->dst.size_max = 0;
+ } else {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+ node_alloc->dst.offset = hn->offset;
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+ }
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (!src || src->view_src || src->data) {
+ node_alloc->src[j].offset = SIZE_MAX;
+ node_alloc->src[j].size_max = 0;
+ } else {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+ node_alloc->src[j].offset = hn->offset;
+ node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
+ }
+ }
  }
- galloc->hash_set = hash_set;

- // reset hash values
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+ // reallocate buffers if needed
+ for (int i = 0; i < galloc->n_buffers; i++) {
+ size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+ size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

- galloc->hash_allocs = hash_node_talloc;
-
- ggml_tallocr_alloc_graph_impl(galloc, graph);
+ if (new_size > cur_size) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ #endif
+ ggml_backend_buffer_free(galloc->buffers[i]);
+ galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+ if (galloc->buffers[i] == NULL) {
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+ return false;
+ }
+ }
+ }

- // remove unowned resources
- galloc->hash_set.keys = NULL;
- galloc->hash_allocs = NULL;
+ return true;
  }

- // legacy API wrapper
-
- struct ggml_allocr {
- ggml_tallocr_t talloc;
- ggml_gallocr_t galloc;
- };
-
- static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
- ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
- *alloc = (struct ggml_allocr) {
- /*.talloc = */ talloc,
- /*.galloc = */ ggml_gallocr_new(),
- };
- return alloc;
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+ return ggml_gallocr_reserve_n(galloc, graph, NULL);
  }

- ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
- return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
- }
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);

- ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
- return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
- }
+ if (node->view_src != NULL) {
+ if (node->buffer == NULL) {
+ assert(tensor_alloc->offset == SIZE_MAX);
+ if (node->view_src->buffer == NULL) {
+ // this tensor was allocated without ggml-backend
+ return;
+ }
+ ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+ }
+ } else {
+ if (node->data == NULL) {
+ assert(tensor_alloc->offset != SIZE_MAX);
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+ void * addr = (char *)base + tensor_alloc->offset;
+ ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+ } else {
+ if (node->buffer == NULL) {
+ // this tensor was allocated without ggml-backend
+ return;
+ }

- ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
- return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
+ #ifndef NDEBUG
+ size_t offset =
+ (char *)node->data -
+ (char *)ggml_backend_buffer_get_base(node->buffer);
+ size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
+ assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
+ assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
+ #endif
+ }
+ }
  }

- ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
- return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
+ ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+ size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+ return talloc->size_max >= node_size;
  }

- ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
- return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
- }
+ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+ if (galloc->n_nodes != graph->n_nodes) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+ #endif
+ return true;
+ }

- struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
- return ggml_tallocr_get_buffer(alloc->talloc);
- }
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];

- void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
- ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
- }
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+ #endif
+ return true;
+ }

- void ggml_allocr_free(ggml_allocr_t alloc) {
- if (alloc == NULL) {
- return;
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ break;
+ }
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+ #endif
+ return true;
+ }
+ }
  }

- ggml_gallocr_free(alloc->galloc);
- ggml_tallocr_free(alloc->talloc);
- free(alloc);
+ return false;
  }

- bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
- return ggml_tallocr_is_measure(alloc->talloc);
- }
+ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+ if (ggml_gallocr_needs_realloc(galloc, graph)) {
+ if (galloc->n_buffers == 1) {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+ #endif
+ if (!ggml_gallocr_reserve(galloc, graph)) {
+ return false;
+ }
+ } else {
+ #ifndef NDEBUG
+ fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+ #endif
+ return false;
+ }
+ }

- void ggml_allocr_reset(ggml_allocr_t alloc) {
- ggml_tallocr_reset(alloc->talloc);
- }
+ // reset buffers
+ for (int i = 0; i < galloc->n_buffers; i++) {
+ // zero size buffers are not allocated
+ if (galloc->buffers[i] != NULL) {
+ ggml_backend_buffer_reset(galloc->buffers[i]);
+ }
+ }

- void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
- ggml_tallocr_alloc(alloc->talloc, tensor);
- }
+ // allocate the graph tensors from the previous assignments
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ break;
+ }
+ ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+ }
+ ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+ }

- size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
- return ggml_tallocr_max_size(alloc->talloc);
+ return true;
  }

- size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
- return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
+ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+ if (galloc->buffers[buffer_id] == NULL) {
+ return 0;
+ }
+ return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
  }

  // utils
@@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
  return false;
  }

- ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+ struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);

  for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
  if (t->data == NULL) {
  if (t->view_src == NULL) {
  ggml_tallocr_alloc(tallocr, t);
- } else {
+ } else if (t->buffer == NULL) {
  ggml_backend_view_init(buffer, t);
  }
  } else {
- if (t->view_src != NULL) {
+ if (t->view_src != NULL && t->buffer == NULL) {
  // view of a pre-allocated tensor
  ggml_backend_view_init(buffer, t);
  }
@@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }

  if (this_size > max_size) {
- // tensor is too large to fit in a single buffer
  fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
  __func__, t->name,
  ggml_backend_buft_name(buft),
@@ -870,7 +944,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }

  if (n_buffers == 0) {
- // all the tensors in the context are already allocated
  #ifndef NDEBUG
  fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
  #endif