llama_cpp 0.12.5 → 0.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
--- a/data/vendor/tmp/llama.cpp/ggml-alloc.c
+++ b/data/vendor/tmp/llama.cpp/ggml-alloc.c
@@ -17,6 +17,50 @@
 //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
 #define AT_PRINTF(...)
 
+
+static bool ggml_is_view(const struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool ggml_op_can_inplace(enum ggml_op op) {
+    switch (op) {
+        case GGML_OP_SCALE:
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_UNARY:
+        case GGML_OP_ROPE:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SOFT_MAX:
+            return true;
+
+        default:
+            return false;
+    }
+}
+
 // TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
@@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     return offset + align;
 }
 
+// tallocr
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
+
+ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
+    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
+    if (talloc == NULL) {
+        return NULL;
+    }
+
+    void * base = ggml_backend_buffer_get_base(buffer);
+    size_t align = ggml_backend_buffer_get_alignment(buffer);
+
+    assert(align && !(align & (align - 1))); // power of 2
+
+    *talloc = (struct ggml_tallocr) {
+        /*.buffer    = */ buffer,
+        /*.base      = */ base,
+        /*.alignment = */ align,
+        /*.offset    = */ aligned_offset(base, 0, align),
+    };
+    return talloc;
+}
+
+void ggml_tallocr_free(ggml_tallocr_t talloc) {
+    free(talloc);
+}
+
+void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
+    size = GGML_PAD(size, talloc->alignment);
+
+    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
+        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
+        GGML_ASSERT(!"not enough space in the buffer");
+        return;
+    }
+
+    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
+    talloc->offset += size;
+
+    assert(((uintptr_t)addr % talloc->alignment) == 0);
+
+    ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
+}
+
+// dynamic tensor allocator
+
 struct free_block {
-    void * addr;
+    size_t offset;
     size_t size;
 };
 
-struct ggml_tallocr {
-    struct ggml_backend_buffer * buffer;
-    bool buffer_owned;
-    void * base;
+struct ggml_dyn_tallocr {
     size_t alignment;
-
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
-
     size_t max_size;
 
-    bool measure;
-
 #ifdef GGML_ALLOCATOR_DEBUG
-    struct ggml_tensor * allocated_tensors[1024];
+    struct {
+        const struct ggml_tensor * tensor;
+        size_t offset;
+    } allocated_tensors[1024];
 #endif
 };
 
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i] == NULL) {
-            alloc->allocated_tensors[i] = tensor;
+        if (alloc->allocated_tensors[i].tensor == NULL) {
+            alloc->allocated_tensors[i].tensor = tensor;
+            alloc->allocated_tensors[i].offset = offset;
             return;
         }
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i] == tensor ||
-            (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
-            alloc->allocated_tensors[i] = NULL;
+        if (alloc->allocated_tensors[i].offset == offset) {
+            alloc->allocated_tensors[i].tensor = NULL;
             return;
         }
     }
-    printf("tried to free tensor %s not found\n", tensor->name);
+    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
     GGML_ASSERT(!"tensor not found");
 }
 #endif
 
-
-static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-    return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
-}
-
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
-    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
-    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
-            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                    __func__, size, max_avail);
+            // this should never happen
+            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                    __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-            return;
+            GGML_UNREACHABLE();
         }
     }
 
     struct free_block * block = &alloc->free_blocks[best_fit_block];
-    void * addr = block->addr;
-    block->addr = (char*)block->addr + size;
+    size_t offset = block->offset;
+    block->offset = offset + size;
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
@@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
-    AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
-
-    tensor->data = addr;
-    tensor->buffer = alloc->buffer;
-    if (!alloc->measure) {
-        ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
-    }
+    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
 
 #ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, tensor);
-    size_t cur_max = (char*)addr - (char*)alloc->base + size;
+    add_allocated_tensor(alloc, offset, tensor);
+    size_t cur_max = offset + size;
     if (cur_max > alloc->max_size) {
-        printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        // sort allocated_tensors by offset
+        for (int i = 0; i < 1024; i++) {
+            for (int j = i + 1; j < 1024; j++) {
+                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
+                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
+                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
+                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+                    alloc->allocated_tensors[j].tensor = tmp_tensor;
+                    alloc->allocated_tensors[j].offset = tmp_offset;
+                }
+            }
+        }
+        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
-            if (alloc->allocated_tensors[i]) {
-                printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+            if (alloc->allocated_tensors[i].tensor) {
+                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                    alloc->allocated_tensors[i].offset,
+                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
-        printf("\n");
+        fprintf(stderr, "\n");
     }
 #endif
 
-    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
-}
+    alloc->max_size = MAX(alloc->max_size, offset + size);
 
-// this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    if (ggml_tallocr_is_own(alloc, tensor) == false) {
-        // the tensor was not allocated in this buffer
-        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
-        // the easiest way to deal with this is just to ignore it
-        // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
-        return;
-    }
+    return offset;
 
-    void * ptr = tensor->data;
+    GGML_UNUSED(tensor);
+}
 
-    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+// this is a very naive implementation, but for our case the number of free blocks should be very small
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
-    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
+
+    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
 
 #ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, tensor);
+    remove_allocated_tensor(alloc, offset, tensor);
 #endif
 
     // see if we can merge with an existing block
     for (int i = 0; i < alloc->n_free_blocks; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         // check if ptr is at the end of the block
-        if ((char*)block->addr + block->size == ptr) {
+        if (block->offset + block->size == offset) {
             block->size += size;
             // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
                 block->size += alloc->free_blocks[i+1].size;
                 alloc->n_free_blocks--;
                 for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
         // check if ptr is at the beginning of the block
-        if ((char*)ptr + size == block->addr) {
-            block->addr = ptr;
+        if (offset + size == block->offset) {
+            block->offset = offset;
             block->size += size;
             // check if we can merge with the previous block
-            if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
                 alloc->free_blocks[i-1].size += block->size;
                 alloc->n_free_blocks--;
                 for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
     int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
         insert_pos++;
     }
     // shift all blocks from insert_pos onward to make room for the new block
@@ -216,337 +301,271 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         alloc->free_blocks[i] = alloc->free_blocks[i-1];
     }
     // insert the new block
-    alloc->free_blocks[insert_pos].addr = ptr;
+    alloc->free_blocks[insert_pos].offset = offset;
     alloc->free_blocks[insert_pos].size = size;
     alloc->n_free_blocks++;
+
+    GGML_UNUSED(tensor);
 }
 
-void ggml_tallocr_reset(ggml_tallocr_t alloc) {
+static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->n_free_blocks = 1;
-    size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
-    alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
-
-    if (alloc->measure) {
-        alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-    } else {
-        alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
-        ggml_backend_buffer_reset(alloc->buffer);
-    }
+    alloc->free_blocks[0].offset = 0;
+    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+    alloc->max_size = 0;
 }
 
-ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
-
-    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
+static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
-    *alloc = (struct ggml_tallocr) {
-        /*.buffer        = */ buffer,
-        /*.buffer_owned  = */ true,
-        /*.base          = */ ggml_backend_buffer_get_base(buffer),
+    *alloc = (struct ggml_dyn_tallocr) {
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
         /*.max_size      = */ 0,
-        /*.measure       = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ {0},
+        /*.allocated_tensors = */ {{0}},
 #endif
     };
 
-    ggml_tallocr_reset(alloc);
-
-    return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
-    ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
-    alloc->measure = true;
+    ggml_dyn_tallocr_reset(alloc);
 
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
-    // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
-
-    // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
-    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
-    alloc->buffer_owned = true;
-    alloc->measure = true;
-    ggml_tallocr_reset(alloc);
-    return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
-    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
-    // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
-    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
-    alloc->buffer_owned = true;
-    return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
-    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
-
-    *alloc = (struct ggml_tallocr) {
-        /*.buffer        = */ buffer,
-        /*.buffer_owned  = */ false,
-        /*.base          = */ ggml_backend_buffer_get_base(buffer),
-        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks   = */ {{0}},
-        /*.max_size      = */ 0,
-        /*.measure       = */ false,
-#ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ {0},
-#endif
-    };
-
-    ggml_tallocr_reset(alloc);
-
-    return alloc;
-}
-
-struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
-    return alloc->buffer;
-}
-
-void ggml_tallocr_free(ggml_tallocr_t alloc) {
-    if (alloc == NULL) {
-        return;
-    }
-
-    if (alloc->buffer_owned) {
-        ggml_backend_buffer_free(alloc->buffer);
-    }
+static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
     free(alloc);
 }
 
-bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
-    return alloc->measure;
+static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
+    return alloc->max_size;
 }
 
-size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-    // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
-    // to avoid this, we add a 10% margin to the buffer size
-    return alloc->max_size + alloc->max_size/10;
-}
+
+/////////////////////////////////////
 
 // graph allocator
 
 struct hash_node {
     int n_children;
     int n_views;
+    int buffer_id;
+    size_t offset; // offset within the buffer
+    bool allocated;
+};
+
+//
+struct tensor_alloc {
+    size_t offset;
+    size_t size_max; // 0 = pre-allocated, unused, or view
+};
+
+struct node_alloc {
+    int buffer_id;
+    struct tensor_alloc dst;
+    struct tensor_alloc src[GGML_MAX_SRC];
 };
 
 struct ggml_gallocr {
-    ggml_tallocr_t talloc;
+    ggml_backend_buffer_type_t * bufts; // [n_buffers]
+    ggml_backend_buffer_t * buffers; // [n_buffers]
+    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
+    int n_buffers;
+
     struct ggml_hash_set hash_set;
-    struct hash_node * hash_values;
-    size_t hash_values_size;
-    ggml_tallocr_t * hash_allocs;
-    int * parse_seq;
-    int parse_seq_len;
+    struct hash_node * hash_values; // [hash_set.size]
+
+    struct node_alloc * node_allocs; // [n_nodes]
+    int n_nodes;
 };
 
-ggml_gallocr_t ggml_gallocr_new(void) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
-
-    *galloc = (struct ggml_gallocr) {
-        /*.talloc           = */ NULL,
-        /*.hash_set         = */ {0},
-        /*.hash_values      = */ NULL,
-        /*.hash_values_size = */ 0,
-        /*.hash_allocs      = */ NULL,
-        /*.parse_seq        = */ NULL,
-        /*.parse_seq_len    = */ 0,
-    };
+ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+    GGML_ASSERT(galloc != NULL);
+
+    galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+    GGML_ASSERT(galloc->bufts != NULL);
+
+    galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+    GGML_ASSERT(galloc->buffers != NULL);
+
+    galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+    GGML_ASSERT(galloc->buf_tallocs != NULL);
+
+    for (int i = 0; i < n_bufs; i++) {
+        galloc->bufts[i] = bufts[i];
+        galloc->buffers[i] = NULL;
+        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+    }
+    galloc->n_buffers = n_bufs;
 
     return galloc;
 }
 
+ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
+    return ggml_gallocr_new_n(&buft, 1);
+}
+
 void ggml_gallocr_free(ggml_gallocr_t galloc) {
     if (galloc == NULL) {
         return;
     }
 
-    if (galloc->hash_set.keys != NULL) {
-        free(galloc->hash_set.keys);
-    }
-    if (galloc->hash_values != NULL) {
-        free(galloc->hash_values);
-    }
-    if (galloc->hash_allocs != NULL) {
-        free(galloc->hash_allocs);
-    }
-    if (galloc->parse_seq != NULL) {
-        free(galloc->parse_seq);
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        if (galloc->buffers != NULL) {
+            ggml_backend_buffer_free(galloc->buffers[i]);
+        }
+        if (galloc->buf_tallocs != NULL) {
+            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+        }
     }
+
+    free(galloc->hash_set.keys);
+    free(galloc->hash_values);
+    free(galloc->bufts);
+    free(galloc->buffers);
+    free(galloc->buf_tallocs);
+    free(galloc->node_allocs);
     free(galloc);
 }
 
-void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, int * list, int n) {
-    free(galloc->parse_seq);
-    galloc->parse_seq = malloc(sizeof(int) * n);
+typedef struct ggml_gallocr * ggml_gallocr_t;
 
-    for (int i = 0; i < n; i++) {
-        galloc->parse_seq[i] = list[i];
-    }
-    galloc->parse_seq_len = n;
-}
-
-static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
     size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
     return &galloc->hash_values[i];
 }
 
-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
-    if (a->type != b->type) {
-        return false;
-    }
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
+static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+    return ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 
-static bool ggml_op_can_inplace(enum ggml_op op) {
-    switch (op) {
-        case GGML_OP_SCALE:
-        case GGML_OP_DIAG_MASK_ZERO:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_ADD:
-        case GGML_OP_ADD1:
-        case GGML_OP_SUB:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_SQR:
-        case GGML_OP_SQRT:
-        case GGML_OP_LOG:
-        case GGML_OP_UNARY:
-        case GGML_OP_ROPE:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
-            return true;
-
-        default:
-            return false;
-    }
+static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
+    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+    hn->buffer_id = buffer_id;
+    hn->offset = offset;
+    hn->allocated = true;
 }
 
-static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-    if (galloc->talloc != NULL) {
-        return galloc->talloc;
-    }
-
-    return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
+static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 
-static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
-    ggml_tallocr_t alloc = node_tallocr(galloc, view);
-
-    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-    if (update_backend) {
-        view->backend = view->view_src->backend;
-    }
-    // views are initialized in the alloc buffer rather than the view_src buffer
-    view->buffer  = alloc->buffer;
-    view->data    = (char *)view->view_src->data + view->view_offs;
+static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
 
-    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
+    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
+        hn->allocated = true;
+        assert(hn->offset == 0);
 
-    if (!alloc->measure) {
-        ggml_backend_buffer_init_tensor(alloc->buffer, view);
-    }
-}
+        // try to reuse a parent's buffer (inplace)
+        if (ggml_op_can_inplace(node->op)) {
+            for (int i = 0; i < GGML_MAX_SRC; i++) {
+                struct ggml_tensor * parent = node->src[i];
+                if (parent == NULL) {
+                    break;
+                }
 
-static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-    ggml_tallocr_t alloc = node_tallocr(galloc, node);
+                // if the node's data is external, then we cannot re-use it
+                if (!ggml_gallocr_is_own(galloc, parent)) {
+                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                    continue;
+                }
 
-    if (node->data == NULL) {
-        if (ggml_is_view(node)) {
-            init_view(galloc, node, true);
-        } else {
-            // see if we can reuse a parent's buffer (inplace)
-            if (ggml_op_can_inplace(node->op)) {
-                for (int i = 0; i < GGML_MAX_SRC; i++) {
-                    struct ggml_tensor * parent = node->src[i];
-                    if (parent == NULL) {
-                        break;
-                    }
+                // outputs cannot be reused
+                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
+                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
+                    continue;
+                }
 
-                    // if the node's data is external, then we cannot re-use it
-                    if (ggml_tallocr_is_own(alloc, parent) == false) {
-                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
-                        continue;
-                    }
+                if (!ggml_are_same_layout(node, parent)) {
+                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
+                    continue;
+                }
 
-                struct hash_node * p_hn = hash_get(galloc, parent);
-                if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = parent->view_src;
-                        struct hash_node * view_src_hn = hash_get(galloc, view_src);
-                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
-                            // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
-                            // the parent's data that it will need later (same layout requirement). the problem is that then
-                            // we cannot free the tensor because the original address of the allocation is lost.
-                            // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
-                            // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
-                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            node->view_src = view_src;
-                            view_src_hn->n_views += 1;
-                            init_view(galloc, node, false);
-                            return;
-                        }
-                    } else {
-                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                        node->view_src = parent;
-                        p_hn->n_views += 1;
-                        init_view(galloc, node, false);
+                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
+                    if (ggml_is_view(parent)) {
+                        struct ggml_tensor * view_src = parent->view_src;
+                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                            assert(view_src_hn->offset == p_hn->offset);
+                            hn->buffer_id = p_hn->buffer_id;
+                            hn->offset = p_hn->offset;
+                            p_hn->allocated = false; // avoid freeing the parent
+                            view_src_hn->allocated = false;
                             return;
                         }
+                    } else {
+                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                        hn->buffer_id = p_hn->buffer_id;
+                        hn->offset = p_hn->offset;
+                        p_hn->allocated = false; // avoid freeing the parent
+                        return;
                     }
                 }
             }
-            ggml_tallocr_alloc(alloc, node);
         }
+        // allocate tensor from the buffer
+        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
+        hn->buffer_id = buffer_id;
+        hn->offset = offset;
+        return;
     }
 }
 
-static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-    ggml_tallocr_t alloc = node_tallocr(galloc, node);
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+    // graph outputs are never freed
+    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+        AT_PRINTF("not freeing output %s\n", node->name);
+        return;
+    }
 
-    ggml_tallocr_free_tensor(alloc, node);
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+    size_t offset = hn->offset;
+    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+    hn->allocated = false;
 }
 
-static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
-    const int * parse_seq     = galloc->parse_seq;
-    int         parse_seq_len = galloc->parse_seq_len;
+static int get_node_buffer_id(const int * node_buffer_ids, int i) {
+    return node_buffer_ids ? node_buffer_ids[i] : 0;
+}
+
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+    // clear hash tables
+    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
+    memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
+
+    // allocate all graph inputs first to avoid overwriting them
+    for (int i = 0; i < graph->n_nodes; i++) {
+        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (graph->nodes[i]->src[j] == NULL) {
+                break;
+            }
+            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
+                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
+            }
+        }
+    }
 
     // count number of children and views
-    for (int i = 0; i < gf->n_nodes; i++) {
-        struct ggml_tensor * node = gf->nodes[i];
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
 
         if (ggml_is_view(node)) {
             struct ggml_tensor * view_src = node->view_src;
-            hash_get(galloc, view_src)->n_views += 1;
-            if (node->buffer == NULL && node->data != NULL) {
-                // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(galloc, node, true);
-            }
+            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
 
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
             if (parent == NULL) {
                 break;
             }
-            hash_get(galloc, parent)->n_children += 1;
-            if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(galloc, parent, true);
-            }
+            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
         }
     }
 
     // allocate tensors
-    // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
-    int last_barrier_pos = 0;
-    int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
-
-    for (int ind = 0; ind < n_nodes; ind++) {
-        // allocate a node if there is no parse_seq or this is not a barrier
-        if (parse_seq_len == 0 || parse_seq[ind] != -1) {
-            int i = parse_seq_len ? parse_seq[ind] : ind;
-            struct ggml_tensor * node = gf->nodes[i];
-
-            // allocate parents (leafs)
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                allocate_node(galloc, parent);
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        int buffer_id = get_node_buffer_id(node_buffer_ids, i);
+
+        // allocate parents (only leafs need to be allocated at this point)
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                break;
             }
+            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
+        }
 
-            // allocate node
-            allocate_node(galloc, node);
+        // allocate node
+        ggml_gallocr_allocate_node(galloc, node, buffer_id);
 
-            AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                AT_PRINTF("%s", parent->name);
-                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                    AT_PRINTF(", ");
-                }
+        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                break;
+            }
+            AT_PRINTF("%s", parent->name);
+            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                AT_PRINTF(", ");
             }
-            AT_PRINTF("\n");
         }
+        AT_PRINTF("\n");
 
         // update parents
-        // update immediately if there is no parse_seq
-        // update only at barriers if there is parse_seq
-        if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
-            int update_start = parse_seq_len ? last_barrier_pos : ind;
-            int update_end   = parse_seq_len ? ind              : ind + 1;
-            for (int i = update_start; i < update_end; i++) {
-                int node_i = parse_seq_len ? parse_seq[i] : i;
-                struct ggml_tensor * node = gf->nodes[node_i];
-
-                for (int j = 0; j < GGML_MAX_SRC; j++) {
-                    struct ggml_tensor * parent = node->src[j];
-                    if (parent == NULL) {
-                        break;
-                    }
-                    struct hash_node * p_hn = hash_get(galloc, parent);
-                    p_hn->n_children -= 1;
-
-                    //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, p_hn->n_children, p_hn->n_views);
-
-                    if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                if (ggml_is_view(parent)) {
-                    struct ggml_tensor * view_src = parent->view_src;
-                    struct hash_node * view_src_hn = hash_get(galloc, view_src);
-                    view_src_hn->n_views -= 1;
-                    AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
-                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
-                        free_node(galloc, view_src);
-                    }
-                }
-                else {
-                    free_node(galloc, parent);
-                }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                break;
+            }
+            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+            p_hn->n_children -= 1;
+
+            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
+                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
+
+            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                if (ggml_is_view(parent)) {
+                    struct ggml_tensor * view_src = parent->view_src;
+                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+                    view_src_hn->n_views -= 1;
+                    AT_PRINTF("view_src %s: %d children, %d views\n",
+                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
+                        ggml_gallocr_free_node(galloc, view_src, buffer_id);
                     }
                 }
+                else if (p_hn->allocated) {
+                    ggml_gallocr_free_node(galloc, parent, buffer_id);
+                }
             }
             AT_PRINTF("\n");
-            if (parse_seq_len) {
-                last_barrier_pos = ind + 1;
-            }
         }
     }
 }
 
-size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
     size_t hash_size = graph->visited_hash_table.size;
 
-    // check if the hash table is initialized and large enough
+    // initialize hash table
     if (galloc->hash_set.size < hash_size) {
-        if (galloc->hash_set.keys != NULL) {
-            free(galloc->hash_set.keys);
-        }
-        if (galloc->hash_values != NULL) {
-            free(galloc->hash_values);
-        }
-        galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+        free(galloc->hash_set.keys);
+        free(galloc->hash_values);
         galloc->hash_set.size = hash_size;
-        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+        galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
+        galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+        GGML_ASSERT(galloc->hash_set.keys != NULL);
+        GGML_ASSERT(galloc->hash_values != NULL);
+    } else {
+        // reset hash table
+        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
+        memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
    }
 
-    // reset hash table
-    memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
-    memset(galloc->hash_values,   0, sizeof(struct hash_node) * hash_size);
-
-    galloc->talloc = talloc;
-    ggml_tallocr_alloc_graph_impl(galloc, graph);
-    galloc->talloc = NULL;
-
-    size_t max_size = ggml_tallocr_max_size(talloc);
-
-    return max_size;
-}
-
-void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
-    const size_t hash_size = hash_set.size;
-
-    GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
+    // reset allocators
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
+    }
 
-    galloc->talloc = NULL;
+    // allocate in hash table
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
 
-    // alloc hash_values if needed
-    if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
-        free(galloc->hash_values);
-        galloc->hash_values      = malloc(sizeof(struct hash_node) * hash_size);
-        galloc->hash_values_size = hash_size;
+    // set the node_allocs from the hash table
+    if (galloc->n_nodes < graph->n_nodes) {
+        free(galloc->node_allocs);
+        galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+        GGML_ASSERT(galloc->node_allocs != NULL);
     }
-
-    // free hash_set.keys if needed
-    if (galloc->hash_set.keys != NULL) {
-        free(galloc->hash_set.keys);
+    galloc->n_nodes = graph->n_nodes;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        struct node_alloc * node_alloc = &galloc->node_allocs[i];
+        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
+        if (node->view_src || node->data) {
+            node_alloc->dst.offset = SIZE_MAX;
+            node_alloc->dst.size_max = 0;
+        } else {
+            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+            node_alloc->dst.offset = hn->offset;
+            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+        }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (!src || src->view_src || src->data) {
+                node_alloc->src[j].offset = SIZE_MAX;
+                node_alloc->src[j].size_max = 0;
+            } else {
+                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].offset = hn->offset;
+                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
+            }
+        }
     }
-    galloc->hash_set = hash_set;
 
-    // reset hash values
-    memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+    // reallocate buffers if needed
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
 
-    galloc->hash_allocs = hash_node_talloc;
-
-    ggml_tallocr_alloc_graph_impl(galloc, graph);
+        if (new_size > cur_size) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+#endif
+            ggml_backend_buffer_free(galloc->buffers[i]);
+            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+            if (galloc->buffers[i] == NULL) {
+                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                return false;
+            }
+        }
+    }
 
-    // remove unowned resources
-    galloc->hash_set.keys = NULL;
-    galloc->hash_allocs = NULL;
+    return true;
 }
 
-// legacy API wrapper
-
-struct ggml_allocr {
-    ggml_tallocr_t talloc;
-    ggml_gallocr_t galloc;
-};
-
-static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
-    ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
-    *alloc = (struct ggml_allocr) {
-        /*.talloc = */ talloc,
-        /*.galloc = */ ggml_gallocr_new(),
-    };
-    return alloc;
+bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+    return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }
 
-ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
-}
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
 
-ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
-}
+    if (node->view_src != NULL) {
+        if (node->buffer == NULL) {
+            assert(tensor_alloc->offset == SIZE_MAX);
+            if (node->view_src->buffer == NULL) {
+                // this tensor was allocated without ggml-backend
+                return;
+            }
+            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+        }
+    } else {
+        if (node->data == NULL) {
+            assert(tensor_alloc->offset != SIZE_MAX);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            void * addr = (char *)base + tensor_alloc->offset;
+            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+        } else {
+            if (node->buffer == NULL) {
+                // this tensor was allocated without ggml-backend
+                return;
+            }
 
-ggml_allocr_t ggml_allocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_from_buft(buft, size));
+#ifndef NDEBUG
+            size_t offset =
+                (char *)node->data -
+                (char *)ggml_backend_buffer_get_base(node->buffer);
+            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
+            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
+            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
+#endif
+        }
+    }
 }
 
-ggml_allocr_t ggml_allocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_buft(buft));
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+    return talloc->size_max >= node_size;
 }
 
-ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
-}
+static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+    if (galloc->n_nodes != graph->n_nodes) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+#endif
+        return true;
+    }
 
-ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
-}
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
-    return ggml_tallocr_get_buffer(alloc->talloc);
-}
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+#endif
+            return true;
+        }
 
-void ggml_allocr_free(ggml_allocr_t alloc) {
-    if (alloc == NULL) {
-        return;
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+#ifndef NDEBUG
+                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+#endif
+                return true;
+            }
+        }
     }
 
-    ggml_gallocr_free(alloc->galloc);
-    ggml_tallocr_free(alloc->talloc);
-    free(alloc);
+    return false;
 }
 
-bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
-    return ggml_tallocr_is_measure(alloc->talloc);
-}
+bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+    if (ggml_gallocr_needs_realloc(galloc, graph)) {
+        if (galloc->n_buffers == 1) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+#endif
+            if (!ggml_gallocr_reserve(galloc, graph)) {
+                return false;
+            }
+        } else {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+#endif
+            return false;
+        }
+    }
 
-void ggml_allocr_reset(ggml_allocr_t alloc) {
-    ggml_tallocr_reset(alloc->talloc);
-}
+    // reset buffers
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        // zero size buffers are not allocated
+        if (galloc->buffers[i] != NULL) {
+            ggml_backend_buffer_reset(galloc->buffers[i]);
+        }
+    }
 
-void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
-    ggml_tallocr_alloc(alloc->talloc, tensor);
-}
+    // allocate the graph tensors from the previous assignments
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        struct node_alloc * node_alloc = &galloc->node_allocs[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+        }
+        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+    }
 
-size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
-    return ggml_tallocr_max_size(alloc->talloc);
+    return true;
 }
 
-size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
-    return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
+size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+    if (galloc->buffers[buffer_id] == NULL) {
+        return 0;
+    }
+    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
 
 // utils
@@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
     }
 
-    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
 
     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
-            } else {
+            } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
             }
         } else {
-            if (t->view_src != NULL) {
+            if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
                 ggml_backend_view_init(buffer, t);
             }
@@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     }
 
     if (this_size > max_size) {
-        // tensor is too large to fit in a single buffer
         fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                 __func__, t->name,
                 ggml_backend_buft_name(buft),
@@ -870,7 +944,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     }
 
     if (n_buffers == 0) {
-        // all the tensors in the context are already allocated
 #ifndef NDEBUG
         fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
 #endif