llama_cpp 0.12.5 → 0.12.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/ext/llama_cpp/llama_cpp.cpp +67 -10
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +51 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +595 -492
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +268 -271
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +560 -346
- data/vendor/tmp/llama.cpp/ggml-impl.h +20 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +101 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +608 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +1255 -94
- data/vendor/tmp/llama.cpp/ggml-quants.h +39 -16
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +95 -264
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +213 -58
- data/vendor/tmp/llama.cpp/ggml.c +1082 -564
- data/vendor/tmp/llama.cpp/ggml.h +50 -17
- data/vendor/tmp/llama.cpp/llama.cpp +1329 -280
- data/vendor/tmp/llama.cpp/llama.h +43 -1
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +1 -1
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-alloc.c

@@ -17,6 +17,50 @@
 //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
 #define AT_PRINTF(...)

+
+static bool ggml_is_view(const struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool ggml_op_can_inplace(enum ggml_op op) {
+    switch (op) {
+        case GGML_OP_SCALE:
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_UNARY:
+        case GGML_OP_ROPE:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SOFT_MAX:
+            return true;
+
+        default:
+            return false;
+    }
+}
+
 // TODO: GGML_PAD ?
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
@@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     return offset + align;
 }

+// tallocr
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
+
+ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
+    ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
+    if (talloc == NULL) {
+        return NULL;
+    }
+
+    void * base = ggml_backend_buffer_get_base(buffer);
+    size_t align = ggml_backend_buffer_get_alignment(buffer);
+
+    assert(align && !(align & (align - 1))); // power of 2
+
+    *talloc = (struct ggml_tallocr) {
+        /*.buffer = */ buffer,
+        /*.base = */ base,
+        /*.alignment = */ align,
+        /*.offset = */ aligned_offset(base, 0, align),
+    };
+    return talloc;
+}
+
+void ggml_tallocr_free(ggml_tallocr_t talloc) {
+    free(talloc);
+}
+
+void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
+    size = GGML_PAD(size, talloc->alignment);
+
+    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
+        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
+        GGML_ASSERT(!"not enough space in the buffer");
+        return;
+    }
+
+    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
+    talloc->offset += size;
+
+    assert(((uintptr_t)addr % talloc->alignment) == 0);
+
+    ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
+}
+
+// dynamic tensor allocator
+
 struct free_block {
-
+    size_t offset;
     size_t size;
 };

-struct
-    struct ggml_backend_buffer * buffer;
-    bool buffer_owned;
-    void * base;
+struct ggml_dyn_tallocr {
     size_t alignment;
-
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
-
     size_t max_size;

-    bool measure;
-
 #ifdef GGML_ALLOCATOR_DEBUG
-    struct
+    struct {
+        const struct ggml_tensor * tensor;
+        size_t offset;
+    } allocated_tensors[1024];
 #endif
 };

 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i] == NULL) {
-            alloc->allocated_tensors[i] = tensor;
+        if (alloc->allocated_tensors[i].tensor == NULL) {
+            alloc->allocated_tensors[i].tensor = tensor;
+            alloc->allocated_tensors[i].offset = offset;
             return;
         }
     }
     GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i] ==
-
-            alloc->allocated_tensors[i] = NULL;
+        if (alloc->allocated_tensors[i].offset == offset) {
+            alloc->allocated_tensors[i].tensor = NULL;
             return;
         }
     }
-
+    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
     GGML_ASSERT(!"tensor not found");
 }
 #endif

-
-static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-    return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
-}
-
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
-    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
-    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);

     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
-
-
+            // this should never happen
+            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                    __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-
+            GGML_UNREACHABLE();
         }
     }

     struct free_block * block = &alloc->free_blocks[best_fit_block];
-
-    block->
+    size_t offset = block->offset;
+    block->offset = offset + size;
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
@@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }

-    AT_PRINTF("block %d,
-
-    tensor->data = addr;
-    tensor->buffer = alloc->buffer;
-    if (!alloc->measure) {
-        ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
-    }
+    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);

 #ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, tensor);
-    size_t cur_max =
+    add_allocated_tensor(alloc, offset, tensor);
+    size_t cur_max = offset + size;
     if (cur_max > alloc->max_size) {
-
+        // sort allocated_tensors by offset
         for (int i = 0; i < 1024; i++) {
-
-
+            for (int j = i + 1; j < 1024; j++) {
+                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
+                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
+                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
+                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+                    alloc->allocated_tensors[j].tensor = tmp_tensor;
+                    alloc->allocated_tensors[j].offset = tmp_offset;
+                }
             }
         }
-
+        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        for (int i = 0; i < 1024; i++) {
+            if (alloc->allocated_tensors[i].tensor) {
+                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                        alloc->allocated_tensors[i].offset,
+                        alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                        ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
+            }
+        }
+        fprintf(stderr, "\n");
     }
 #endif

-    alloc->max_size = MAX(alloc->max_size,
-}
+    alloc->max_size = MAX(alloc->max_size, offset + size);

-
-static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    if (ggml_tallocr_is_own(alloc, tensor) == false) {
-        // the tensor was not allocated in this buffer
-        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
-        // the easiest way to deal with this is just to ignore it
-        // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
-        return;
-    }
+    return offset;

-
+    GGML_UNUSED(tensor);
+}

-
+// this is a very naive implementation, but for our case the number of free blocks should be very small
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
-
+
+    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);

 #ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, tensor);
+    remove_allocated_tensor(alloc, offset, tensor);
 #endif

     // see if we can merge with an existing block
     for (int i = 0; i < alloc->n_free_blocks; i++) {
         struct free_block * block = &alloc->free_blocks[i];
         // check if ptr is at the end of the block
-        if (
+        if (block->offset + block->size == offset) {
             block->size += size;
             // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 &&
+            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
                 block->size += alloc->free_blocks[i+1].size;
                 alloc->n_free_blocks--;
                 for (int j = i+1; j < alloc->n_free_blocks; j++) {
@@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
         // check if ptr is at the beginning of the block
-        if (
-            block->
+        if (offset + size == block->offset) {
+            block->offset = offset;
             block->size += size;
             // check if we can merge with the previous block
-            if (i > 0 &&
+            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
                 alloc->free_blocks[i-1].size += block->size;
                 alloc->n_free_blocks--;
                 for (int j = i; j < alloc->n_free_blocks; j++) {
@@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
     GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
     // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
     int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].
+    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
         insert_pos++;
     }
     // shift all blocks from insert_pos onward to make room for the new block
@@ -216,565 +301,585 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
|
|
216
301
|
alloc->free_blocks[i] = alloc->free_blocks[i-1];
|
217
302
|
}
|
218
303
|
// insert the new block
|
219
|
-
alloc->free_blocks[insert_pos].
|
304
|
+
alloc->free_blocks[insert_pos].offset = offset;
|
220
305
|
alloc->free_blocks[insert_pos].size = size;
|
221
306
|
alloc->n_free_blocks++;
|
307
|
+
|
308
|
+
GGML_UNUSED(tensor);
|
222
309
|
}
|
223
310
|
|
224
|
-
void
|
311
|
+
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
225
312
|
alloc->n_free_blocks = 1;
|
226
|
-
|
227
|
-
alloc->free_blocks[0].
|
228
|
-
|
229
|
-
if (alloc->measure) {
|
230
|
-
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
231
|
-
} else {
|
232
|
-
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
233
|
-
ggml_backend_buffer_reset(alloc->buffer);
|
234
|
-
}
|
313
|
+
alloc->free_blocks[0].offset = 0;
|
314
|
+
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
315
|
+
alloc->max_size = 0;
|
235
316
|
}
|
236
317
|
|
237
|
-
|
238
|
-
struct
|
239
|
-
|
240
|
-
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
|
318
|
+
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
|
319
|
+
struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
|
241
320
|
|
242
|
-
*alloc = (struct
|
243
|
-
/*.buffer = */ buffer,
|
244
|
-
/*.buffer_owned = */ true,
|
245
|
-
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
321
|
+
*alloc = (struct ggml_dyn_tallocr) {
|
246
322
|
/*.alignment = */ alignment,
|
247
323
|
/*.n_free_blocks = */ 0,
|
248
324
|
/*.free_blocks = */ {{0}},
|
249
325
|
/*.max_size = */ 0,
|
250
|
-
/*.measure = */ false,
|
251
326
|
#ifdef GGML_ALLOCATOR_DEBUG
|
252
|
-
/*.allocated_tensors = */ {0},
|
327
|
+
/*.allocated_tensors = */ {{0}},
|
253
328
|
#endif
|
254
329
|
};
|
255
330
|
|
256
|
-
|
257
|
-
|
258
|
-
return alloc;
|
259
|
-
}
|
260
|
-
|
261
|
-
ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
|
262
|
-
ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
|
263
|
-
alloc->measure = true;
|
331
|
+
ggml_dyn_tallocr_reset(alloc);
|
264
332
|
|
265
333
|
return alloc;
|
266
334
|
}
|
267
335
|
|
268
|
-
|
269
|
-
|
270
|
-
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
|
271
|
-
|
272
|
-
// TODO: move alloc initialization to a common ggml_tallocr_new_impl function
|
273
|
-
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
|
274
|
-
alloc->buffer_owned = true;
|
275
|
-
alloc->measure = true;
|
276
|
-
ggml_tallocr_reset(alloc);
|
277
|
-
return alloc;
|
336
|
+
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
|
337
|
+
free(alloc);
|
278
338
|
}
|
279
339
|
|
280
|
-
|
281
|
-
return
|
340
|
+
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
|
341
|
+
return alloc->max_size;
|
282
342
|
}
|
283
343
|
|
284
|
-
ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
|
285
|
-
// create a backend buffer to get the correct tensor allocation sizes
|
286
|
-
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
287
|
-
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
|
288
|
-
alloc->buffer_owned = true;
|
289
|
-
return alloc;
|
290
|
-
}
|
291
344
|
|
292
|
-
|
293
|
-
return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
|
294
|
-
}
|
345
|
+
/////////////////////////////////////
|
295
346
|
|
296
|
-
|
297
|
-
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
|
347
|
+
// graph allocator
|
298
348
|
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
/*.max_size = */ 0,
|
307
|
-
/*.measure = */ false,
|
308
|
-
#ifdef GGML_ALLOCATOR_DEBUG
|
309
|
-
/*.allocated_tensors = */ {0},
|
310
|
-
#endif
|
311
|
-
};
|
349
|
+
struct hash_node {
|
350
|
+
int n_children;
|
351
|
+
int n_views;
|
352
|
+
int buffer_id;
|
353
|
+
size_t offset; // offset within the buffer
|
354
|
+
bool allocated;
|
355
|
+
};
|
312
356
|
|
313
|
-
|
357
|
+
//
|
358
|
+
struct tensor_alloc {
|
359
|
+
size_t offset;
|
360
|
+
size_t size_max; // 0 = pre-allocated, unused, or view
|
361
|
+
};
|
314
362
|
|
315
|
-
|
316
|
-
|
363
|
+
struct node_alloc {
|
364
|
+
int buffer_id;
|
365
|
+
struct tensor_alloc dst;
|
366
|
+
struct tensor_alloc src[GGML_MAX_SRC];
|
367
|
+
};
|
317
368
|
|
318
|
-
struct
|
319
|
-
|
320
|
-
|
369
|
+
struct ggml_gallocr {
|
370
|
+
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
371
|
+
ggml_backend_buffer_t * buffers; // [n_buffers]
|
372
|
+
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
373
|
+
int n_buffers;
|
321
374
|
|
322
|
-
|
323
|
-
|
324
|
-
return;
|
325
|
-
}
|
375
|
+
struct ggml_hash_set hash_set;
|
376
|
+
struct hash_node * hash_values; // [hash_set.size]
|
326
377
|
|
327
|
-
|
328
|
-
|
329
|
-
}
|
330
|
-
free(alloc);
|
331
|
-
}
|
378
|
+
struct node_alloc * node_allocs; // [n_nodes]
|
379
|
+
int n_nodes;
|
332
380
|
|
333
|
-
|
334
|
-
|
335
|
-
}
|
381
|
+
struct tensor_alloc * leaf_allocs; // [n_leafs]
|
382
|
+
int n_leafs;
|
383
|
+
};
|
336
384
|
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
return alloc->max_size + alloc->max_size/10;
|
341
|
-
}
|
385
|
+
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
386
|
+
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
|
387
|
+
GGML_ASSERT(galloc != NULL);
|
342
388
|
|
343
|
-
|
389
|
+
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
|
390
|
+
GGML_ASSERT(galloc->bufts != NULL);
|
344
391
|
|
345
|
-
|
346
|
-
|
347
|
-
int n_views;
|
348
|
-
};
|
392
|
+
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
|
393
|
+
GGML_ASSERT(galloc->buffers != NULL);
|
349
394
|
|
350
|
-
struct
|
351
|
-
|
352
|
-
struct ggml_hash_set hash_set;
|
353
|
-
struct hash_node * hash_values;
|
354
|
-
size_t hash_values_size;
|
355
|
-
ggml_tallocr_t * hash_allocs;
|
356
|
-
int * parse_seq;
|
357
|
-
int parse_seq_len;
|
358
|
-
};
|
395
|
+
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
|
396
|
+
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
359
397
|
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
/*.hash_values_size = */ 0,
|
368
|
-
/*.hash_allocs = */ NULL,
|
369
|
-
/*.parse_seq = */ NULL,
|
370
|
-
/*.parse_seq_len = */ 0,
|
371
|
-
};
|
398
|
+
for (int i = 0; i < n_bufs; i++) {
|
399
|
+
galloc->bufts[i] = bufts[i];
|
400
|
+
galloc->buffers[i] = NULL;
|
401
|
+
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
|
402
|
+
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
|
403
|
+
}
|
404
|
+
galloc->n_buffers = n_bufs;
|
372
405
|
|
373
406
|
return galloc;
|
374
407
|
}
|
375
408
|
|
409
|
+
ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
|
410
|
+
return ggml_gallocr_new_n(&buft, 1);
|
411
|
+
}
|
412
|
+
|
376
413
|
void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
377
414
|
if (galloc == NULL) {
|
378
415
|
return;
|
379
416
|
}
|
380
417
|
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
free(galloc->hash_allocs);
|
389
|
-
}
|
390
|
-
if (galloc->parse_seq != NULL) {
|
391
|
-
free(galloc->parse_seq);
|
418
|
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
419
|
+
if (galloc->buffers != NULL) {
|
420
|
+
ggml_backend_buffer_free(galloc->buffers[i]);
|
421
|
+
}
|
422
|
+
if (galloc->buf_tallocs != NULL) {
|
423
|
+
ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
|
424
|
+
}
|
392
425
|
}
|
426
|
+
|
427
|
+
free(galloc->hash_set.keys);
|
428
|
+
free(galloc->hash_values);
|
429
|
+
free(galloc->bufts);
|
430
|
+
free(galloc->buffers);
|
431
|
+
free(galloc->buf_tallocs);
|
432
|
+
free(galloc->node_allocs);
|
433
|
+
free(galloc->leaf_allocs);
|
393
434
|
free(galloc);
|
394
435
|
}
|
395
436
|
|
396
|
-
|
397
|
-
free(galloc->parse_seq);
|
398
|
-
galloc->parse_seq = malloc(sizeof(int) * n);
|
437
|
+
typedef struct ggml_gallocr * ggml_gallocr_t;
|
399
438
|
|
400
|
-
|
401
|
-
galloc->parse_seq[i] = list[i];
|
402
|
-
}
|
403
|
-
galloc->parse_seq_len = n;
|
404
|
-
}
|
405
|
-
|
406
|
-
static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
439
|
+
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
407
440
|
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
|
408
441
|
return &galloc->hash_values[i];
|
409
442
|
}
|
410
443
|
|
411
|
-
static bool
|
412
|
-
|
413
|
-
return false;
|
414
|
-
}
|
415
|
-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
416
|
-
if (a->ne[i] != b->ne[i]) {
|
417
|
-
return false;
|
418
|
-
}
|
419
|
-
if (a->nb[i] != b->nb[i]) {
|
420
|
-
return false;
|
421
|
-
}
|
422
|
-
}
|
423
|
-
return true;
|
444
|
+
static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
445
|
+
return ggml_gallocr_hash_get(galloc, t)->allocated;
|
424
446
|
}
|
425
447
|
|
426
|
-
static
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
case GGML_OP_ADD:
|
432
|
-
case GGML_OP_ADD1:
|
433
|
-
case GGML_OP_SUB:
|
434
|
-
case GGML_OP_MUL:
|
435
|
-
case GGML_OP_DIV:
|
436
|
-
case GGML_OP_SQR:
|
437
|
-
case GGML_OP_SQRT:
|
438
|
-
case GGML_OP_LOG:
|
439
|
-
case GGML_OP_UNARY:
|
440
|
-
case GGML_OP_ROPE:
|
441
|
-
case GGML_OP_RMS_NORM:
|
442
|
-
case GGML_OP_SOFT_MAX:
|
443
|
-
return true;
|
444
|
-
|
445
|
-
default:
|
446
|
-
return false;
|
447
|
-
}
|
448
|
+
static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
|
449
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
450
|
+
hn->buffer_id = buffer_id;
|
451
|
+
hn->offset = offset;
|
452
|
+
hn->allocated = true;
|
448
453
|
}
|
449
454
|
|
450
|
-
static
|
451
|
-
|
452
|
-
return galloc->talloc;
|
453
|
-
}
|
454
|
-
|
455
|
-
return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
|
455
|
+
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
456
|
+
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
|
456
457
|
}
|
457
458
|
|
458
|
-
static void
|
459
|
-
|
459
|
+
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
460
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
460
461
|
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
}
|
465
|
-
// views are initialized in the alloc buffer rather than the view_src buffer
|
466
|
-
view->buffer = alloc->buffer;
|
467
|
-
view->data = (char *)view->view_src->data + view->view_offs;
|
462
|
+
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
|
463
|
+
hn->allocated = true;
|
464
|
+
assert(hn->offset == 0);
|
468
465
|
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
466
|
+
// try to reuse a parent's buffer (inplace)
|
467
|
+
if (ggml_op_can_inplace(node->op)) {
|
468
|
+
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
469
|
+
struct ggml_tensor * parent = node->src[i];
|
470
|
+
if (parent == NULL) {
|
471
|
+
continue;
|
472
|
+
}
|
475
473
|
|
476
|
-
|
477
|
-
|
474
|
+
// if the node's data is external, then we cannot re-use it
|
475
|
+
if (!ggml_gallocr_is_own(galloc, parent)) {
|
476
|
+
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
|
477
|
+
continue;
|
478
|
+
}
|
478
479
|
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
if (ggml_op_can_inplace(node->op)) {
|
485
|
-
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
486
|
-
struct ggml_tensor * parent = node->src[i];
|
487
|
-
if (parent == NULL) {
|
488
|
-
break;
|
489
|
-
}
|
480
|
+
// outputs cannot be reused
|
481
|
+
if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
|
482
|
+
AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
|
483
|
+
continue;
|
484
|
+
}
|
490
485
|
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
}
|
486
|
+
if (!ggml_are_same_layout(node, parent)) {
|
487
|
+
AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
|
488
|
+
continue;
|
489
|
+
}
|
496
490
|
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
node->view_src = view_src;
|
510
|
-
view_src_hn->n_views += 1;
|
511
|
-
init_view(galloc, node, false);
|
512
|
-
return;
|
513
|
-
}
|
514
|
-
} else {
|
515
|
-
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
516
|
-
node->view_src = parent;
|
517
|
-
p_hn->n_views += 1;
|
518
|
-
init_view(galloc, node, false);
|
491
|
+
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
|
492
|
+
if (p_hn->n_children == 1 && p_hn->n_views == 0) {
|
493
|
+
if (ggml_is_view(parent)) {
|
494
|
+
struct ggml_tensor * view_src = parent->view_src;
|
495
|
+
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
|
496
|
+
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
497
|
+
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
498
|
+
assert(view_src_hn->offset == p_hn->offset);
|
499
|
+
hn->buffer_id = p_hn->buffer_id;
|
500
|
+
hn->offset = p_hn->offset;
|
501
|
+
p_hn->allocated = false; // avoid freeing the parent
|
502
|
+
view_src_hn->allocated = false;
|
519
503
|
return;
|
520
504
|
}
|
505
|
+
} else {
|
506
|
+
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
507
|
+
hn->buffer_id = p_hn->buffer_id;
|
508
|
+
hn->offset = p_hn->offset;
|
509
|
+
p_hn->allocated = false; // avoid freeing the parent
|
510
|
+
return;
|
521
511
|
}
|
522
512
|
}
|
523
513
|
}
|
524
|
-
ggml_tallocr_alloc(alloc, node);
|
525
514
|
}
|
515
|
+
// allocate tensor from the buffer
|
516
|
+
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
517
|
+
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
518
|
+
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
519
|
+
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
|
520
|
+
hn->buffer_id = buffer_id;
|
521
|
+
hn->offset = offset;
|
522
|
+
return;
|
526
523
|
}
|
527
524
|
}
|
528
525
|
|
529
|
-
static void
|
530
|
-
|
526
|
+
static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
527
|
+
// graph outputs are never freed
|
528
|
+
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
|
529
|
+
AT_PRINTF("not freeing output %s\n", node->name);
|
530
|
+
return;
|
531
|
+
}
|
532
|
+
|
533
|
+
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
534
|
+
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
535
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
536
|
+
size_t offset = hn->offset;
|
537
|
+
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
538
|
+
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
|
539
|
+
hn->allocated = false;
|
540
|
+
}
|
531
541
|
|
532
|
-
|
542
|
+
static int get_node_buffer_id(const int * node_buffer_ids, int i) {
|
543
|
+
return node_buffer_ids ? node_buffer_ids[i] : 0;
|
533
544
|
}
|
534
545
|
|
535
|
-
static void
|
536
|
-
|
537
|
-
|
546
|
+
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
|
547
|
+
// clear hash tables
|
548
|
+
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
|
549
|
+
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
|
538
550
|
|
539
551
|
// count number of children and views
|
540
|
-
|
541
|
-
|
552
|
+
// allocate all graph inputs and leafs first to avoid overwriting them
|
553
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
554
|
+
struct ggml_tensor * node = graph->nodes[i];
|
542
555
|
|
543
556
|
if (ggml_is_view(node)) {
|
544
557
|
struct ggml_tensor * view_src = node->view_src;
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
558
|
+
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
|
559
|
+
}
|
560
|
+
|
561
|
+
if (node->flags & GGML_TENSOR_FLAG_INPUT) {
|
562
|
+
ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
|
550
563
|
}
|
551
564
|
|
552
565
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
553
|
-
struct ggml_tensor *
|
554
|
-
if (
|
555
|
-
|
566
|
+
struct ggml_tensor * src = node->src[j];
|
567
|
+
if (src == NULL) {
|
568
|
+
continue;
|
556
569
|
}
|
557
|
-
|
558
|
-
|
559
|
-
|
570
|
+
|
571
|
+
ggml_gallocr_hash_get(galloc, src)->n_children += 1;
|
572
|
+
|
573
|
+
// allocate explicit inputs and leafs
|
574
|
+
if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
|
575
|
+
ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
|
560
576
|
}
|
561
577
|
}
|
562
|
-
|
578
|
+
}
|
579
|
+
|
580
|
+
// allocate the remaining leafs that are unused on the graph
|
581
|
+
// these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
|
582
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
583
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
584
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
585
|
+
|
586
|
+
if (hn->n_children == 0) {
|
587
|
+
assert(!hn->allocated);
|
588
|
+
// since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
|
589
|
+
ggml_gallocr_allocate_node(galloc, leaf, 0);
|
590
|
+
}
|
591
|
+
}
|
563
592
|
|
564
593
|
// allocate tensors
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
// allocate parents (leafs)
|
576
|
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
577
|
-
struct ggml_tensor * parent = node->src[j];
|
578
|
-
if (parent == NULL) {
|
579
|
-
break;
|
580
|
-
}
|
581
|
-
allocate_node(galloc, parent);
|
594
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
595
|
+
struct ggml_tensor * node = graph->nodes[i];
|
596
|
+
int buffer_id = get_node_buffer_id(node_buffer_ids, i);
|
597
|
+
|
598
|
+
// allocate parents (only leafs need to be allocated at this point)
|
599
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
600
|
+
struct ggml_tensor * parent = node->src[j];
|
601
|
+
if (parent == NULL) {
|
602
|
+
continue;
|
582
603
|
}
|
604
|
+
ggml_gallocr_allocate_node(galloc, parent, buffer_id);
|
605
|
+
}
|
583
606
|
|
584
|
-
|
585
|
-
|
607
|
+
// allocate node
|
608
|
+
ggml_gallocr_allocate_node(galloc, node, buffer_id);
|
586
609
|
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
}
|
610
|
+
AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
|
611
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
612
|
+
struct ggml_tensor * parent = node->src[j];
|
613
|
+
if (parent == NULL) {
|
614
|
+
continue;
|
615
|
+
}
|
616
|
+
AT_PRINTF("%s", parent->name);
|
617
|
+
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
618
|
+
AT_PRINTF(", ");
|
597
619
|
}
|
598
|
-
AT_PRINTF("\n");
|
599
620
|
}
|
621
|
+
AT_PRINTF("\n");
|
600
622
|
|
601
623
|
// update parents
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
struct hash_node *
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
if (ggml_is_view(parent)) {
|
623
|
-
struct ggml_tensor * view_src = parent->view_src;
|
624
|
-
struct hash_node * view_src_hn = hash_get(galloc, view_src);
|
625
|
-
view_src_hn->n_views -= 1;
|
626
|
-
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
627
|
-
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
|
628
|
-
free_node(galloc, view_src);
|
629
|
-
}
|
630
|
-
}
|
631
|
-
else {
|
632
|
-
free_node(galloc, parent);
|
633
|
-
}
|
624
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
625
|
+
struct ggml_tensor * parent = node->src[j];
|
626
|
+
if (parent == NULL) {
|
627
|
+
continue;
|
628
|
+
}
|
629
|
+
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
|
630
|
+
p_hn->n_children -= 1;
|
631
|
+
|
632
|
+
AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
|
633
|
+
parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
|
634
|
+
|
635
|
+
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
636
|
+
if (ggml_is_view(parent)) {
|
637
|
+
struct ggml_tensor * view_src = parent->view_src;
|
638
|
+
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
|
639
|
+
view_src_hn->n_views -= 1;
|
640
|
+
AT_PRINTF("view_src %s: %d children, %d views\n",
|
641
|
+
view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
642
|
+
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
|
643
|
+
ggml_gallocr_free_node(galloc, view_src, buffer_id);
|
634
644
|
}
|
635
645
|
}
|
646
|
+
else if (p_hn->allocated) {
|
647
|
+
ggml_gallocr_free_node(galloc, parent, buffer_id);
|
648
|
+
}
|
636
649
|
}
|
637
650
|
AT_PRINTF("\n");
|
638
|
-
if (parse_seq_len) {
|
639
|
-
last_barrier_pos = ind + 1;
|
640
|
-
}
|
641
651
|
}
|
642
652
|
}
|
643
653
|
}
|
644
654
|
|
645
|
-
|
655
|
+
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
|
646
656
|
size_t hash_size = graph->visited_hash_table.size;
|
647
657
|
|
648
|
-
//
|
658
|
+
// initialize hash table
|
649
659
|
if (galloc->hash_set.size < hash_size) {
|
650
|
-
|
651
|
-
|
652
|
-
}
|
653
|
-
if (galloc->hash_values != NULL) {
|
654
|
-
free(galloc->hash_values);
|
655
|
-
}
|
656
|
-
galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
|
660
|
+
free(galloc->hash_set.keys);
|
661
|
+
free(galloc->hash_values);
|
657
662
|
galloc->hash_set.size = hash_size;
|
658
|
-
galloc->
|
663
|
+
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
|
664
|
+
galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
|
665
|
+
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
666
|
+
GGML_ASSERT(galloc->hash_values != NULL);
|
667
|
+
} else {
|
668
|
+
// reset hash table
|
669
|
+
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
|
670
|
+
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
|
659
671
|
}
|
660
672
|
|
661
|
-
// reset
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
galloc->talloc = talloc;
|
666
|
-
ggml_tallocr_alloc_graph_impl(galloc, graph);
|
667
|
-
galloc->talloc = NULL;
|
668
|
-
|
669
|
-
size_t max_size = ggml_tallocr_max_size(talloc);
|
670
|
-
|
671
|
-
return max_size;
|
672
|
-
}
|
673
|
-
|
674
|
-
void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
|
675
|
-
const size_t hash_size = hash_set.size;
|
676
|
-
|
677
|
-
GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
|
673
|
+
// reset allocators
|
674
|
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
675
|
+
ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
|
676
|
+
}
|
678
677
|
|
679
|
-
|
678
|
+
// allocate in hash table
|
679
|
+
ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
|
680
680
|
|
681
|
-
//
|
682
|
-
if (galloc->
|
683
|
-
free(galloc->
|
684
|
-
galloc->
|
685
|
-
galloc->
|
681
|
+
// set the node_allocs from the hash table
|
682
|
+
if (galloc->n_nodes < graph->n_nodes) {
|
683
|
+
free(galloc->node_allocs);
|
684
|
+
galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
|
685
|
+
GGML_ASSERT(galloc->node_allocs != NULL);
|
686
686
|
}
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
687
|
+
galloc->n_nodes = graph->n_nodes;
|
688
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
689
|
+
struct ggml_tensor * node = graph->nodes[i];
|
690
|
+
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
691
|
+
node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
|
692
|
+
if (node->view_src || node->data) {
|
693
|
+
node_alloc->dst.offset = SIZE_MAX;
|
694
|
+
node_alloc->dst.size_max = 0;
|
695
|
+
} else {
|
696
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
697
|
+
node_alloc->dst.offset = hn->offset;
|
698
|
+
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
|
699
|
+
}
|
700
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
701
|
+
struct ggml_tensor * src = node->src[j];
|
702
|
+
if (!src || src->view_src || src->data) {
|
703
|
+
node_alloc->src[j].offset = SIZE_MAX;
|
704
|
+
node_alloc->src[j].size_max = 0;
|
705
|
+
} else {
|
706
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
|
707
|
+
node_alloc->src[j].offset = hn->offset;
|
708
|
+
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
|
709
|
+
}
|
710
|
+
}
|
711
|
+
}
|
712
|
+
if (galloc->n_leafs < graph->n_leafs) {
|
713
|
+
free(galloc->leaf_allocs);
|
714
|
+
galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
|
715
|
+
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
716
|
+
}
|
717
|
+
galloc->n_leafs = graph->n_leafs;
|
718
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
719
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
720
|
+
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
721
|
+
galloc->leaf_allocs[i].offset = hn->offset;
|
722
|
+
galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
691
723
|
}
|
692
|
-
galloc->hash_set = hash_set;
|
693
|
-
|
694
|
-
// reset hash values
|
695
|
-
memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
|
696
724
|
|
697
|
-
|
725
|
+
// reallocate buffers if needed
|
726
|
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
727
|
+
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
|
728
|
+
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
698
729
|
|
699
|
-
|
730
|
+
if (new_size > cur_size) {
|
731
|
+
#ifndef NDEBUG
|
732
|
+
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
733
|
+
#endif
|
734
|
+
ggml_backend_buffer_free(galloc->buffers[i]);
|
735
|
+
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
736
|
+
if (galloc->buffers[i] == NULL) {
|
737
|
+
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
738
|
+
return false;
|
739
|
+
}
|
740
|
+
}
|
741
|
+
}
|
700
742
|
|
701
|
-
|
702
|
-
galloc->hash_set.keys = NULL;
|
703
|
-
galloc->hash_allocs = NULL;
|
743
|
+
return true;
|
704
744
|
}
|
705
745
|
|
706
|
-
|
707
|
-
|
708
|
-
struct ggml_allocr {
|
709
|
-
ggml_tallocr_t talloc;
|
710
|
-
ggml_gallocr_t galloc;
|
711
|
-
};
|
712
|
-
|
713
|
-
static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
|
714
|
-
ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
|
715
|
-
*alloc = (struct ggml_allocr) {
|
716
|
-
/*.talloc = */ talloc,
|
717
|
-
/*.galloc = */ ggml_gallocr_new(),
|
718
|
-
};
|
719
|
-
return alloc;
|
746
|
+
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
747
|
+
return ggml_gallocr_reserve_n(galloc, graph, NULL);
|
720
748
|
}
|
721
749
|
|
722
|
-
|
723
|
-
|
724
|
-
}
|
750
|
+
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
|
751
|
+
assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
|
725
752
|
|
726
|
-
|
727
|
-
|
753
|
+
if (node->view_src != NULL) {
|
754
|
+
if (node->buffer == NULL) {
|
755
|
+
assert(tensor_alloc->offset == SIZE_MAX);
|
756
|
+
if (node->view_src->buffer == NULL) {
|
757
|
+
// this tensor was allocated without ggml-backend
|
758
|
+
return;
|
759
|
+
}
|
760
|
+
ggml_backend_view_init(galloc->buffers[buffer_id], node);
|
761
|
+
}
|
762
|
+
} else {
|
763
|
+
if (node->data == NULL) {
|
764
|
+
assert(tensor_alloc->offset != SIZE_MAX);
|
765
|
+
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
|
766
|
+
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
|
767
|
+
void * addr = (char *)base + tensor_alloc->offset;
|
768
|
+
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
|
769
|
+
} else {
|
770
|
+
if (node->buffer == NULL) {
|
771
|
+
// this tensor was allocated without ggml-backend
|
772
|
+
return;
|
773
|
+
}
|
774
|
+
}
|
775
|
+
}
|
728
776
|
}
|
729
777
|
|
730
|
-
|
731
|
-
|
778
|
+
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
|
779
|
+
ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
|
780
|
+
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
|
781
|
+
return talloc->size_max >= node_size;
|
732
782
|
}
|
733
783
|
|
734
|
-
|
735
|
-
|
736
|
-
|
784
|
+
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
785
|
+
if (galloc->n_nodes != graph->n_nodes) {
|
786
|
+
#ifndef NDEBUG
|
787
|
+
fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
|
788
|
+
#endif
|
789
|
+
return true;
|
790
|
+
}
|
737
791
|
|
738
|
-
|
739
|
-
|
740
|
-
|
792
|
+
if (galloc->n_leafs != graph->n_leafs) {
|
793
|
+
#ifndef NDEBUG
|
794
|
+
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
|
795
|
+
#endif
|
796
|
+
return true;
|
797
|
+
}
|
741
798
|
|
742
|
-
|
743
|
-
|
744
|
-
|
799
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
800
|
+
struct ggml_tensor * node = graph->nodes[i];
|
801
|
+
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
745
802
|
|
746
|
-
|
747
|
-
|
748
|
-
|
803
|
+
if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
|
804
|
+
#ifndef NDEBUG
|
805
|
+
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
|
806
|
+
#endif
|
807
|
+
return true;
|
808
|
+
}
|
749
809
|
|
750
|
-
|
751
|
-
|
752
|
-
|
810
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
811
|
+
struct ggml_tensor * src = node->src[j];
|
812
|
+
if (src == NULL) {
|
813
|
+
continue;
|
814
|
+
}
|
815
|
+
if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
|
816
|
+
#ifndef NDEBUG
|
817
|
+
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
818
|
+
#endif
|
819
|
+
return true;
|
820
|
+
}
|
821
|
+
}
|
753
822
|
}
|
754
823
|
|
755
|
-
|
756
|
-
ggml_tallocr_free(alloc->talloc);
|
757
|
-
free(alloc);
|
824
|
+
return false;
|
758
825
|
}
|
759
826
|
|
760
|
-
bool
|
761
|
-
|
762
|
-
|
827
|
+
bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
828
|
+
if (ggml_gallocr_needs_realloc(galloc, graph)) {
|
829
|
+
if (galloc->n_buffers == 1) {
|
830
|
+
#ifndef NDEBUG
|
831
|
+
fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
|
832
|
+
#endif
|
833
|
+
if (!ggml_gallocr_reserve(galloc, graph)) {
|
834
|
+
return false;
|
835
|
+
}
|
836
|
+
} else {
|
837
|
+
#ifndef NDEBUG
|
838
|
+
fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
|
839
|
+
#endif
|
840
|
+
return false;
|
841
|
+
}
|
842
|
+
}
|
763
843
|
|
764
|
-
|
765
|
-
|
766
|
-
|
844
|
+
// reset buffers
|
845
|
+
for (int i = 0; i < galloc->n_buffers; i++) {
|
846
|
+
// zero size buffers are not allocated
|
847
|
+
if (galloc->buffers[i] != NULL) {
|
848
|
+
ggml_backend_buffer_reset(galloc->buffers[i]);
|
849
|
+
}
|
850
|
+
}
|
767
851
|
|
768
|
-
|
769
|
-
|
770
|
-
|
852
|
+
// allocate the graph tensors from the previous assignments
|
853
|
+
// nodes
|
854
|
+
for (int i = 0; i < graph->n_nodes; i++) {
|
855
|
+
struct ggml_tensor * node = graph->nodes[i];
|
856
|
+
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
857
|
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
858
|
+
struct ggml_tensor * src = node->src[j];
|
859
|
+
if (src == NULL) {
|
860
|
+
continue;
|
861
|
+
}
|
862
|
+
ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
|
863
|
+
}
|
864
|
+
ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
|
865
|
+
}
|
866
|
+
// leafs
|
867
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
868
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
869
|
+
struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
870
|
+
ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
|
871
|
+
}
|
771
872
|
|
772
|
-
|
773
|
-
return ggml_tallocr_max_size(alloc->talloc);
|
873
|
+
return true;
|
774
874
|
}
|
775
875
|
|
776
|
-
size_t
|
777
|
-
|
876
|
+
size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
877
|
+
GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
|
878
|
+
|
879
|
+
if (galloc->buffers[buffer_id] == NULL) {
|
880
|
+
return 0;
|
881
|
+
}
|
882
|
+
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
778
883
|
}
|
779
884
|
|
780
885
|
// utils
|
@@ -795,17 +900,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
         return false;
     }

-
+    struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);

     for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
-            } else {
+            } else if (t->buffer == NULL) {
                 ggml_backend_view_init(buffer, t);
             }
         } else {
-            if (t->view_src != NULL) {
+            if (t->view_src != NULL && t->buffer == NULL) {
                 // view of a pre-allocated tensor
                 ggml_backend_view_init(buffer, t);
             }
@@ -838,7 +943,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
         }

         if (this_size > max_size) {
-            // tensor is too large to fit in a single buffer
             fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                     __func__, t->name,
                     ggml_backend_buft_name(buft),
@@ -870,7 +974,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     }

     if (n_buffers == 0) {
-        // all the tensors in the context are already allocated
 #ifndef NDEBUG
         fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
 #endif