llama_cpp 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +396 -127
- data/ext/llama_cpp/src/ggml-metal.metal +290 -46
- data/ext/llama_cpp/src/ggml-opencl.cpp +47 -71
- data/ext/llama_cpp/src/ggml.c +71 -55
- data/ext/llama_cpp/src/ggml.h +15 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1851 -250
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8045208b5f7801979212a4f6ed395217e78f06bcfbc2d0362aaaa04c529745cd
|
4
|
+
data.tar.gz: 4011dfe279d8d4041c6c79dc5a6bad199777f83b5f0559f11ccd2f68c957e462
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d15e74da491773961006eca8ca6c6d80b30ffc995c56a9140961be0002eb09134f1a029c4e8ee192497fb7256fe36cf1c3ed928967ce57ece4c7a0904392c8fe
|
7
|
+
data.tar.gz: a863596304ddb9ac5e4be2b2b65bebc7d3913705b8a0f516debfee0ca213f9dca69707edda8d70cfafb15500fcb6e70cffb6d5d1119302d24e05059c50f0da77
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
## [[0.8.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.1...v0.8.0)] - 2023-10-21
|
2
|
+
|
3
|
+
**Breaking Changes**
|
4
|
+
- Bump bundled llama.cpp from b1380 to b1405
|
5
|
+
- Add column index argument to `set_seq_id` and `get_seq_id` methods in Batch.
|
6
|
+
- Add `special` keyword argument to `tokenize` method in Model.
|
7
|
+
- Add `n_seq_max` keyword argument to `initialize` method in Batch.
|
8
|
+
|
9
|
+
## [[0.7.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.0...v0.7.1)] - 2023-10-14
|
10
|
+
|
11
|
+
- Bump bundled llama.cpp from b1334 to b1380.
|
12
|
+
|
1
13
|
## [[0.7.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.6.0...v0.7.0)] - 2023-10-07
|
2
14
|
|
3
15
|
- Bump bundled llama.cpp from b1292 to b1334.
|
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -5,7 +5,7 @@ require 'fileutils'
|
|
5
5
|
|
6
6
|
abort 'libstdc++ is not found.' unless have_library('stdc++')
|
7
7
|
|
8
|
-
$srcs = %w[ggml.c ggml-alloc.c llama.cpp llama_cpp.cpp]
|
8
|
+
$srcs = %w[ggml.c ggml-backend.c ggml-alloc.c llama.cpp llama_cpp.cpp]
|
9
9
|
$srcs << 'ggml-opencl.cpp' if with_config('clblast')
|
10
10
|
$srcs << 'ggml-mpi.c' if with_config('mpi')
|
11
11
|
$CFLAGS << ' -w -DNDEBUG'
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -63,8 +63,8 @@ public:
|
|
63
63
|
rb_define_method(rb_cLLaMABatch, "get_token", RUBY_METHOD_FUNC(_llama_batch_get_token), 1);
|
64
64
|
rb_define_method(rb_cLLaMABatch, "set_pos", RUBY_METHOD_FUNC(_llama_batch_set_pos), 2);
|
65
65
|
rb_define_method(rb_cLLaMABatch, "get_pos", RUBY_METHOD_FUNC(_llama_batch_get_pos), 1);
|
66
|
-
rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 2);
|
67
|
-
rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 1);
|
66
|
+
rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 3);
|
67
|
+
rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 2);
|
68
68
|
rb_define_method(rb_cLLaMABatch, "set_logits", RUBY_METHOD_FUNC(_llama_batch_set_logits), 2);
|
69
69
|
rb_define_method(rb_cLLaMABatch, "get_logits", RUBY_METHOD_FUNC(_llama_batch_get_logits), 1);
|
70
70
|
}
|
@@ -74,10 +74,10 @@ private:
|
|
74
74
|
|
75
75
|
static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
|
76
76
|
VALUE kw_args = Qnil;
|
77
|
-
ID kw_table[
|
78
|
-
VALUE kw_values[
|
77
|
+
ID kw_table[3] = { rb_intern("n_tokens"), rb_intern("embd"), rb_intern("n_seq_max") };
|
78
|
+
VALUE kw_values[3] = { Qundef, Qundef, Qundef };
|
79
79
|
rb_scan_args(argc, argv, ":", &kw_args);
|
80
|
-
rb_get_kwargs(kw_args, kw_table,
|
80
|
+
rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
|
81
81
|
|
82
82
|
if (!RB_INTEGER_TYPE_P(kw_values[0])) {
|
83
83
|
rb_raise(rb_eArgError, "n_tokens must be an integer");
|
@@ -87,12 +87,17 @@ private:
|
|
87
87
|
rb_raise(rb_eArgError, "embd must be an integer");
|
88
88
|
return Qnil;
|
89
89
|
}
|
90
|
+
if (!RB_INTEGER_TYPE_P(kw_values[2])) {
|
91
|
+
rb_raise(rb_eArgError, "n_seq_max must be an integer");
|
92
|
+
return Qnil;
|
93
|
+
}
|
90
94
|
|
91
95
|
const int32_t n_tokens = NUM2INT(kw_values[0]);
|
92
96
|
const int32_t embd = NUM2INT(kw_values[1]);
|
97
|
+
const int32_t n_seq_max = NUM2INT(kw_values[2]);
|
93
98
|
|
94
99
|
LLaMABatchWrapper* ptr = get_llama_batch(self);
|
95
|
-
ptr->batch = llama_batch_init(n_tokens, embd);
|
100
|
+
ptr->batch = llama_batch_init(n_tokens, embd, n_seq_max);
|
96
101
|
|
97
102
|
return Qnil;
|
98
103
|
}
|
@@ -190,25 +195,35 @@ private:
|
|
190
195
|
}
|
191
196
|
|
192
197
|
// seq_id
|
193
|
-
static VALUE _llama_batch_set_seq_id(VALUE self, VALUE
|
198
|
+
static VALUE _llama_batch_set_seq_id(VALUE self, VALUE i_, VALUE j_, VALUE value) {
|
194
199
|
LLaMABatchWrapper* ptr = get_llama_batch(self);
|
195
|
-
const int32_t
|
196
|
-
if (
|
197
|
-
rb_raise(rb_eArgError, "
|
200
|
+
const int32_t i = NUM2INT(i_);
|
201
|
+
if (i < 0 || i >= ptr->batch.n_tokens) {
|
202
|
+
rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
|
203
|
+
return Qnil;
|
204
|
+
}
|
205
|
+
const int32_t j = NUM2INT(j_);
|
206
|
+
if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
|
207
|
+
rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
|
198
208
|
return Qnil;
|
199
209
|
}
|
200
|
-
ptr->batch.seq_id[
|
201
|
-
return INT2NUM(ptr->batch.seq_id[
|
210
|
+
ptr->batch.seq_id[i][j] = NUM2INT(value);
|
211
|
+
return INT2NUM(ptr->batch.seq_id[i][j]);
|
202
212
|
}
|
203
213
|
|
204
|
-
static VALUE _llama_batch_get_seq_id(VALUE self, VALUE
|
214
|
+
static VALUE _llama_batch_get_seq_id(VALUE self, VALUE i_, VALUE j_) {
|
205
215
|
LLaMABatchWrapper* ptr = get_llama_batch(self);
|
206
|
-
const int32_t
|
207
|
-
if (
|
208
|
-
rb_raise(rb_eArgError, "
|
216
|
+
const int32_t i = NUM2INT(i_);
|
217
|
+
if (i < 0 || i >= ptr->batch.n_tokens) {
|
218
|
+
rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
|
219
|
+
return Qnil;
|
220
|
+
}
|
221
|
+
const int32_t j = NUM2INT(j_);
|
222
|
+
if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
|
223
|
+
rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
|
209
224
|
return Qnil;
|
210
225
|
}
|
211
|
-
return INT2NUM(ptr->batch.seq_id[
|
226
|
+
return INT2NUM(ptr->batch.seq_id[i][j]);
|
212
227
|
}
|
213
228
|
|
214
229
|
// logits
|
@@ -1319,10 +1334,10 @@ private:
|
|
1319
1334
|
|
1320
1335
|
static VALUE _llama_model_tokenize(int argc, VALUE* argv, VALUE self) {
|
1321
1336
|
VALUE kw_args = Qnil;
|
1322
|
-
ID kw_table[
|
1323
|
-
VALUE kw_values[
|
1337
|
+
ID kw_table[4] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos"), rb_intern("special") };
|
1338
|
+
VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
|
1324
1339
|
rb_scan_args(argc, argv, ":", &kw_args);
|
1325
|
-
rb_get_kwargs(kw_args, kw_table, 1,
|
1340
|
+
rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
|
1326
1341
|
|
1327
1342
|
if (!RB_TYPE_P(kw_values[0], T_STRING)) {
|
1328
1343
|
rb_raise(rb_eArgError, "text must be a String");
|
@@ -1336,15 +1351,20 @@ private:
|
|
1336
1351
|
rb_raise(rb_eArgError, "add_bos must be a boolean");
|
1337
1352
|
return Qnil;
|
1338
1353
|
}
|
1354
|
+
if (kw_values[3] != Qundef && (kw_values[3] != Qtrue && kw_values[3] != Qfalse)) {
|
1355
|
+
rb_raise(rb_eArgError, "special must be a boolean");
|
1356
|
+
return Qnil;
|
1357
|
+
}
|
1339
1358
|
|
1340
1359
|
VALUE text_ = kw_values[0];
|
1341
1360
|
std::string text = StringValueCStr(text_);
|
1342
1361
|
const bool add_bos = kw_values[2] == Qtrue ? true : false;
|
1362
|
+
const bool special = kw_values[3] == Qtrue ? true : false;
|
1343
1363
|
const int n_max_tokens = kw_values[1] != Qundef ? NUM2INT(kw_values[1]) : text.size() + (add_bos ? 1 : 0);
|
1344
1364
|
|
1345
1365
|
llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
|
1346
1366
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1347
|
-
const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
|
1367
|
+
const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos, special);
|
1348
1368
|
|
1349
1369
|
if (n_tokens < 0) {
|
1350
1370
|
rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
|
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#include "ggml-alloc.h"
|
2
|
+
#include "ggml-backend.h"
|
2
3
|
#include "ggml.h"
|
3
4
|
#include <assert.h>
|
4
5
|
#include <stdarg.h>
|
@@ -6,25 +7,6 @@
|
|
6
7
|
#include <stdlib.h>
|
7
8
|
#include <string.h>
|
8
9
|
|
9
|
-
#ifdef __has_include
|
10
|
-
#if __has_include(<unistd.h>)
|
11
|
-
#include <unistd.h>
|
12
|
-
#if defined(_POSIX_MAPPED_FILES)
|
13
|
-
#include <sys/types.h>
|
14
|
-
#include <sys/mman.h>
|
15
|
-
#endif
|
16
|
-
#endif
|
17
|
-
#endif
|
18
|
-
|
19
|
-
#if defined(_WIN32)
|
20
|
-
#define WIN32_LEAN_AND_MEAN
|
21
|
-
#ifndef NOMINMAX
|
22
|
-
#define NOMINMAX
|
23
|
-
#endif
|
24
|
-
#include <windows.h>
|
25
|
-
#include <memoryapi.h>
|
26
|
-
#endif
|
27
|
-
|
28
10
|
|
29
11
|
#define UNUSED(x) (void)(x)
|
30
12
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
@@ -80,8 +62,9 @@ struct free_block {
|
|
80
62
|
#define MAX_FREE_BLOCKS 256
|
81
63
|
|
82
64
|
struct ggml_allocr {
|
65
|
+
struct ggml_backend_buffer * buffer;
|
66
|
+
bool buffer_owned;
|
83
67
|
void * data;
|
84
|
-
size_t size;
|
85
68
|
size_t alignment;
|
86
69
|
int n_free_blocks;
|
87
70
|
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
|
119
102
|
}
|
120
103
|
#endif
|
121
104
|
|
122
|
-
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
123
|
-
return ggml_nbytes(tensor);
|
124
|
-
|
125
|
-
UNUSED(alloc);
|
126
|
-
}
|
127
|
-
|
128
105
|
// check if a tensor is allocated by this buffer
|
129
106
|
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
130
|
-
|
131
|
-
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
107
|
+
return tensor->buffer == alloc->buffer;
|
132
108
|
}
|
133
109
|
|
134
110
|
static bool ggml_is_view(struct ggml_tensor * t) {
|
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
|
|
136
112
|
}
|
137
113
|
|
138
114
|
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
139
|
-
#ifdef GGML_ALLOCATOR_DEBUG
|
140
115
|
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
141
116
|
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
142
|
-
|
143
|
-
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
117
|
+
|
118
|
+
size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
|
144
119
|
size = aligned_offset(NULL, size, alloc->alignment);
|
145
120
|
|
146
121
|
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
|
188
163
|
|
189
164
|
tensor->data = addr;
|
190
165
|
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
|
166
|
+
tensor->buffer = alloc->buffer;
|
167
|
+
ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
|
191
168
|
|
192
169
|
#ifdef GGML_ALLOCATOR_DEBUG
|
193
170
|
add_allocated_tensor(alloc, tensor);
|
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
|
208
185
|
|
209
186
|
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
210
187
|
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
211
|
-
void * ptr = tensor->data;
|
212
|
-
|
213
188
|
if (ggml_allocr_is_own(alloc, tensor) == false) {
|
214
189
|
// the tensor was not allocated in this buffer
|
215
190
|
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
216
191
|
// the easiest way to deal with this is just to ignore it
|
192
|
+
AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
|
217
193
|
return;
|
218
194
|
}
|
219
195
|
|
220
|
-
|
196
|
+
void * ptr = tensor->data;
|
197
|
+
|
198
|
+
size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
|
221
199
|
size = aligned_offset(NULL, size, alloc->alignment);
|
222
200
|
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
223
|
-
|
201
|
+
|
202
|
+
ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
|
224
203
|
|
225
204
|
#ifdef GGML_ALLOCATOR_DEBUG
|
226
205
|
remove_allocated_tensor(alloc, tensor);
|
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
|
285
264
|
alloc->n_free_blocks = 1;
|
286
265
|
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
|
287
266
|
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
|
288
|
-
alloc->free_blocks[0].size = alloc->
|
267
|
+
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
289
268
|
}
|
290
269
|
|
291
270
|
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
|
292
|
-
struct
|
271
|
+
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
|
272
|
+
|
273
|
+
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
|
293
274
|
|
294
275
|
*alloc = (struct ggml_allocr){
|
295
|
-
/*.
|
296
|
-
/*.
|
276
|
+
/*.buffer = */ buffer,
|
277
|
+
/*.buffer_owned = */ true,
|
278
|
+
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
297
279
|
/*.alignment = */ alignment,
|
298
280
|
/*.n_free_blocks = */ 0,
|
299
281
|
/*.free_blocks = */ {{0}},
|
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
|
312
294
|
return alloc;
|
313
295
|
}
|
314
296
|
|
315
|
-
// OS specific functions to allocate and free uncommitted virtual memory
|
316
|
-
static void * alloc_vmem(size_t size) {
|
317
|
-
#if defined(_WIN32)
|
318
|
-
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
|
319
|
-
#elif defined(_POSIX_MAPPED_FILES)
|
320
|
-
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
|
321
|
-
if (ptr == MAP_FAILED) {
|
322
|
-
return NULL;
|
323
|
-
}
|
324
|
-
return ptr;
|
325
|
-
#else
|
326
|
-
// use a fixed address for other platforms
|
327
|
-
uintptr_t base_addr = (uintptr_t)-size - 0x100;
|
328
|
-
return (void *)base_addr;
|
329
|
-
#endif
|
330
|
-
}
|
331
|
-
|
332
|
-
static void free_vmem(void * base_addr, size_t size) {
|
333
|
-
#if defined(_WIN32)
|
334
|
-
VirtualFree(base_addr, 0, MEM_RELEASE);
|
335
|
-
UNUSED(size);
|
336
|
-
#elif defined(_POSIX_MAPPED_FILES)
|
337
|
-
munmap(base_addr, size);
|
338
|
-
#else
|
339
|
-
// nothing to do
|
340
|
-
UNUSED(base_addr);
|
341
|
-
UNUSED(size);
|
342
|
-
#endif
|
343
|
-
}
|
344
|
-
|
345
|
-
// allocate uncommitted virtual memory to measure the size of the graph
|
346
|
-
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
347
|
-
// 128GB for 64-bit, 1GB for 32-bit
|
348
|
-
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
|
349
|
-
do {
|
350
|
-
*base_addr = alloc_vmem(*size);
|
351
|
-
if (*base_addr != NULL) {
|
352
|
-
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
|
353
|
-
return;
|
354
|
-
}
|
355
|
-
// try again with half the size
|
356
|
-
*size /= 2;
|
357
|
-
} while (*size > 0);
|
358
|
-
|
359
|
-
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
|
360
|
-
}
|
361
|
-
|
362
|
-
static void free_measure_vmem(void * base_addr, size_t size) {
|
363
|
-
free_vmem(base_addr, size);
|
364
|
-
}
|
365
|
-
|
366
297
|
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
367
|
-
struct ggml_allocr * alloc = (
|
298
|
+
struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
|
299
|
+
alloc->measure = true;
|
368
300
|
|
369
|
-
|
370
|
-
|
301
|
+
return alloc;
|
302
|
+
}
|
371
303
|
|
372
|
-
|
304
|
+
struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
|
305
|
+
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
|
373
306
|
|
374
307
|
*alloc = (struct ggml_allocr){
|
375
|
-
/*.
|
376
|
-
/*.
|
377
|
-
/*.
|
308
|
+
/*.buffer = */ buffer,
|
309
|
+
/*.buffer_owned = */ false,
|
310
|
+
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
311
|
+
/*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
|
378
312
|
/*.n_free_blocks = */ 0,
|
379
313
|
/*.free_blocks = */ {{0}},
|
380
314
|
/*.hash_table = */ {{0}},
|
381
315
|
/*.max_size = */ 0,
|
382
|
-
/*.measure = */
|
316
|
+
/*.measure = */ false,
|
383
317
|
/*.parse_seq = */ {0},
|
384
318
|
/*.parse_seq_len = */ 0,
|
385
319
|
#ifdef GGML_ALLOCATOR_DEBUG
|
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
|
393
327
|
}
|
394
328
|
|
395
329
|
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
396
|
-
if (alloc->
|
397
|
-
|
330
|
+
if (alloc->buffer_owned) {
|
331
|
+
ggml_backend_buffer_free(alloc->buffer);
|
398
332
|
}
|
399
333
|
free(alloc);
|
400
334
|
}
|
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
|
437
371
|
case GGML_OP_ROPE:
|
438
372
|
case GGML_OP_RMS_NORM:
|
439
373
|
case GGML_OP_SOFT_MAX:
|
440
|
-
case GGML_OP_CONT:
|
441
374
|
return true;
|
442
375
|
|
443
376
|
default:
|
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
|
445
378
|
}
|
446
379
|
}
|
447
380
|
|
381
|
+
static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
|
382
|
+
assert(view->view_src != NULL && view->view_src->data != NULL);
|
383
|
+
view->backend = view->view_src->backend;
|
384
|
+
view->buffer = view->view_src->buffer;
|
385
|
+
view->data = (char *)view->view_src->data + view->view_offs;
|
386
|
+
|
387
|
+
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
|
388
|
+
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
|
389
|
+
assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
|
390
|
+
ggml_backend_buffer_init_tensor(alloc->buffer, view);
|
391
|
+
}
|
392
|
+
|
448
393
|
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
|
449
394
|
struct hash_node * ht = alloc->hash_table;
|
450
395
|
if (node->data == NULL) {
|
451
396
|
if (ggml_is_view(node)) {
|
452
|
-
|
453
|
-
node->data = (char *)node->view_src->data + node->view_offs;
|
397
|
+
init_view(alloc, node);
|
454
398
|
} else {
|
455
399
|
// see if we can reuse a parent's buffer (inplace)
|
456
400
|
if (ggml_op_can_inplace(node->op)) {
|
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
478
422
|
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
|
479
423
|
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
|
480
424
|
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
481
|
-
node->
|
425
|
+
node->view_src = view_src;
|
426
|
+
view_src_hn->n_views += 1;
|
427
|
+
init_view(alloc, node);
|
482
428
|
return;
|
483
429
|
}
|
484
430
|
}
|
485
431
|
else {
|
486
432
|
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
487
|
-
node->
|
433
|
+
node->view_src = parent;
|
434
|
+
p_hn->n_views += 1;
|
435
|
+
init_view(alloc, node);
|
488
436
|
return;
|
489
437
|
}
|
490
438
|
}
|
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
|
495
443
|
}
|
496
444
|
}
|
497
445
|
|
498
|
-
static size_t ggml_allocr_alloc_graph_tensors_n(
446
|
+
size_t ggml_allocr_alloc_graph_n(
|
499
447
|
struct ggml_allocr * alloc,
|
500
448
|
struct ggml_cgraph ** graphs, int n_graphs,
|
501
449
|
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
|
513
461
|
if (ggml_is_view(node)) {
|
514
462
|
struct ggml_tensor * view_src = node->view_src;
|
515
463
|
hash_get(ht, view_src)->n_views += 1;
|
464
|
+
if (node->buffer == NULL && node->data != NULL) {
|
465
|
+
// view of a pre-allocated tensor, didn't call init_view() yet
|
466
|
+
init_view(alloc, node);
|
467
|
+
}
|
516
468
|
}
|
517
469
|
|
518
470
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
|
521
473
|
break;
|
522
474
|
}
|
523
475
|
hash_get(ht, parent)->n_children += 1;
|
476
|
+
if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
|
477
|
+
init_view(alloc, parent);
|
478
|
+
}
|
524
479
|
}
|
525
480
|
}
|
526
481
|
}
|
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
|
631
586
|
}
|
632
587
|
|
633
588
|
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
634
|
-
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
589
|
+
return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
|
635
590
|
}
|
636
591
|
|
637
592
|
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
|
data/ext/llama_cpp/src/ggml-alloc.h
CHANGED
@@ -6,21 +6,27 @@
|
|
6
6
|
extern "C" {
|
7
7
|
#endif
|
8
8
|
|
9
|
+
struct ggml_backend_buffer;
|
9
10
|
|
10
11
|
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
|
11
12
|
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
13
|
+
GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
12
14
|
|
13
15
|
// tell the allocator to parse nodes following the order described in the list
|
14
16
|
// you should call this if your graph are optimized to execute out-of-order
|
15
17
|
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
|
16
18
|
|
17
|
-
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
18
|
-
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
19
|
-
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
20
|
-
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
19
|
+
GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
|
20
|
+
GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
|
21
|
+
GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
|
22
|
+
GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
21
23
|
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
22
|
-
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
|
24
|
+
GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
|
23
25
|
|
26
|
+
GGML_API size_t ggml_allocr_alloc_graph_n(
|
27
|
+
struct ggml_allocr * alloc,
|
28
|
+
struct ggml_cgraph ** graphs, int n_graphs,
|
29
|
+
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
|
24
30
|
|
25
31
|
#ifdef __cplusplus
|
26
32
|
}
|