llama_cpp 0.7.0 → 0.8.0
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-alloc.c +62 -107
- data/ext/llama_cpp/src/ggml-alloc.h +11 -5
- data/ext/llama_cpp/src/ggml-backend.c +385 -0
- data/ext/llama_cpp/src/ggml-backend.h +143 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +500 -78
- data/ext/llama_cpp/src/ggml-cuda.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.h +18 -1
- data/ext/llama_cpp/src/ggml-metal.m +396 -127
- data/ext/llama_cpp/src/ggml-metal.metal +290 -46
- data/ext/llama_cpp/src/ggml-opencl.cpp +47 -71
- data/ext/llama_cpp/src/ggml.c +71 -55
- data/ext/llama_cpp/src/ggml.h +15 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/k_quants.h +5 -5
- data/ext/llama_cpp/src/llama.cpp +1851 -250
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-metadata.gz:
-data.tar.gz:
+metadata.gz: 8045208b5f7801979212a4f6ed395217e78f06bcfbc2d0362aaaa04c529745cd
+data.tar.gz: 4011dfe279d8d4041c6c79dc5a6bad199777f83b5f0559f11ccd2f68c957e462
 SHA512:
-metadata.gz:
-data.tar.gz:
+metadata.gz: d15e74da491773961006eca8ca6c6d80b30ffc995c56a9140961be0002eb09134f1a029c4e8ee192497fb7256fe36cf1c3ed928967ce57ece4c7a0904392c8fe
+data.tar.gz: a863596304ddb9ac5e4be2b2b65bebc7d3913705b8a0f516debfee0ca213f9dca69707edda8d70cfafb15500fcb6e70cffb6d5d1119302d24e05059c50f0da77
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
+## [[0.8.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.1...v0.8.0)] - 2023-10-21
+
+**Breaking Changes**
+- Bump bundled llama.cpp from b1380 to b1405
+- Add column index argument to `set_seq_id` and `get_seq_id` methods in Batch.
+- Add `special` keyword argument to `tokenize` method in Model.
+- Add `n_seq_max` keyword argument to `initialize` method in Batch.
+
+## [[0.7.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.0...v0.7.1)] - 2023-10-14
+
+- Bump bundled llama.cpp from b1334 to b1380.
+
 ## [[0.7.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.6.0...v0.7.0)] - 2023-10-07
 
 - Bump bundled llama.cpp from b1292 to b1334.
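To put the breaking changes in concrete terms, here is a minimal Ruby sketch of the 0.8.0 call shapes. This is a sketch only, assuming the LLaMACpp::Batch and LLaMACpp::Model constants defined by this extension and an already-loaded `model`:

    # n_seq_max is a new, required Integer keyword on Batch.new.
    batch = LLaMACpp::Batch.new(n_tokens: 512, embd: 0, n_seq_max: 1)
    # set_seq_id/get_seq_id now take a token index and a sequence-id column index.
    batch.set_seq_id(0, 0, 1)
    batch.get_seq_id(0, 0)   # => 1
    # tokenize gains an optional special keyword; it defaults to false.
    tokens = model.tokenize(text: 'Hello, world.', add_bos: true, special: false)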
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -5,7 +5,7 @@ require 'fileutils'
 
 abort 'libstdc++ is not found.' unless have_library('stdc++')
 
-$srcs = %w[ggml.c ggml-alloc.c llama.cpp llama_cpp.cpp]
+$srcs = %w[ggml.c ggml-backend.c ggml-alloc.c llama.cpp llama_cpp.cpp]
 $srcs << 'ggml-opencl.cpp' if with_config('clblast')
 $srcs << 'ggml-mpi.c' if with_config('mpi')
 $CFLAGS << ' -w -DNDEBUG'
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -63,8 +63,8 @@ public:
 rb_define_method(rb_cLLaMABatch, "get_token", RUBY_METHOD_FUNC(_llama_batch_get_token), 1);
 rb_define_method(rb_cLLaMABatch, "set_pos", RUBY_METHOD_FUNC(_llama_batch_set_pos), 2);
 rb_define_method(rb_cLLaMABatch, "get_pos", RUBY_METHOD_FUNC(_llama_batch_get_pos), 1);
-rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id),
-rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id),
+rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 3);
+rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 2);
 rb_define_method(rb_cLLaMABatch, "set_logits", RUBY_METHOD_FUNC(_llama_batch_set_logits), 2);
 rb_define_method(rb_cLLaMABatch, "get_logits", RUBY_METHOD_FUNC(_llama_batch_get_logits), 1);
 }
@@ -74,10 +74,10 @@ private:
 
 static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
 VALUE kw_args = Qnil;
-ID kw_table[
-VALUE kw_values[
+ID kw_table[3] = { rb_intern("n_tokens"), rb_intern("embd"), rb_intern("n_seq_max") };
+VALUE kw_values[3] = { Qundef, Qundef, Qundef };
 rb_scan_args(argc, argv, ":", &kw_args);
-rb_get_kwargs(kw_args, kw_table,
+rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
 
 if (!RB_INTEGER_TYPE_P(kw_values[0])) {
 rb_raise(rb_eArgError, "n_tokens must be an integer");
@@ -87,12 +87,17 @@ private:
 rb_raise(rb_eArgError, "embd must be an integer");
 return Qnil;
 }
+if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+rb_raise(rb_eArgError, "n_seq_max must be an integer");
+return Qnil;
+}
 
 const int32_t n_tokens = NUM2INT(kw_values[0]);
 const int32_t embd = NUM2INT(kw_values[1]);
+const int32_t n_seq_max = NUM2INT(kw_values[2]);
 
 LLaMABatchWrapper* ptr = get_llama_batch(self);
-ptr->batch = llama_batch_init(n_tokens, embd);
+ptr->batch = llama_batch_init(n_tokens, embd, n_seq_max);
 
 return Qnil;
 }
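In Ruby terms, rb_get_kwargs is now called with three required keywords and none optional, so Batch.new must receive n_tokens, embd, and n_seq_max, each an Integer; 0.7.x-style calls raise ArgumentError. A small sketch, assuming the LLaMACpp::Batch constant name:

    LLaMACpp::Batch.new(n_tokens: 32, embd: 0)                 # raises ArgumentError (n_seq_max is missing)
    LLaMACpp::Batch.new(n_tokens: 32, embd: 0, n_seq_max: 1)   # ok; forwarded to llama_batch_init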
@@ -190,25 +195,35 @@
 }
 
 // seq_id
-static VALUE _llama_batch_set_seq_id(VALUE self, VALUE
+static VALUE _llama_batch_set_seq_id(VALUE self, VALUE i_, VALUE j_, VALUE value) {
 LLaMABatchWrapper* ptr = get_llama_batch(self);
-const int32_t
-if (
-rb_raise(rb_eArgError, "
+const int32_t i = NUM2INT(i_);
+if (i < 0 || i >= ptr->batch.n_tokens) {
+rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
+return Qnil;
+}
+const int32_t j = NUM2INT(j_);
+if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
+rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
 return Qnil;
 }
-ptr->batch.seq_id[
-return INT2NUM(ptr->batch.seq_id[
+ptr->batch.seq_id[i][j] = NUM2INT(value);
+return INT2NUM(ptr->batch.seq_id[i][j]);
 }
 
-static VALUE _llama_batch_get_seq_id(VALUE self, VALUE
+static VALUE _llama_batch_get_seq_id(VALUE self, VALUE i_, VALUE j_) {
 LLaMABatchWrapper* ptr = get_llama_batch(self);
-const int32_t
-if (
-rb_raise(rb_eArgError, "
+const int32_t i = NUM2INT(i_);
+if (i < 0 || i >= ptr->batch.n_tokens) {
+rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
+return Qnil;
+}
+const int32_t j = NUM2INT(j_);
+if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
+rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
 return Qnil;
 }
-return INT2NUM(ptr->batch.seq_id[
+return INT2NUM(ptr->batch.seq_id[i][j]);
 }
 
 // logits
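The binding now treats seq_id as a two-dimensional table and validates both indices: the token index must fall in [0, n_tokens) and the column index in [0, n_seq_id[i]), otherwise ArgumentError is raised. A short Ruby sketch, assuming `batch` was created as above:

    batch.set_seq_id(3, 0, 7)   # token position 3, sequence-id slot 0
    batch.get_seq_id(3, 0)      # => 7
    batch.get_seq_id(3, 99)     # raises ArgumentError if 99 >= n_seq_id[3]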
@@ -1319,10 +1334,10 @@ private:
 
 static VALUE _llama_model_tokenize(int argc, VALUE* argv, VALUE self) {
 VALUE kw_args = Qnil;
-ID kw_table[
-VALUE kw_values[
+ID kw_table[4] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos"), rb_intern("special") };
+VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
 rb_scan_args(argc, argv, ":", &kw_args);
-rb_get_kwargs(kw_args, kw_table, 1,
+rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);
 
 if (!RB_TYPE_P(kw_values[0], T_STRING)) {
 rb_raise(rb_eArgError, "text must be a String");
@@ -1336,15 +1351,20 @@ private:
 rb_raise(rb_eArgError, "add_bos must be a boolean");
 return Qnil;
 }
+if (kw_values[3] != Qundef && (kw_values[3] != Qtrue && kw_values[3] != Qfalse)) {
+rb_raise(rb_eArgError, "special must be a boolean");
+return Qnil;
+}
 
 VALUE text_ = kw_values[0];
 std::string text = StringValueCStr(text_);
 const bool add_bos = kw_values[2] == Qtrue ? true : false;
+const bool special = kw_values[3] == Qtrue ? true : false;
 const int n_max_tokens = kw_values[1] != Qundef ? NUM2INT(kw_values[1]) : text.size() + (add_bos ? 1 : 0);
 
 llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
 LLaMAModelWrapper* ptr = get_llama_model(self);
-const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
+const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos, special);
 
 if (n_tokens < 0) {
 rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
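At the Ruby level the new `special` keyword is optional and only an explicit true enables it; non-boolean values are rejected. In upstream llama.cpp this flag lets special/control tokens appearing in the text be tokenized rather than treated as plain text. A hedged sketch, assuming `model` is a LLaMACpp::Model:

    model.tokenize(text: 'Hello')                          # special defaults to false
    model.tokenize(text: '<s>Hello</s>', special: true)    # allow special tokens to be parsed
    model.tokenize(text: 'Hi', special: 'yes')             # raises ArgumentError ("special must be a boolean")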
data/ext/llama_cpp/src/ggml-alloc.c
CHANGED
@@ -1,4 +1,5 @@
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
@@ -6,25 +7,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef __has_include
-#if __has_include(<unistd.h>)
-#include <unistd.h>
-#if defined(_POSIX_MAPPED_FILES)
-#include <sys/types.h>
-#include <sys/mman.h>
-#endif
-#endif
-#endif
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <memoryapi.h>
-#endif
-
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
+struct ggml_backend_buffer * buffer;
+bool buffer_owned;
 void * data;
-size_t size;
 size_t alignment;
 int n_free_blocks;
 struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-return ggml_nbytes(tensor);
-
-UNUSED(alloc);
-}
-
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-
-return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+return tensor->buffer == alloc->buffer;
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-#ifdef GGML_ALLOCATOR_DEBUG
 GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
 GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
-size_t size =
+
+size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
 size = aligned_offset(NULL, size, alloc->alignment);
 
 AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 tensor->data = addr;
 AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+tensor->buffer = alloc->buffer;
+ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
 add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-void * ptr = tensor->data;
-
 if (ggml_allocr_is_own(alloc, tensor) == false) {
 // the tensor was not allocated in this buffer
 // this can happen because the graph allocator will try to free weights and other tensors from different buffers
 // the easiest way to deal with this is just to ignore it
+AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
 return;
 }
 
-
+void * ptr = tensor->data;
+
+size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
 size = aligned_offset(NULL, size, alloc->alignment);
 AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-
+
+ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
 remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
 alloc->n_free_blocks = 1;
 size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
 alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-alloc->free_blocks[0].size = alloc->
+alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-struct
+struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
 *alloc = (struct ggml_allocr){
-/*.
-/*.
+/*.buffer = */ buffer,
+/*.buffer_owned = */ true,
+/*.base = */ ggml_backend_buffer_get_base(buffer),
 /*.alignment = */ alignment,
 /*.n_free_blocks = */ 0,
 /*.free_blocks = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
 return alloc;
 }
 
-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
-return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
-void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
-if (ptr == MAP_FAILED) {
-return NULL;
-}
-return ptr;
-#else
-// use a fixed address for other platforms
-uintptr_t base_addr = (uintptr_t)-size - 0x100;
-return (void *)base_addr;
-#endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
-VirtualFree(base_addr, 0, MEM_RELEASE);
-UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
-munmap(base_addr, size);
-#else
-// nothing to do
-UNUSED(base_addr);
-UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-// 128GB for 64-bit, 1GB for 32-bit
-*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
-do {
-*base_addr = alloc_vmem(*size);
-if (*base_addr != NULL) {
-AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
-return;
-}
-// try again with half the size
-*size /= 2;
-} while (*size > 0);
-
-GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
-free_vmem(base_addr, size);
-}
-
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-struct ggml_allocr * alloc = (
+struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+alloc->measure = true;
 
-
-
+return alloc;
+}
 
-
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
 *alloc = (struct ggml_allocr){
-/*.
-/*.
-/*.
+/*.buffer = */ buffer,
+/*.buffer_owned = */ false,
+/*.base = */ ggml_backend_buffer_get_base(buffer),
+/*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
 /*.n_free_blocks = */ 0,
 /*.free_blocks = */ {{0}},
 /*.hash_table = */ {{0}},
 /*.max_size = */ 0,
-/*.measure = */
+/*.measure = */ false,
 /*.parse_seq = */ {0},
 /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-if (alloc->
-
+if (alloc->buffer_owned) {
+ggml_backend_buffer_free(alloc->buffer);
 }
 free(alloc);
 }
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
 case GGML_OP_ROPE:
 case GGML_OP_RMS_NORM:
 case GGML_OP_SOFT_MAX:
-case GGML_OP_CONT:
 return true;
 
 default:
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
 }
 }
 
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+assert(view->view_src != NULL && view->view_src->data != NULL);
+view->backend = view->view_src->backend;
+view->buffer = view->view_src->buffer;
+view->data = (char *)view->view_src->data + view->view_offs;
+
+// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+ggml_backend_buffer_init_tensor(alloc->buffer, view);
+}
+
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
 struct hash_node * ht = alloc->hash_table;
 if (node->data == NULL) {
 if (ggml_is_view(node)) {
-
-node->data = (char *)node->view_src->data + node->view_offs;
+init_view(alloc, node);
 } else {
 // see if we can reuse a parent's buffer (inplace)
 if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
 // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
 // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
 AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-node->
+node->view_src = view_src;
+view_src_hn->n_views += 1;
+init_view(alloc, node);
 return;
 }
 }
 else {
 AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-node->
+node->view_src = parent;
+p_hn->n_views += 1;
+init_view(alloc, node);
 return;
 }
 }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
 }
 }
 
-
+size_t ggml_allocr_alloc_graph_n(
 struct ggml_allocr * alloc,
 struct ggml_cgraph ** graphs, int n_graphs,
 struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 if (ggml_is_view(node)) {
 struct ggml_tensor * view_src = node->view_src;
 hash_get(ht, view_src)->n_views += 1;
+if (node->buffer == NULL && node->data != NULL) {
+// view of a pre-allocated tensor, didn't call init_view() yet
+init_view(alloc, node);
+}
 }
 
 for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 break;
 }
 hash_get(ht, parent)->n_children += 1;
+if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+init_view(alloc, parent);
+}
 }
 }
 }
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-return
+return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
data/ext/llama_cpp/src/ggml-alloc.h
CHANGED
@@ -6,21 +6,27 @@
 extern "C" {
 #endif
 
+struct ggml_backend_buffer;
 
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
 GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
-GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
-GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
-GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
-GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
+GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
+GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
+GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
-GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
+GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
 
+GGML_API size_t ggml_allocr_alloc_graph_n(
+struct ggml_allocr * alloc,
+struct ggml_cgraph ** graphs, int n_graphs,
+struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
 
 #ifdef __cplusplus
 }