llama_cpp 0.7.0 → 0.8.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 144a7130adb5ac32d31699bce809a6de6c3a6ecf8cfccca36ebdee436c28b645
- data.tar.gz: d00b2c2db583e6e38d472033c7348f22e9614febdb633c4e454ca49e00d2fec6
+ metadata.gz: 8045208b5f7801979212a4f6ed395217e78f06bcfbc2d0362aaaa04c529745cd
+ data.tar.gz: 4011dfe279d8d4041c6c79dc5a6bad199777f83b5f0559f11ccd2f68c957e462
  SHA512:
- metadata.gz: 2c30854fef304e0258250d9285bac8ab3ea014950d1638e88682029763a3e90eae36da1b3757b2441ff5a7a798401ee1e731bcfc014e7e651811726d7afea224
- data.tar.gz: 10ea5bb5bf5d85a7e7030b514e2eb38650e9ce8a97ab339f63538b637d3c85293b406fea66c055a00f919c457a9a2af5c8f5710d0d31d702fe7e6f703b52933d
+ metadata.gz: d15e74da491773961006eca8ca6c6d80b30ffc995c56a9140961be0002eb09134f1a029c4e8ee192497fb7256fe36cf1c3ed928967ce57ece4c7a0904392c8fe
+ data.tar.gz: a863596304ddb9ac5e4be2b2b65bebc7d3913705b8a0f516debfee0ca213f9dca69707edda8d70cfafb15500fcb6e70cffb6d5d1119302d24e05059c50f0da77
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
+ ## [[0.8.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.1...v0.8.0)] - 2023-10-21
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from b1380 to b1405
+ - Add column index argument to `set_seq_id` and `get_seq_id` methods in Batch.
+ - Add `special` keyword argument to `tokenize` method in Model.
+ - Add `n_seq_max` keyword argument to `initialize` method in Batch.
+
+ ## [[0.7.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.0...v0.7.1)] - 2023-10-14
+
+ - Bump bundled llama.cpp from b1334 to b1380.
+
  ## [[0.7.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.6.0...v0.7.0)] - 2023-10-07

  - Bump bundled llama.cpp from b1292 to b1334.
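The three breaking changes above all touch the gem's public API. The following is a minimal, illustrative migration sketch rather than a verified snippet: it assumes the `LLaMACpp` namespace used by the gem, an already-loaded `LLaMACpp::Model` in `model`, placeholder sizes, and that the batch has been populated so token slot 0 has at least one valid sequence slot (see the bounds checks later in this diff).

```ruby
require 'llama_cpp'

# 0.8.0: Batch.new takes the new n_seq_max keyword in addition to n_tokens and embd.
batch = LLaMACpp::Batch.new(n_tokens: 512, embd: 0, n_seq_max: 1)

# 0.8.0: set_seq_id/get_seq_id now take a token index and a sequence-slot index.
batch.set_seq_id(0, 0, 7)   # was batch.set_seq_id(0, 7) in 0.7.x
batch.get_seq_id(0, 0)      # was batch.get_seq_id(0) in 0.7.x

# 0.8.0: tokenize accepts an optional special keyword (false when omitted).
tokens = model.tokenize(text: 'Hello, world.', n_max_tokens: 32, add_bos: true, special: false)
```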
data/ext/llama_cpp/extconf.rb CHANGED
@@ -5,7 +5,7 @@ require 'fileutils'

  abort 'libstdc++ is not found.' unless have_library('stdc++')

- $srcs = %w[ggml.c ggml-alloc.c llama.cpp llama_cpp.cpp]
+ $srcs = %w[ggml.c ggml-backend.c ggml-alloc.c llama.cpp llama_cpp.cpp]
  $srcs << 'ggml-opencl.cpp' if with_config('clblast')
  $srcs << 'ggml-mpi.c' if with_config('mpi')
  $CFLAGS << ' -w -DNDEBUG'
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -63,8 +63,8 @@ public:
  rb_define_method(rb_cLLaMABatch, "get_token", RUBY_METHOD_FUNC(_llama_batch_get_token), 1);
  rb_define_method(rb_cLLaMABatch, "set_pos", RUBY_METHOD_FUNC(_llama_batch_set_pos), 2);
  rb_define_method(rb_cLLaMABatch, "get_pos", RUBY_METHOD_FUNC(_llama_batch_get_pos), 1);
- rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 2);
- rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 1);
+ rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 3);
+ rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 2);
  rb_define_method(rb_cLLaMABatch, "set_logits", RUBY_METHOD_FUNC(_llama_batch_set_logits), 2);
  rb_define_method(rb_cLLaMABatch, "get_logits", RUBY_METHOD_FUNC(_llama_batch_get_logits), 1);
  }
@@ -74,10 +74,10 @@ private:

  static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
- ID kw_table[2] = { rb_intern("n_tokens"), rb_intern("embd") };
- VALUE kw_values[2] = { Qundef, Qundef };
+ ID kw_table[3] = { rb_intern("n_tokens"), rb_intern("embd"), rb_intern("n_seq_max") };
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+ rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

  if (!RB_INTEGER_TYPE_P(kw_values[0])) {
  rb_raise(rb_eArgError, "n_tokens must be an integer");
@@ -87,12 +87,17 @@ private:
  rb_raise(rb_eArgError, "embd must be an integer");
  return Qnil;
  }
+ if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+ rb_raise(rb_eArgError, "n_seq_max must be an integer");
+ return Qnil;
+ }

  const int32_t n_tokens = NUM2INT(kw_values[0]);
  const int32_t embd = NUM2INT(kw_values[1]);
+ const int32_t n_seq_max = NUM2INT(kw_values[2]);

  LLaMABatchWrapper* ptr = get_llama_batch(self);
- ptr->batch = llama_batch_init(n_tokens, embd);
+ ptr->batch = llama_batch_init(n_tokens, embd, n_seq_max);

  return Qnil;
  }
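Because `rb_get_kwargs` is now called with three required keywords, `n_seq_max` can no longer be omitted when constructing a batch. A short sketch of the effect on existing 0.7.x call sites, assuming the batch class is `LLaMACpp::Batch` as elsewhere in the gem:

```ruby
# 0.7.x style call: n_seq_max is missing, so rb_get_kwargs raises ArgumentError in 0.8.0.
# LLaMACpp::Batch.new(n_tokens: 32, embd: 0)

# 0.8.0 style call: all three keywords supplied; they are forwarded to llama_batch_init.
batch = LLaMACpp::Batch.new(n_tokens: 32, embd: 0, n_seq_max: 2)
```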
@@ -190,25 +195,35 @@ private:
  }

  // seq_id
- static VALUE _llama_batch_set_seq_id(VALUE self, VALUE idx, VALUE value) {
+ static VALUE _llama_batch_set_seq_id(VALUE self, VALUE i_, VALUE j_, VALUE value) {
  LLaMABatchWrapper* ptr = get_llama_batch(self);
- const int32_t id = NUM2INT(idx);
- if (id < 0 || id >= ptr->batch.n_tokens) {
- rb_raise(rb_eArgError, "id must be in [0, n_tokens)");
+ const int32_t i = NUM2INT(i_);
+ if (i < 0 || i >= ptr->batch.n_tokens) {
+ rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
+ return Qnil;
+ }
+ const int32_t j = NUM2INT(j_);
+ if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
+ rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
  return Qnil;
  }
- ptr->batch.seq_id[id] = NUM2INT(value);
- return INT2NUM(ptr->batch.seq_id[id]);
+ ptr->batch.seq_id[i][j] = NUM2INT(value);
+ return INT2NUM(ptr->batch.seq_id[i][j]);
  }

- static VALUE _llama_batch_get_seq_id(VALUE self, VALUE idx) {
+ static VALUE _llama_batch_get_seq_id(VALUE self, VALUE i_, VALUE j_) {
  LLaMABatchWrapper* ptr = get_llama_batch(self);
- const int32_t id = NUM2INT(idx);
- if (id < 0 || id >= ptr->batch.n_tokens) {
- rb_raise(rb_eArgError, "id must be in [0, n_tokens)");
+ const int32_t i = NUM2INT(i_);
+ if (i < 0 || i >= ptr->batch.n_tokens) {
+ rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
+ return Qnil;
+ }
+ const int32_t j = NUM2INT(j_);
+ if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
+ rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
  return Qnil;
  }
- return INT2NUM(ptr->batch.seq_id[id]);
+ return INT2NUM(ptr->batch.seq_id[i][j]);
  }

  // logits
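As the bounds checks above show, the first index addresses a token slot in `[0, n_tokens)` and the second a sequence slot in `[0, n_seq_id[i])`; anything outside those ranges raises `ArgumentError` with the messages seen in the C code. A hedged sketch, assuming `batch` is a populated `LLaMACpp::Batch` whose token slot 0 already has at least one sequence slot:

```ruby
# i = token slot, j = sequence slot for that token, value = sequence id.
batch.set_seq_id(0, 0, 3)
batch.get_seq_id(0, 0)          # => 3

begin
  batch.get_seq_id(0, 99_999)   # j outside [0, n_seq_id[0])
rescue ArgumentError => e
  warn e.message                # "j must be in [0, n_seq_id[i])"
end
```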
@@ -1319,10 +1334,10 @@ private:

  static VALUE _llama_model_tokenize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+ ID kw_table[4] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos"), rb_intern("special") };
+ VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+ rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
  rb_raise(rb_eArgError, "text must be a String");
@@ -1336,15 +1351,20 @@ private:
  rb_raise(rb_eArgError, "add_bos must be a boolean");
  return Qnil;
  }
+ if (kw_values[3] != Qundef && (kw_values[3] != Qtrue && kw_values[3] != Qfalse)) {
+ rb_raise(rb_eArgError, "special must be a boolean");
+ return Qnil;
+ }

  VALUE text_ = kw_values[0];
  std::string text = StringValueCStr(text_);
  const bool add_bos = kw_values[2] == Qtrue ? true : false;
+ const bool special = kw_values[3] == Qtrue ? true : false;
  const int n_max_tokens = kw_values[1] != Qundef ? NUM2INT(kw_values[1]) : text.size() + (add_bos ? 1 : 0);

  llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
  LLaMAModelWrapper* ptr = get_llama_model(self);
- const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
+ const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos, special);

  if (n_tokens < 0) {
  rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
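The new keyword is passed straight through to `llama_tokenize` and defaults to false when omitted; when true, special/control tokens appearing in the input (for example a model's BOS/EOS markers) are parsed into their token ids rather than split up as ordinary text. A minimal sketch, assuming `model` is an already-loaded `LLaMACpp::Model` whose vocabulary defines `<s>` and `</s>`:

```ruby
text = '<s>Hello</s>'

# Default behaviour (special omitted or false): the markers are tokenized as plain text.
plain = model.tokenize(text: text, n_max_tokens: 32, add_bos: false)

# special: true lets the tokenizer emit the corresponding special token ids instead.
with_special = model.tokenize(text: text, n_max_tokens: 32, add_bos: false, special: true)
```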
data/ext/llama_cpp/src/ggml-alloc.c CHANGED
@@ -1,4 +1,5 @@
  #include "ggml-alloc.h"
+ #include "ggml-backend.h"
  #include "ggml.h"
  #include <assert.h>
  #include <stdarg.h>
@@ -6,25 +7,6 @@
  #include <stdlib.h>
  #include <string.h>

- #ifdef __has_include
- #if __has_include(<unistd.h>)
- #include <unistd.h>
- #if defined(_POSIX_MAPPED_FILES)
- #include <sys/types.h>
- #include <sys/mman.h>
- #endif
- #endif
- #endif
-
- #if defined(_WIN32)
- #define WIN32_LEAN_AND_MEAN
- #ifndef NOMINMAX
- #define NOMINMAX
- #endif
- #include <windows.h>
- #include <memoryapi.h>
- #endif
-

  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
  #define MAX_FREE_BLOCKS 256

  struct ggml_allocr {
+ struct ggml_backend_buffer * buffer;
+ bool buffer_owned;
  void * data;
- size_t size;
  size_t alignment;
  int n_free_blocks;
  struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
  }
  #endif

- static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- return ggml_nbytes(tensor);
-
- UNUSED(alloc);
- }
-
  // check if a tensor is allocated by this buffer
  static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
- void * ptr = tensor->data;
- return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+ return tensor->buffer == alloc->buffer;
  }

  static bool ggml_is_view(struct ggml_tensor * t) {
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
  }

  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- #ifdef GGML_ALLOCATOR_DEBUG
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
- #endif
- size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+
+ size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);

  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)

  tensor->data = addr;
  AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+ tensor->buffer = alloc->buffer;
+ ggml_backend_buffer_init_tensor(alloc->buffer, tensor);

  #ifdef GGML_ALLOCATOR_DEBUG
  add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)

  // this is a very naive implementation, but for our case the number of free blocks should be very small
  static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- void * ptr = tensor->data;
-
  if (ggml_allocr_is_own(alloc, tensor) == false) {
  // the tensor was not allocated in this buffer
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
  // the easiest way to deal with this is just to ignore it
+ AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
  return;
  }

- size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+ void * ptr = tensor->data;
+
+ size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
- AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+
+ ggml_backend_buffer_free_tensor(alloc->buffer, tensor);

  #ifdef GGML_ALLOCATOR_DEBUG
  remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
  alloc->n_free_blocks = 1;
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
  alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
- alloc->free_blocks[0].size = alloc->size - align_offset;
+ alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
  }

  struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+ struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));

  *alloc = (struct ggml_allocr){
- /*.data = */ data,
- /*.size = */ size,
+ /*.buffer = */ buffer,
+ /*.buffer_owned = */ true,
+ /*.base = */ ggml_backend_buffer_get_base(buffer),
  /*.alignment = */ alignment,
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  return alloc;
  }

- // OS specific functions to allocate and free uncommitted virtual memory
- static void * alloc_vmem(size_t size) {
- #if defined(_WIN32)
- return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
- #elif defined(_POSIX_MAPPED_FILES)
- void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
- if (ptr == MAP_FAILED) {
- return NULL;
- }
- return ptr;
- #else
- // use a fixed address for other platforms
- uintptr_t base_addr = (uintptr_t)-size - 0x100;
- return (void *)base_addr;
- #endif
- }
-
- static void free_vmem(void * base_addr, size_t size) {
- #if defined(_WIN32)
- VirtualFree(base_addr, 0, MEM_RELEASE);
- UNUSED(size);
- #elif defined(_POSIX_MAPPED_FILES)
- munmap(base_addr, size);
- #else
- // nothing to do
- UNUSED(base_addr);
- UNUSED(size);
- #endif
- }
-
- // allocate uncommitted virtual memory to measure the size of the graph
- static void alloc_measure_vmem(void ** base_addr, size_t * size) {
- // 128GB for 64-bit, 1GB for 32-bit
- *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
- do {
- *base_addr = alloc_vmem(*size);
- if (*base_addr != NULL) {
- AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
- return;
- }
- // try again with half the size
- *size /= 2;
- } while (*size > 0);
-
- GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
- }
-
- static void free_measure_vmem(void * base_addr, size_t size) {
- free_vmem(base_addr, size);
- }
-
  struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+ struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+ alloc->measure = true;

- void * base_addr;
- size_t size;
+ return alloc;
+ }

- alloc_measure_vmem(&base_addr, &size);
+ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));

  *alloc = (struct ggml_allocr){
- /*.data = */ base_addr,
- /*.size = */ size,
- /*.alignment = */ alignment,
+ /*.buffer = */ buffer,
+ /*.buffer_owned = */ false,
+ /*.base = */ ggml_backend_buffer_get_base(buffer),
+ /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
  /*.hash_table = */ {{0}},
  /*.max_size = */ 0,
- /*.measure = */ true,
+ /*.measure = */ false,
  /*.parse_seq = */ {0},
  /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  }

  void ggml_allocr_free(struct ggml_allocr * alloc) {
- if (alloc->measure) {
- free_measure_vmem(alloc->data, alloc->size);
+ if (alloc->buffer_owned) {
+ ggml_backend_buffer_free(alloc->buffer);
  }
  free(alloc);
  }
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  case GGML_OP_ROPE:
  case GGML_OP_RMS_NORM:
  case GGML_OP_SOFT_MAX:
- case GGML_OP_CONT:
  return true;

  default:
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  }
  }

+ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+ assert(view->view_src != NULL && view->view_src->data != NULL);
+ view->backend = view->view_src->backend;
+ view->buffer = view->view_src->buffer;
+ view->data = (char *)view->view_src->data + view->view_offs;
+
+ // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+ // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+ assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+ ggml_backend_buffer_init_tensor(alloc->buffer, view);
+ }
+
  static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
  struct hash_node * ht = alloc->hash_table;
  if (node->data == NULL) {
  if (ggml_is_view(node)) {
- assert(node->view_src->data != NULL);
- node->data = (char *)node->view_src->data + node->view_offs;
+ init_view(alloc, node);
  } else {
  // see if we can reuse a parent's buffer (inplace)
  if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
  // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
- node->data = parent->data;
+ node->view_src = view_src;
+ view_src_hn->n_views += 1;
+ init_view(alloc, node);
  return;
  }
  }
  else {
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
- node->data = parent->data;
+ node->view_src = parent;
+ p_hn->n_views += 1;
+ init_view(alloc, node);
  return;
  }
  }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }
  }

- static size_t ggml_allocr_alloc_graph_tensors_n(
+ size_t ggml_allocr_alloc_graph_n(
  struct ggml_allocr * alloc,
  struct ggml_cgraph ** graphs, int n_graphs,
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
  if (ggml_is_view(node)) {
  struct ggml_tensor * view_src = node->view_src;
  hash_get(ht, view_src)->n_views += 1;
+ if (node->buffer == NULL && node->data != NULL) {
+ // view of a pre-allocated tensor, didn't call init_view() yet
+ init_view(alloc, node);
+ }
  }

  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
  break;
  }
  hash_get(ht, parent)->n_children += 1;
+ if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+ init_view(alloc, parent);
+ }
  }
  }
  }
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
  }

  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
- return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
  }

  size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
data/ext/llama_cpp/src/ggml-alloc.h CHANGED
@@ -6,21 +6,27 @@
  extern "C" {
  #endif

+ struct ggml_backend_buffer;

  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+ GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);

  // tell the allocator to parse nodes following the order described in the list
  // you should call this if your graph are optimized to execute out-of-order
  GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);

- GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
- GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
- GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
- GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+ GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
+ GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
+ GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
+ GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
  GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
- GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
+ GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);

+ GGML_API size_t ggml_allocr_alloc_graph_n(
+ struct ggml_allocr * alloc,
+ struct ggml_cgraph ** graphs, int n_graphs,
+ struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);

  #ifdef __cplusplus
  }