llama_cpp 0.7.0 → 0.8.0

This diff shows the content of publicly available package versions as released to their respective public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 144a7130adb5ac32d31699bce809a6de6c3a6ecf8cfccca36ebdee436c28b645
- data.tar.gz: d00b2c2db583e6e38d472033c7348f22e9614febdb633c4e454ca49e00d2fec6
+ metadata.gz: 8045208b5f7801979212a4f6ed395217e78f06bcfbc2d0362aaaa04c529745cd
+ data.tar.gz: 4011dfe279d8d4041c6c79dc5a6bad199777f83b5f0559f11ccd2f68c957e462
  SHA512:
- metadata.gz: 2c30854fef304e0258250d9285bac8ab3ea014950d1638e88682029763a3e90eae36da1b3757b2441ff5a7a798401ee1e731bcfc014e7e651811726d7afea224
- data.tar.gz: 10ea5bb5bf5d85a7e7030b514e2eb38650e9ce8a97ab339f63538b637d3c85293b406fea66c055a00f919c457a9a2af5c8f5710d0d31d702fe7e6f703b52933d
+ metadata.gz: d15e74da491773961006eca8ca6c6d80b30ffc995c56a9140961be0002eb09134f1a029c4e8ee192497fb7256fe36cf1c3ed928967ce57ece4c7a0904392c8fe
+ data.tar.gz: a863596304ddb9ac5e4be2b2b65bebc7d3913705b8a0f516debfee0ca213f9dca69707edda8d70cfafb15500fcb6e70cffb6d5d1119302d24e05059c50f0da77
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
+ ## [[0.8.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.1...v0.8.0)] - 2023-10-21
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from b1380 to b1405
+ - Add column index argument to `set_seq_id` and `get_seq_id` methods in Batch.
+ - Add `special` keyword argument to `tokenize` method in Model.
+ - Add `n_seq_max` keyword argument to `initialize` method in Batch.
+
+ ## [[0.7.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.0...v0.7.1)] - 2023-10-14
+
+ - Bump bundled llama.cpp from b1334 to b1380.
+
  ## [[0.7.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.6.0...v0.7.0)] - 2023-10-07

  - Bump bundled llama.cpp from b1292 to b1334.
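
The 0.8.0 entries above are the user-facing side of the binding changes in the hunks below. A minimal Ruby sketch of the new call shapes (the `LLaMACpp::Batch` constant and the literal values are illustrative assumptions, not taken from this diff):

```ruby
require 'llama_cpp'

# 0.8.0: Batch#initialize now takes a required n_seq_max keyword
# alongside n_tokens and embd.
batch = LLaMACpp::Batch.new(n_tokens: 32, embd: 0, n_seq_max: 2)

# 0.8.0: set_seq_id/get_seq_id take a token index i and a column index j;
# the bindings check i in [0, n_tokens) and j in [0, n_seq_id[i]).
batch.set_seq_id(0, 0, 1)
seq = batch.get_seq_id(0, 0)
```

The extra column index mirrors the bundled llama.cpp update (b1380 → b1405), where `seq_id` became a per-token array so that a single token can belong to more than one sequence.
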
@@ -5,7 +5,7 @@ require 'fileutils'

  abort 'libstdc++ is not found.' unless have_library('stdc++')

- $srcs = %w[ggml.c ggml-alloc.c llama.cpp llama_cpp.cpp]
+ $srcs = %w[ggml.c ggml-backend.c ggml-alloc.c llama.cpp llama_cpp.cpp]
  $srcs << 'ggml-opencl.cpp' if with_config('clblast')
  $srcs << 'ggml-mpi.c' if with_config('mpi')
  $CFLAGS << ' -w -DNDEBUG'
@@ -63,8 +63,8 @@ public:
  rb_define_method(rb_cLLaMABatch, "get_token", RUBY_METHOD_FUNC(_llama_batch_get_token), 1);
  rb_define_method(rb_cLLaMABatch, "set_pos", RUBY_METHOD_FUNC(_llama_batch_set_pos), 2);
  rb_define_method(rb_cLLaMABatch, "get_pos", RUBY_METHOD_FUNC(_llama_batch_get_pos), 1);
- rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 2);
- rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 1);
+ rb_define_method(rb_cLLaMABatch, "set_seq_id", RUBY_METHOD_FUNC(_llama_batch_set_seq_id), 3);
+ rb_define_method(rb_cLLaMABatch, "get_seq_id", RUBY_METHOD_FUNC(_llama_batch_get_seq_id), 2);
  rb_define_method(rb_cLLaMABatch, "set_logits", RUBY_METHOD_FUNC(_llama_batch_set_logits), 2);
  rb_define_method(rb_cLLaMABatch, "get_logits", RUBY_METHOD_FUNC(_llama_batch_get_logits), 1);
  }
@@ -74,10 +74,10 @@ private:

  static VALUE _llama_batch_initialize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
- ID kw_table[2] = { rb_intern("n_tokens"), rb_intern("embd") };
- VALUE kw_values[2] = { Qundef, Qundef };
+ ID kw_table[3] = { rb_intern("n_tokens"), rb_intern("embd"), rb_intern("n_seq_max") };
+ VALUE kw_values[3] = { Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 2, 0, kw_values);
+ rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);

  if (!RB_INTEGER_TYPE_P(kw_values[0])) {
  rb_raise(rb_eArgError, "n_tokens must be an integer");
@@ -87,12 +87,17 @@ private:
  rb_raise(rb_eArgError, "embd must be an integer");
  return Qnil;
  }
+ if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+ rb_raise(rb_eArgError, "n_seq_max must be an integer");
+ return Qnil;
+ }

  const int32_t n_tokens = NUM2INT(kw_values[0]);
  const int32_t embd = NUM2INT(kw_values[1]);
+ const int32_t n_seq_max = NUM2INT(kw_values[2]);

  LLaMABatchWrapper* ptr = get_llama_batch(self);
- ptr->batch = llama_batch_init(n_tokens, embd);
+ ptr->batch = llama_batch_init(n_tokens, embd, n_seq_max);

  return Qnil;
  }
@@ -190,25 +195,35 @@ private:
  }

  // seq_id
- static VALUE _llama_batch_set_seq_id(VALUE self, VALUE idx, VALUE value) {
+ static VALUE _llama_batch_set_seq_id(VALUE self, VALUE i_, VALUE j_, VALUE value) {
  LLaMABatchWrapper* ptr = get_llama_batch(self);
- const int32_t id = NUM2INT(idx);
- if (id < 0 || id >= ptr->batch.n_tokens) {
- rb_raise(rb_eArgError, "id must be in [0, n_tokens)");
+ const int32_t i = NUM2INT(i_);
+ if (i < 0 || i >= ptr->batch.n_tokens) {
+ rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
+ return Qnil;
+ }
+ const int32_t j = NUM2INT(j_);
+ if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
+ rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
  return Qnil;
  }
- ptr->batch.seq_id[id] = NUM2INT(value);
- return INT2NUM(ptr->batch.seq_id[id]);
+ ptr->batch.seq_id[i][j] = NUM2INT(value);
+ return INT2NUM(ptr->batch.seq_id[i][j]);
  }

- static VALUE _llama_batch_get_seq_id(VALUE self, VALUE idx) {
+ static VALUE _llama_batch_get_seq_id(VALUE self, VALUE i_, VALUE j_) {
  LLaMABatchWrapper* ptr = get_llama_batch(self);
- const int32_t id = NUM2INT(idx);
- if (id < 0 || id >= ptr->batch.n_tokens) {
- rb_raise(rb_eArgError, "id must be in [0, n_tokens)");
+ const int32_t i = NUM2INT(i_);
+ if (i < 0 || i >= ptr->batch.n_tokens) {
+ rb_raise(rb_eArgError, "i must be in [0, n_tokens)");
+ return Qnil;
+ }
+ const int32_t j = NUM2INT(j_);
+ if (j < 0 || j >= ptr->batch.n_seq_id[i]) {
+ rb_raise(rb_eArgError, "j must be in [0, n_seq_id[i])");
  return Qnil;
  }
- return INT2NUM(ptr->batch.seq_id[id]);
+ return INT2NUM(ptr->batch.seq_id[i][j]);
  }

  // logits
@@ -1319,10 +1334,10 @@ private:

  static VALUE _llama_model_tokenize(int argc, VALUE* argv, VALUE self) {
  VALUE kw_args = Qnil;
- ID kw_table[3] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos") };
- VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+ ID kw_table[4] = { rb_intern("text"), rb_intern("n_max_tokens"), rb_intern("add_bos"), rb_intern("special") };
+ VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
  rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 2, kw_values);
+ rb_get_kwargs(kw_args, kw_table, 1, 3, kw_values);

  if (!RB_TYPE_P(kw_values[0], T_STRING)) {
  rb_raise(rb_eArgError, "text must be a String");
@@ -1336,15 +1351,20 @@ private:
  rb_raise(rb_eArgError, "add_bos must be a boolean");
  return Qnil;
  }
+ if (kw_values[3] != Qundef && (kw_values[3] != Qtrue && kw_values[3] != Qfalse)) {
+ rb_raise(rb_eArgError, "special must be a boolean");
+ return Qnil;
+ }

  VALUE text_ = kw_values[0];
  std::string text = StringValueCStr(text_);
  const bool add_bos = kw_values[2] == Qtrue ? true : false;
+ const bool special = kw_values[3] == Qtrue ? true : false;
  const int n_max_tokens = kw_values[1] != Qundef ? NUM2INT(kw_values[1]) : text.size() + (add_bos ? 1 : 0);

  llama_token* tokens = ALLOCA_N(llama_token, n_max_tokens);
  LLaMAModelWrapper* ptr = get_llama_model(self);
- const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos);
+ const int n_tokens = llama_tokenize(ptr->model, text.c_str(), text.size(), tokens, n_max_tokens, add_bos, special);

  if (n_tokens < 0) {
  rb_raise(rb_eRuntimeError, "failed to tokenize. The numebr of tokens (%d) is greater than n_max_tokens.", -n_tokens);
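
The hunk above threads a new optional `special` keyword from Ruby down to `llama_tokenize`; when it is true, special/control token strings in the input are tokenized as such rather than treated as plain text. A hedged usage sketch (model loading and the `LLaMACpp::Model`/`LLaMACpp::ModelParams` names are assumptions about the gem's API, not part of this diff):

```ruby
require 'llama_cpp'

# Assumed setup: load a GGUF model through the gem's Model class.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)

# 0.8.0: `special:` is optional and treated as false unless passed as true.
tokens = model.tokenize(text: '<s>Hello world', add_bos: false, special: true)
```
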
@@ -1,4 +1,5 @@
  #include "ggml-alloc.h"
+ #include "ggml-backend.h"
  #include "ggml.h"
  #include <assert.h>
  #include <stdarg.h>
@@ -6,25 +7,6 @@
  #include <stdlib.h>
  #include <string.h>

- #ifdef __has_include
- #if __has_include(<unistd.h>)
- #include <unistd.h>
- #if defined(_POSIX_MAPPED_FILES)
- #include <sys/types.h>
- #include <sys/mman.h>
- #endif
- #endif
- #endif
-
- #if defined(_WIN32)
- #define WIN32_LEAN_AND_MEAN
- #ifndef NOMINMAX
- #define NOMINMAX
- #endif
- #include <windows.h>
- #include <memoryapi.h>
- #endif
-

  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
  #define MAX_FREE_BLOCKS 256

  struct ggml_allocr {
+ struct ggml_backend_buffer * buffer;
+ bool buffer_owned;
  void * data;
- size_t size;
  size_t alignment;
  int n_free_blocks;
  struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
  }
  #endif

- static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- return ggml_nbytes(tensor);
-
- UNUSED(alloc);
- }
-
  // check if a tensor is allocated by this buffer
  static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
- void * ptr = tensor->data;
- return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+ return tensor->buffer == alloc->buffer;
  }

  static bool ggml_is_view(struct ggml_tensor * t) {
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
  }

  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- #ifdef GGML_ALLOCATOR_DEBUG
  GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
- #endif
- size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+
+ size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);

  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)

  tensor->data = addr;
  AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+ tensor->buffer = alloc->buffer;
+ ggml_backend_buffer_init_tensor(alloc->buffer, tensor);

  #ifdef GGML_ALLOCATOR_DEBUG
  add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)

  // this is a very naive implementation, but for our case the number of free blocks should be very small
  static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- void * ptr = tensor->data;
-
  if (ggml_allocr_is_own(alloc, tensor) == false) {
  // the tensor was not allocated in this buffer
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
  // the easiest way to deal with this is just to ignore it
+ AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
  return;
  }

- size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+ void * ptr = tensor->data;
+
+ size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);
  AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
- AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+
+ ggml_backend_buffer_free_tensor(alloc->buffer, tensor);

  #ifdef GGML_ALLOCATOR_DEBUG
  remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
  alloc->n_free_blocks = 1;
  size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
  alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
- alloc->free_blocks[0].size = alloc->size - align_offset;
+ alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
  }

  struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+ struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));

  *alloc = (struct ggml_allocr){
- /*.data = */ data,
- /*.size = */ size,
+ /*.buffer = */ buffer,
+ /*.buffer_owned = */ true,
+ /*.base = */ ggml_backend_buffer_get_base(buffer),
  /*.alignment = */ alignment,
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  return alloc;
  }

- // OS specific functions to allocate and free uncommitted virtual memory
- static void * alloc_vmem(size_t size) {
- #if defined(_WIN32)
- return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
- #elif defined(_POSIX_MAPPED_FILES)
- void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
- if (ptr == MAP_FAILED) {
- return NULL;
- }
- return ptr;
- #else
- // use a fixed address for other platforms
- uintptr_t base_addr = (uintptr_t)-size - 0x100;
- return (void *)base_addr;
- #endif
- }
-
- static void free_vmem(void * base_addr, size_t size) {
- #if defined(_WIN32)
- VirtualFree(base_addr, 0, MEM_RELEASE);
- UNUSED(size);
- #elif defined(_POSIX_MAPPED_FILES)
- munmap(base_addr, size);
- #else
- // nothing to do
- UNUSED(base_addr);
- UNUSED(size);
- #endif
- }
-
- // allocate uncommitted virtual memory to measure the size of the graph
- static void alloc_measure_vmem(void ** base_addr, size_t * size) {
- // 128GB for 64-bit, 1GB for 32-bit
- *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
- do {
- *base_addr = alloc_vmem(*size);
- if (*base_addr != NULL) {
- AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
- return;
- }
- // try again with half the size
- *size /= 2;
- } while (*size > 0);
-
- GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
- }
-
- static void free_measure_vmem(void * base_addr, size_t size) {
- free_vmem(base_addr, size);
- }
-
  struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
- struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+ struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+ alloc->measure = true;

- void * base_addr;
- size_t size;
+ return alloc;
+ }

- alloc_measure_vmem(&base_addr, &size);
+ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+ struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));

  *alloc = (struct ggml_allocr){
- /*.data = */ base_addr,
- /*.size = */ size,
- /*.alignment = */ alignment,
+ /*.buffer = */ buffer,
+ /*.buffer_owned = */ false,
+ /*.base = */ ggml_backend_buffer_get_base(buffer),
+ /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
  /*.hash_table = */ {{0}},
  /*.max_size = */ 0,
- /*.measure = */ true,
+ /*.measure = */ false,
  /*.parse_seq = */ {0},
  /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  }

  void ggml_allocr_free(struct ggml_allocr * alloc) {
- if (alloc->measure) {
- free_measure_vmem(alloc->data, alloc->size);
+ if (alloc->buffer_owned) {
+ ggml_backend_buffer_free(alloc->buffer);
  }
  free(alloc);
  }
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  case GGML_OP_ROPE:
  case GGML_OP_RMS_NORM:
  case GGML_OP_SOFT_MAX:
- case GGML_OP_CONT:
  return true;

  default:
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  }
  }

+ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+ assert(view->view_src != NULL && view->view_src->data != NULL);
+ view->backend = view->view_src->backend;
+ view->buffer = view->view_src->buffer;
+ view->data = (char *)view->view_src->data + view->view_offs;
+
+ // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+ // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+ assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+ ggml_backend_buffer_init_tensor(alloc->buffer, view);
+ }
+
  static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
  struct hash_node * ht = alloc->hash_table;
  if (node->data == NULL) {
  if (ggml_is_view(node)) {
- assert(node->view_src->data != NULL);
- node->data = (char *)node->view_src->data + node->view_offs;
+ init_view(alloc, node);
  } else {
  // see if we can reuse a parent's buffer (inplace)
  if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
  // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
- node->data = parent->data;
+ node->view_src = view_src;
+ view_src_hn->n_views += 1;
+ init_view(alloc, node);
  return;
  }
  }
  else {
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
- node->data = parent->data;
+ node->view_src = parent;
+ p_hn->n_views += 1;
+ init_view(alloc, node);
  return;
  }
  }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }
  }

- static size_t ggml_allocr_alloc_graph_tensors_n(
+ size_t ggml_allocr_alloc_graph_n(
  struct ggml_allocr * alloc,
  struct ggml_cgraph ** graphs, int n_graphs,
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
  if (ggml_is_view(node)) {
  struct ggml_tensor * view_src = node->view_src;
  hash_get(ht, view_src)->n_views += 1;
+ if (node->buffer == NULL && node->data != NULL) {
+ // view of a pre-allocated tensor, didn't call init_view() yet
+ init_view(alloc, node);
+ }
  }

  for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
  break;
  }
  hash_get(ht, parent)->n_children += 1;
+ if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+ init_view(alloc, parent);
+ }
  }
  }
  }
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
  }

  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
- return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
  }

  size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
@@ -6,21 +6,27 @@
  extern "C" {
  #endif

+ struct ggml_backend_buffer;

  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+ GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);

  // tell the allocator to parse nodes following the order described in the list
  // you should call this if your graph are optimized to execute out-of-order
  GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);

- GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
- GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
- GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
- GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+ GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
+ GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
+ GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
+ GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
  GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
- GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
+ GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);

+ GGML_API size_t ggml_allocr_alloc_graph_n(
+ struct ggml_allocr * alloc,
+ struct ggml_cgraph ** graphs, int n_graphs,
+ struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);

  #ifdef __cplusplus
  }