llama_cpp 0.4.0 → 0.5.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
-  data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
+  metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
+  data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
 SHA512:
-  metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
-  data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
+  metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
+  data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
+## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+- Bump bundled llama.cpp from master-b1140 to master-b1198.
+
+## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
+
+**Breaking Changes**
+- Bump bundled llama.cpp from master-b1060 to master-b1140.
+- Rename `token_to_str` method on Context to `token_to_piece` method.
+- Rename `token_to_str` method on Model to `token_to_piece` method.
+- Rename `type` method on Model to `desc` method.
+- Add `size` and `n_params` methods to Model.
+
 ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
 **Breaking Changes**
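For readers upgrading across these breaking changes, here is a minimal Ruby sketch of the renamed and added Model methods. The model path is a placeholder, and the `LLaMACpp::Model.new(model_path:, params:)` construction is an assumption about the 0.5.x API drawn from the gem's usual usage pattern, not something contained in this diff:

```ruby
require 'llama_cpp'

# Placeholder path; point this at a real GGUF model file.
params = LLaMACpp::ContextParams.new
model  = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: params)

puts model.desc      # renamed from `type` in 0.5.0: short description of the model
puts model.size      # added in 0.5.0: model size in bytes
puts model.n_params  # added in 0.5.0: number of parameters

# `token_to_str` is now `token_to_piece` on both Model and Context.
puts model.token_to_piece(123) # 123 is an arbitrary token id, for illustration only
```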
data/examples/chat.rb CHANGED
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
       if input_echo
         output = []
-        embd.each { |token| output << context.token_to_str(token) }
+        embd.each { |token| output << context.token_to_piece(token) }
         output_str = output.join
         output_str.chomp!(antiprompt) if first_input
         print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
       if embd_input.size <= n_consumed
         if antiprompt.size.positive?
           last_output = []
-          last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+          last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
           last_output_str = last_output.join
 
           search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
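Both hunks above follow the same pattern: map each token id through `token_to_piece` and join the resulting pieces. A minimal sketch of that pattern as a helper, assuming `context` is an initialized `LLaMACpp::Context` and `tokens` is an Array of Integer token ids:

```ruby
# Rebuild text from generated token ids using the renamed API.
def detokenize(context, tokens)
  tokens.map { |token| context.token_to_piece(token) }.join
end
```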
@@ -50,7 +50,7 @@ if with_config('accelerate')
 end
 
 if with_config('metal')
-  $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+  $CFLAGS << ' -DGGML_USE_METAL'
   $CXXFLAGS << ' -DGGML_USE_METAL'
   $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
   $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
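This hunk is from the gem's mkmf build script: the only change is that `-DGGML_METAL_NDEBUG` is no longer appended to `$CFLAGS`, so the bundled Metal backend keeps its debug logging. The branch remains guarded by `with_config('metal')`, which is normally toggled at install time, e.g. `gem install llama_cpp -- --with-metal` (the exact install invocation is an assumption, not part of this diff).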
@@ -811,9 +811,11 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
-    rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
+    rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
-    rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
+    rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
+    rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
+    rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
   }
 
 private:
@@ -974,7 +976,7 @@ private:
     return INT2NUM(llama_model_n_embd(ptr->model));
   }
 
-  static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
+  static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
     if (!RB_INTEGER_TYPE_P(token_)) {
       rb_raise(rb_eArgError, "token must be an integer");
       return Qnil;
@@ -982,10 +984,10 @@ private:
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
     if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+      const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
@@ -1040,12 +1042,22 @@ private:
     return ret;
   }
 
-  static VALUE _llama_model_get_model_type(VALUE self) {
+  static VALUE _llama_model_get_model_desc(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     char buf[128];
-    ::llama_model_type(ptr->model, buf, sizeof(buf));
+    llama_model_desc(ptr->model, buf, sizeof(buf));
     return rb_str_new_cstr(buf);
   }
+
+  static VALUE _llama_model_get_model_size(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return UINT2NUM(llama_model_size(ptr->model));
+  }
+
+  static VALUE _llama_model_get_model_n_params(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return UINT2NUM(llama_model_n_params(ptr->model));
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1326,7 +1338,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
     rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
     rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
-    rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
+    rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
     rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
     rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
     rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
@@ -1567,7 +1579,7 @@ private:
     return output;
   }
 
-  static VALUE _llama_context_token_to_str(VALUE self, VALUE token_) {
+  static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
     LLaMAContextWrapper* ptr = get_llama_context(self);
     if (ptr->ctx == NULL) {
       rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
@@ -1575,10 +1587,10 @@ private:
     }
     const llama_token token = NUM2INT(token_);
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
    if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+      const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
@@ -1,3 +1,8 @@
+// defines MAP_ANONYMOUS
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "ggml-alloc.h"
 #include "ggml.h"
 #include <assert.h>
@@ -6,6 +11,26 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __has_include
+#if __has_include(<unistd.h>)
+#include <unistd.h>
+#if defined(_POSIX_MAPPED_FILES)
+#include <sys/types.h>
+#include <sys/mman.h>
+#endif
+#endif
+#endif
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <memoryapi.h>
+#endif
+
+
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,15 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-
-static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     return ggml_nbytes(tensor);
 
     UNUSED(alloc);
 }
 
+// check if a tensor is allocated by this buffer
+static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+}
+
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+#ifdef GGML_ALLOCATOR_DEBUG
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
+    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+#endif
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -131,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
         } else {
             fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                     __func__, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
-            return;
+            return;
         }
     }
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -173,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 }
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
     void * ptr = tensor->data;
 
-    if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+    if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
         return;
     }
 
-    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
 
@@ -268,7 +302,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.parse_seq = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ = {0},
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -277,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// address and size of the buffer when measuring
-// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
-static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
-static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+// OS specific functions to allocate and free uncommitted virtual memory
+static void * alloc_vmem(size_t size) {
+#if defined(_WIN32)
+    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+#elif defined(_POSIX_MAPPED_FILES)
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
+#else
+    // use a fixed address for other platforms
+    uintptr_t base_addr = (uintptr_t)-size - 0x100;
+    return (void *)base_addr;
+#endif
+}
+
+static void free_vmem(void * base_addr, size_t size) {
+#if defined(_WIN32)
+    VirtualFree(base_addr, 0, MEM_RELEASE);
+    UNUSED(size);
+#elif defined(_POSIX_MAPPED_FILES)
+    munmap(base_addr, size);
+#else
+    // nothing to do
+    UNUSED(base_addr);
+    UNUSED(size);
+#endif
+}
+
+// allocate uncommitted virtual memory to measure the size of the graph
+static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+    // 1TB for 64-bit, 1GB for 32-bit
+    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+    do {
+        *base_addr = alloc_vmem(*size);
+        if (*base_addr != NULL) {
+            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+            return;
+        }
+        // try again with half the size
+        *size /= 2;
+    } while (*size > 0);
+
+    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+}
+
+static void free_measure_vmem(void * base_addr, size_t size) {
+    free_vmem(base_addr, size);
+}
 
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
     struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+    void * base_addr;
+    size_t size;
+
+    alloc_measure_vmem(&base_addr, &size);
+
     *alloc = (struct ggml_allocr){
-        /*.data = */ MEASURE_BASE_ADDR,
-        /*.size = */ MEASURE_MAX_SIZE,
+        /*.data = */ base_addr,
+        /*.size = */ size,
         /*.alignment = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks = */ {{0}},
@@ -297,7 +382,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.parse_seq = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ = {0},
+        /*.allocated_tensors = */ {0},
 #endif
     };
 
@@ -307,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
+    if (alloc->measure) {
+        free_measure_vmem(alloc->data, alloc->size);
+    }
     free(alloc);
 }
 
@@ -317,8 +405,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
 //////////// compute graph allocator
 
 static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
-           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+    return t->view_src != NULL;
 }
 
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -336,28 +423,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
     return true;
 }
 
-static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
-    switch (t->op) {
-        case GGML_OP_PERMUTE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_VIEW:
-            return t->src[0];
-        case GGML_OP_CPY:
-            return t->src[1];
-        default:
-            return NULL;
-    }
-}
-
-static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
-    struct ggml_tensor * parent = t;
-    do {
-        parent = get_view_parent(parent);
-    } while (ggml_is_view(parent));
-    return parent;
-}
-
 static bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
@@ -365,7 +430,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
-        case GGML_OP_ACC:
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -375,7 +439,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_UNARY:
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SET:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_CONT:
             return true;
@@ -389,24 +452,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            size_t offset;
-            switch(node->op) {
-                case GGML_OP_VIEW:
-                    memcpy(&offset, node->op_params, sizeof(size_t));
-                    node->data = (char *) node->src[0]->data + offset;
-                    break;
-                case GGML_OP_PERMUTE:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_TRANSPOSE:
-                    node->data = node->src[0]->data;
-                    break;
-                case GGML_OP_CPY:
-                    node->data = node->src[1]->data;
-                    break;
-                default:
-                    GGML_ASSERT(!"unknown view op");
-                    break;
-            }
+            assert(node->view_src->data != NULL);
+            node->data = (char *)node->view_src->data + node->view_offs;
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -417,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 }
 
                 // if the node's data is external, then we cannot re-use it
-                if ((char *) parent->data < (char *) alloc->data ||
-                    (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                if (ggml_allocr_is_own(alloc, parent) == false) {
                     AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
                     continue;
                 }
@@ -426,7 +472,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                 struct hash_node * p_hn = hash_get(ht, parent);
                 if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                     if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
+                        struct ggml_tensor * view_src = parent->view_src;
                         struct hash_node * view_src_hn = hash_get(ht, view_src);
                         if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                             // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -452,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t ggml_allocator_alloc_graph_tensors_n(
+static size_t ggml_allocr_alloc_graph_tensors_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -468,7 +514,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
         struct ggml_tensor * node = gf->nodes[i];
 
         if (ggml_is_view(node)) {
-            struct ggml_tensor * view_src = get_view_source(node);
+            struct ggml_tensor * view_src = node->view_src;
             hash_get(ht, view_src)->n_views += 1;
         }
 
@@ -530,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
             AT_PRINTF("\n");
         }
 
-
         // update parents
         // update immediately if there is no parse_seq
         // update only at barriers if there is parse_seq
-        if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+        if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
            int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
            int update_end = alloc->parse_seq_len ? ind : ind + 1;
            for (int i = update_start; i < update_end; i++) {
@@ -553,17 +598,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 
                 if (p_hn->n_children == 0 && p_hn->n_views == 0) {
                     if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = get_view_source(parent);
+                        struct ggml_tensor * view_src = parent->view_src;
                         struct hash_node * view_src_hn = hash_get(ht, view_src);
                         view_src_hn->n_views -= 1;
-                        AT_PRINTF("view_src %s\n", view_src->name);
+                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                         if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, view_src);
+                            ggml_allocr_free_tensor(alloc, view_src);
                         }
                     }
                     else {
                         if (parent->data != node->data) {
-                            ggml_allocator_free_tensor(alloc, parent);
+                            ggml_allocr_free_tensor(alloc, parent);
                         }
                     }
                 }
@@ -580,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
         for (int i = 0; outputs[g][i] != NULL; i++) {
             struct ggml_tensor * output = outputs[g][i];
             AT_PRINTF("output: %s\n", output->name);
-            ggml_allocator_free_tensor(alloc, output);
+            ggml_allocr_free_tensor(alloc, output);
         }
     }
 }
@@ -589,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
 }