llama_cpp 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
- data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
+ metadata.gz: fd67587510fff74b8b1d55e2e5861711709dfb5d8c44cf40b3bf762276e57d5b
+ data.tar.gz: 5cb5319136e538eb2ec9a6406caaaacdabdb2dceec5cade43769eda1b02de9c5
  SHA512:
- metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
- data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
+ metadata.gz: c2ab28fe9bf5674976ff2e676ea4d76157bd2ebf24b92ca2f959a6cdf2c19de94fe95d76ab21ca313d9017f835387b0f9ad616cb3700024fc5394fa1e9984fda
+ data.tar.gz: 0ce0be3db250eb7d35f3784bd7a3bd54e7ab8833378745417da3504f69bc31910d4fec459d29ad28218fce2614e8321462e9873c96ed1c3793eb5f9bbe5a9eac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,16 @@
+ ## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+ - Bump bundled llama.cpp from master-b1140 to master-b1198.
+
+ ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from master-b1060 to master-b1140.
+ - Rename `token_to_str` method on Context to `token_to_piece` method.
+ - Rename `token_to_str` method on Model to `token_to_piece` method.
+ - Rename `type` method on Model to `desc` method.
+ - Add `size` and `n_params` methods to Model.
+
  ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
 
  **Breaking Changes**
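Seen from the Ruby side, the 0.5.0 renames and additions look roughly like the sketch below. Only `desc`, `size`, `n_params`, and `token_to_piece` come from the changes above; the model path, the constructor keyword arguments, and the `tokenize` arguments are assumptions based on the gem's usual usage, not part of this diff.

```ruby
require 'llama_cpp'

# Assumed setup (not taken from this diff): build a model and a context.
params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

puts model.desc      # was `model.type` in 0.4.0; short model description string
puts model.size      # new in 0.5.0; model size in bytes
puts model.n_params  # new in 0.5.0; number of parameters

tokens = context.tokenize(text: 'Hello, World.', add_bos: true)
text   = tokens.map { |t| context.token_to_piece(t) }.join # was `token_to_str` in 0.4.0
puts text
```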
data/examples/chat.rb CHANGED
@@ -122,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
  if input_echo
  output = []
- embd.each { |token| output << context.token_to_str(token) }
+ embd.each { |token| output << context.token_to_piece(token) }
  output_str = output.join
  output_str.chomp!(antiprompt) if first_input
  print(output_str)
@@ -131,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  if embd_input.size <= n_consumed
  if antiprompt.size.positive?
  last_output = []
- last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+ last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
  last_output_str = last_output.join
 
  search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
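In user code, the same rename collapses these decoding loops into a small helper; a minimal sketch, assuming `context` is an initialized `LLaMACpp::Context`:

```ruby
# Join the pieces of an array of token ids back into a string (0.5.x API).
def detokenize(context, tokens)
  tokens.map { |token| context.token_to_piece(token) }.join
end
```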
data/ext/llama_cpp/extconf.rb CHANGED
@@ -50,7 +50,7 @@ if with_config('accelerate')
  end
 
  if with_config('metal')
- $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+ $CFLAGS << ' -DGGML_USE_METAL'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
  $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -811,9 +811,11 @@ public:
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
- rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
+ rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
- rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
+ rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
+ rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
+ rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
  }
 
  private:
@@ -974,7 +976,7 @@ private:
  return INT2NUM(llama_model_n_embd(ptr->model));
  }
 
- static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
+ static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
  if (!RB_INTEGER_TYPE_P(token_)) {
  rb_raise(rb_eArgError, "token must be an integer");
  return Qnil;
@@ -982,10 +984,10 @@ private:
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -1040,12 +1042,22 @@ private:
  return ret;
  }
 
- static VALUE _llama_model_get_model_type(VALUE self) {
+ static VALUE _llama_model_get_model_desc(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  char buf[128];
- ::llama_model_type(ptr->model, buf, sizeof(buf));
+ llama_model_desc(ptr->model, buf, sizeof(buf));
  return rb_str_new_cstr(buf);
  }
+
+ static VALUE _llama_model_get_model_size(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return UINT2NUM(llama_model_size(ptr->model));
+ }
+
+ static VALUE _llama_model_get_model_n_params(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return UINT2NUM(llama_model_n_params(ptr->model));
+ }
  };
 
  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1326,7 +1338,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
  rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
  rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
- rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
+ rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
@@ -1567,7 +1579,7 @@ private:
  return output;
  }
 
- static VALUE _llama_context_token_to_str(VALUE self, VALUE token_) {
+ static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
@@ -1575,10 +1587,10 @@ private:
  }
  const llama_token token = NUM2INT(token_);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
data/ext/llama_cpp/src/ggml-alloc.c CHANGED
@@ -1,3 +1,8 @@
+ // defines MAP_ANONYMOUS
+ #ifndef _GNU_SOURCE
+ #define _GNU_SOURCE
+ #endif
+
  #include "ggml-alloc.h"
  #include "ggml.h"
  #include <assert.h>
@@ -6,6 +11,26 @@
  #include <stdlib.h>
  #include <string.h>
 
+ #ifdef __has_include
+ #if __has_include(<unistd.h>)
+ #include <unistd.h>
+ #if defined(_POSIX_MAPPED_FILES)
+ #include <sys/types.h>
+ #include <sys/mman.h>
+ #endif
+ #endif
+ #endif
+
+ #if defined(_WIN32)
+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
+ #include <windows.h>
+ #include <memoryapi.h>
+ #endif
+
+
  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,15 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
  }
  #endif
 
-
- static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  return ggml_nbytes(tensor);
 
  UNUSED(alloc);
  }
 
+ // check if a tensor is allocated by this buffer
+ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+ void * ptr = tensor->data;
+ return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+ }
+
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+ #ifdef GGML_ALLOCATOR_DEBUG
+ GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
+ GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
+ #endif
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);
 
  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -131,14 +165,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  if (best_fit_block == -1) {
  // the last block is our last resort
  struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+ max_avail = MAX(max_avail, block->size);
  if (block->size >= size) {
  best_fit_block = alloc->n_free_blocks - 1;
- max_avail = MAX(max_avail, block->size);
  } else {
  fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
  __func__, size, max_avail);
  GGML_ASSERT(!"not enough space in the buffer");
- return;
+ return;
  }
  }
  struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -173,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  }
 
  // this is a very naive implementation, but for our case the number of free blocks should be very small
- static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  void * ptr = tensor->data;
 
- if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+ if (ggml_allocr_is_own(alloc, tensor) == false) {
  // the tensor was not allocated in this buffer
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
  // the easiest way to deal with this is just to ignore it
  return;
  }
 
- size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);
  AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
 
@@ -268,7 +302,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  /*.parse_seq = */ {0},
  /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ = {0},
+ /*.allocated_tensors = */ {0},
  #endif
  };
 
@@ -277,17 +311,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  return alloc;
  }
 
- // address and size of the buffer when measuring
- // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
- static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
- static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+ // OS specific functions to allocate and free uncommitted virtual memory
+ static void * alloc_vmem(size_t size) {
+ #if defined(_WIN32)
+ return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+ #elif defined(_POSIX_MAPPED_FILES)
+ void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (ptr == MAP_FAILED) {
+ return NULL;
+ }
+ return ptr;
+ #else
+ // use a fixed address for other platforms
+ uintptr_t base_addr = (uintptr_t)-size - 0x100;
+ return (void *)base_addr;
+ #endif
+ }
+
+ static void free_vmem(void * base_addr, size_t size) {
+ #if defined(_WIN32)
+ VirtualFree(base_addr, 0, MEM_RELEASE);
+ UNUSED(size);
+ #elif defined(_POSIX_MAPPED_FILES)
+ munmap(base_addr, size);
+ #else
+ // nothing to do
+ UNUSED(base_addr);
+ UNUSED(size);
+ #endif
+ }
+
+ // allocate uncommitted virtual memory to measure the size of the graph
+ static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+ // 1TB for 64-bit, 1GB for 32-bit
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+ do {
+ *base_addr = alloc_vmem(*size);
+ if (*base_addr != NULL) {
+ AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+ return;
+ }
+ // try again with half the size
+ *size /= 2;
+ } while (*size > 0);
+
+ GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+ }
+
+ static void free_measure_vmem(void * base_addr, size_t size) {
+ free_vmem(base_addr, size);
+ }
 
  struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
 
+ void * base_addr;
+ size_t size;
+
+ alloc_measure_vmem(&base_addr, &size);
+
  *alloc = (struct ggml_allocr){
- /*.data = */ MEASURE_BASE_ADDR,
- /*.size = */ MEASURE_MAX_SIZE,
+ /*.data = */ base_addr,
+ /*.size = */ size,
  /*.alignment = */ alignment,
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
@@ -297,7 +382,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  /*.parse_seq = */ {0},
  /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ = {0},
+ /*.allocated_tensors = */ {0},
  #endif
  };
 
@@ -307,6 +392,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  }
 
  void ggml_allocr_free(struct ggml_allocr * alloc) {
+ if (alloc->measure) {
+ free_measure_vmem(alloc->data, alloc->size);
+ }
  free(alloc);
  }
 
@@ -317,8 +405,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
  //////////// compute graph allocator
 
  static bool ggml_is_view(struct ggml_tensor * t) {
- return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
- t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+ return t->view_src != NULL;
  }
 
  static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -336,28 +423,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
  return true;
  }
 
- static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
- switch (t->op) {
- case GGML_OP_PERMUTE:
- case GGML_OP_RESHAPE:
- case GGML_OP_TRANSPOSE:
- case GGML_OP_VIEW:
- return t->src[0];
- case GGML_OP_CPY:
- return t->src[1];
- default:
- return NULL;
- }
- }
-
- static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
- struct ggml_tensor * parent = t;
- do {
- parent = get_view_parent(parent);
- } while (ggml_is_view(parent));
- return parent;
- }
-
  static bool ggml_op_can_inplace(enum ggml_op op) {
  switch (op) {
  case GGML_OP_SCALE:
@@ -365,7 +430,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  case GGML_OP_DIAG_MASK_INF:
  case GGML_OP_ADD:
  case GGML_OP_ADD1:
- case GGML_OP_ACC:
  case GGML_OP_SUB:
  case GGML_OP_MUL:
  case GGML_OP_DIV:
@@ -375,7 +439,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
  case GGML_OP_UNARY:
  case GGML_OP_ROPE:
  case GGML_OP_RMS_NORM:
- case GGML_OP_SET:
  case GGML_OP_SOFT_MAX:
  case GGML_OP_CONT:
  return true;
@@ -389,24 +452,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  struct hash_node * ht = alloc->hash_table;
  if (node->data == NULL) {
  if (ggml_is_view(node)) {
- size_t offset;
- switch(node->op) {
- case GGML_OP_VIEW:
- memcpy(&offset, node->op_params, sizeof(size_t));
- node->data = (char *) node->src[0]->data + offset;
- break;
- case GGML_OP_PERMUTE:
- case GGML_OP_RESHAPE:
- case GGML_OP_TRANSPOSE:
- node->data = node->src[0]->data;
- break;
- case GGML_OP_CPY:
- node->data = node->src[1]->data;
- break;
- default:
- GGML_ASSERT(!"unknown view op");
- break;
- }
+ assert(node->view_src->data != NULL);
+ node->data = (char *)node->view_src->data + node->view_offs;
  } else {
  // see if we can reuse a parent's buffer (inplace)
  if (ggml_op_can_inplace(node->op)) {
@@ -417,8 +464,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }
 
  // if the node's data is external, then we cannot re-use it
- if ((char *) parent->data < (char *) alloc->data ||
- (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+ if (ggml_allocr_is_own(alloc, parent) == false) {
  AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
  continue;
  }
@@ -426,7 +472,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  struct hash_node * p_hn = hash_get(ht, parent);
  if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
  if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
+ struct ggml_tensor * view_src = parent->view_src;
  struct hash_node * view_src_hn = hash_get(ht, view_src);
  if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
  // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -452,7 +498,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }
  }
 
- static size_t ggml_allocator_alloc_graph_tensors_n(
+ static size_t ggml_allocr_alloc_graph_tensors_n(
  struct ggml_allocr * alloc,
  struct ggml_cgraph ** graphs, int n_graphs,
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -468,7 +514,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  struct ggml_tensor * node = gf->nodes[i];
 
  if (ggml_is_view(node)) {
- struct ggml_tensor * view_src = get_view_source(node);
+ struct ggml_tensor * view_src = node->view_src;
  hash_get(ht, view_src)->n_views += 1;
  }
 
@@ -530,11 +576,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  AT_PRINTF("\n");
  }
 
-
  // update parents
  // update immediately if there is no parse_seq
  // update only at barriers if there is parse_seq
- if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+ if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
  int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
  int update_end = alloc->parse_seq_len ? ind : ind + 1;
  for (int i = update_start; i < update_end; i++) {
@@ -553,17 +598,17 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
 
  if (p_hn->n_children == 0 && p_hn->n_views == 0) {
  if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
+ struct ggml_tensor * view_src = parent->view_src;
  struct hash_node * view_src_hn = hash_get(ht, view_src);
  view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s\n", view_src->name);
+ AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
- ggml_allocator_free_tensor(alloc, view_src);
+ ggml_allocr_free_tensor(alloc, view_src);
  }
  }
  else {
  if (parent->data != node->data) {
- ggml_allocator_free_tensor(alloc, parent);
+ ggml_allocr_free_tensor(alloc, parent);
  }
  }
  }
@@ -580,7 +625,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  for (int i = 0; outputs[g][i] != NULL; i++) {
  struct ggml_tensor * output = outputs[g][i];
  AT_PRINTF("output: %s\n", output->name);
- ggml_allocator_free_tensor(alloc, output);
+ ggml_allocr_free_tensor(alloc, output);
  }
  }
  }
@@ -589,5 +634,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  }
 
  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
- return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
  }