llama_cpp 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 144a7130adb5ac32d31699bce809a6de6c3a6ecf8cfccca36ebdee436c28b645
-  data.tar.gz: d00b2c2db583e6e38d472033c7348f22e9614febdb633c4e454ca49e00d2fec6
+  metadata.gz: 6688a7296f7a7e7ba4aa593b2d9b792beb1d569f7f2e0e872e1dbda64a336b57
+  data.tar.gz: 3f683714c3b11b8f247d9ef40774b90e297c25f3bf2ab478e763bda9c983d73a
 SHA512:
-  metadata.gz: 2c30854fef304e0258250d9285bac8ab3ea014950d1638e88682029763a3e90eae36da1b3757b2441ff5a7a798401ee1e731bcfc014e7e651811726d7afea224
-  data.tar.gz: 10ea5bb5bf5d85a7e7030b514e2eb38650e9ce8a97ab339f63538b637d3c85293b406fea66c055a00f919c457a9a2af5c8f5710d0d31d702fe7e6f703b52933d
+  metadata.gz: d7dc061516e688624f4090b956fd40999c9e2e5d2ae41fe8a1baac3caaf61ed9aef3ef31e8ca971e0a210a592cb3618f67533483e5808e2e9205e2ba9a7dfcf8
+  data.tar.gz: aae1a4952d19aa186aa2ea97ce59af1dac7295f5430108aaf6545949218851b31c266472cf6111a62f7a5784c5f23fd3e3697f1181d5e659c217975890eed299
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+## [[0.7.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.7.0...v0.7.1)] - 2023-10-14
+
+- Bump bundled llama.cpp from b1334 to b1380.
+
 ## [[0.7.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.6.0...v0.7.0)] - 2023-10-07
 
 - Bump bundled llama.cpp from b1292 to b1334.
@@ -5,7 +5,7 @@ require 'fileutils'
 
 abort 'libstdc++ is not found.' unless have_library('stdc++')
 
-$srcs = %w[ggml.c ggml-alloc.c llama.cpp llama_cpp.cpp]
+$srcs = %w[ggml.c ggml-backend.c ggml-alloc.c llama.cpp llama_cpp.cpp]
 $srcs << 'ggml-opencl.cpp' if with_config('clblast')
 $srcs << 'ggml-mpi.c' if with_config('mpi')
 $CFLAGS << ' -w -DNDEBUG'
@@ -1,4 +1,5 @@
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
@@ -6,25 +7,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef __has_include
-#if __has_include(<unistd.h>)
-#include <unistd.h>
-#if defined(_POSIX_MAPPED_FILES)
-#include <sys/types.h>
-#include <sys/mman.h>
-#endif
-#endif
-#endif
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <memoryapi.h>
-#endif
-
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
+    struct ggml_backend_buffer * buffer;
+    bool buffer_owned;
     void * data;
-    size_t size;
     size_t alignment;
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    return ggml_nbytes(tensor);
-
-    UNUSED(alloc);
-}
-
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+    return tensor->buffer == alloc->buffer;
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-#ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-#endif
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     tensor->data = addr;
     AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+    tensor->buffer = alloc->buffer;
+    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-
     if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
+        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    void * ptr = tensor->data;
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+
+    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = alloc->size - align_offset;
+    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data          = */ data,
-        /*.size          = */ size,
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ true,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
-    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
-    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
-    if (ptr == MAP_FAILED) {
-        return NULL;
-    }
-    return ptr;
-#else
-    // use a fixed address for other platforms
-    uintptr_t base_addr = (uintptr_t)-size - 0x100;
-    return (void *)base_addr;
-#endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
-    VirtualFree(base_addr, 0, MEM_RELEASE);
-    UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
-    munmap(base_addr, size);
-#else
-    // nothing to do
-    UNUSED(base_addr);
-    UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    // 128GB for 64-bit, 1GB for 32-bit
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
-    do {
-        *base_addr = alloc_vmem(*size);
-        if (*base_addr != NULL) {
-            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
-            return;
-        }
-        // try again with half the size
-        *size /= 2;
-    } while (*size > 0);
-
-    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
-    free_vmem(base_addr, size);
-}
-
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+    alloc->measure = true;
 
-    void * base_addr;
-    size_t size;
+    return alloc;
+}
 
-    alloc_measure_vmem(&base_addr, &size);
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.data          = */ base_addr,
-        /*.size          = */ size,
-        /*.alignment     = */ alignment,
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ false,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
+        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
-        /*.measure       = */ true,
+        /*.measure       = */ false,
         /*.parse_seq     = */ {0},
         /*.parse_seq_len = */ 0,
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->measure) {
-        free_measure_vmem(alloc->data, alloc->size);
+    if (alloc->buffer_owned) {
+        ggml_backend_buffer_free(alloc->buffer);
     }
     free(alloc);
 }
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_CONT:
             return true;
 
         default:
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+    assert(view->view_src != NULL && view->view_src->data != NULL);
+    view->backend = view->view_src->backend;
+    view->buffer  = view->view_src->buffer;
+    view->data    = (char *)view->view_src->data + view->view_offs;
+
+    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+    assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+    ggml_backend_buffer_init_tensor(alloc->buffer, view);
+}
+
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-            assert(node->view_src->data != NULL);
-            node->data = (char *)node->view_src->data + node->view_offs;
+            init_view(alloc, node);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                     // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                     // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                     AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                    node->data = parent->data;
+                    node->view_src = view_src;
+                    view_src_hn->n_views += 1;
+                    init_view(alloc, node);
                     return;
                 }
             }
             else {
                 AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                node->data = parent->data;
+                node->view_src = parent;
+                p_hn->n_views += 1;
+                init_view(alloc, node);
                 return;
             }
         }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-static size_t ggml_allocr_alloc_graph_tensors_n(
+size_t ggml_allocr_alloc_graph_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
             if (ggml_is_view(node)) {
                 struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
+                if (node->buffer == NULL && node->data != NULL) {
+                    // view of a pre-allocated tensor, didn't call init_view() yet
+                    init_view(alloc, node);
+                }
             }
 
             for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                     break;
                 }
                 hash_get(ht, parent)->n_children += 1;
+                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+                    init_view(alloc, parent);
+                }
             }
         }
     }
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
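
Taken together, the hunks above from the bundled ggml-alloc.c replace the allocator's private virtual-memory machinery with ggml-backend buffers: ggml_allocr now records which ggml_backend_buffer owns each tensor and delegates per-tensor setup and teardown to ggml_backend_buffer_init_tensor and ggml_backend_buffer_free_tensor. The measure-then-allocate pattern the allocator is built around is unchanged; the sketch below illustrates that flow using only functions visible in this diff. The build_graph() helper and the alignment constant are illustrative assumptions, not part of the bundled sources or of the gem's Ruby API.

    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-alloc.h"

    // Hypothetical helper that builds the compute graph whose tensors the allocator should place.
    extern struct ggml_cgraph * build_graph(void);

    static void plan_and_run(void) {
        const size_t alignment = 32; // illustrative value; real code uses the backend's alignment

        // 1. Measure pass: the dummy allocator only computes how much memory the graph needs.
        struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
        size_t needed = ggml_allocr_alloc_graph(measure, build_graph()) + alignment;
        ggml_allocr_free(measure);

        // 2. Real pass: hand the allocator a concrete region; internally this becomes a CPU
        //    ggml_backend_buffer via ggml_backend_cpu_buffer_from_ptr (see ggml_allocr_new above).
        void * data = malloc(needed);
        struct ggml_allocr * alloc = ggml_allocr_new(data, needed, alignment);
        ggml_allocr_alloc_graph(alloc, build_graph());

        // ... evaluate the graph ...

        ggml_allocr_free(alloc); // frees the wrapping buffer (buffer_owned == true), not `data` itself
        free(data);
    }
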
@@ -6,21 +6,27 @@
 extern "C" {
 #endif
 
+struct ggml_backend_buffer;
 
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
 GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
-GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
-GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
-GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
-GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API void   ggml_allocr_free       (struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure (struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset      (struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_alloc      (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
-GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
+GGML_API size_t ggml_allocr_max_size   (struct ggml_allocr * alloc);
 
+GGML_API size_t ggml_allocr_alloc_graph_n(
+                    struct ggml_allocr * alloc,
+                    struct ggml_cgraph ** graphs, int n_graphs,
+                    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
 
 #ifdef __cplusplus
 }
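
The header hunk widens the allocator's public surface: it forward-declares struct ggml_backend_buffer, adds ggml_allocr_new_from_buffer for wrapping an existing backend buffer without taking ownership, and exports the multi-graph entry point ggml_allocr_alloc_graph_n (previously the file-local ggml_allocr_alloc_graph_tensors_n). The sketch below shows how a caller might use the two new declarations; the region/region_size parameters and the build_graph() helper are illustrative assumptions, not part of the bundled sources.

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h" // ggml_backend_buffer and ggml_backend_cpu_buffer_from_ptr, both referenced in this diff

    // Hypothetical graph builder, as in the previous sketch.
    extern struct ggml_cgraph * build_graph(void);

    static void allocate_in_existing_buffer(void * region, size_t region_size) {
        // Wrap caller-owned memory in a CPU backend buffer (the same call ggml_allocr_new makes internally).
        struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, region, region_size);

        // buffer_owned is false for this constructor, so the allocator will not free the buffer.
        struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer);

        struct ggml_cgraph * graph = build_graph();
        ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL); // equivalent to ggml_allocr_alloc_graph for one graph

        ggml_allocr_free(alloc);          // leaves `buffer` alive
        ggml_backend_buffer_free(buffer); // caller releases the buffer explicitly
    }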