llama_cpp 0.10.1 → 0.10.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9bd5dbea8695fdb41ea6d97e372c2cea452ee8ed070e26bd558a720d6c24fe27
- data.tar.gz: 8d7bfd02445df81644eebb3a6db49bb1ddf241a344fef76a0b949f1c12d0639d
+ metadata.gz: b4e94e20f142572fb46cff141e109025e1c5b91b9cd6cabfbaeac163a920bb82
+ data.tar.gz: 10fcd2922c057a0c960cbde83a0fa22a56eae6e21a73e20931cfe082108dec26
  SHA512:
- metadata.gz: 116423f4581f605ee379bcd690299e152087c03665be6171bc137205f0824be5d5e0ce6d3c0b548fc6d28193c601679c3cc27a101b6f5b58968c69388a70cbfc
- data.tar.gz: fd162b1c4e26732573d32ba7439fc44dd2ff09a9024c64ff5adce997e154a934d9556f0ff3cfd6b83dd7dcbfb8366cc6ce394b8324f39ef349d4cb5834ea43f1
+ metadata.gz: 570f5f257d947bad8489fcafe6c7cc0dc342d593132111d3479c0202e0998e444ca75c2140883dc8cf6a22f0f4f74fecd264a577cee6aabde4c168f5003b4c98
+ data.tar.gz: d105e41aac3bb39b2ee53c43eb320272ffa5290caa7e14e2f08bebbade94417aed065e01d23b8def9cce08fd6789576d33ba4082750647c0bbf9ff4c776db9ac
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+ ## [[0.10.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.1...v0.10.2)] - 2023-12-23
+
+ - Bump bundled llama.cpp from b1641 to b1686.
+ - Add `LLAMA_FILE_MAGIC_GGLA` constant.
+ - Add `n_batch` method to `Context`.
+
  ## [[0.10.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.10.0...v0.10.1)] - 2023-12-16

  - Bump bundled llama.cpp from b1620 to b1641.
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -1949,6 +1949,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+ rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
@@ -2201,7 +2202,16 @@ private:
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
- return INT2NUM(llama_n_ctx(ptr->ctx));
+ return UINT2NUM(llama_n_ctx(ptr->ctx));
+ }
+
+ static VALUE _llama_context_n_batch(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return UINT2NUM(llama_n_batch(ptr->ctx));
  }

  static VALUE _llama_context_get_timings(VALUE self) {
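For reference, the new Ruby-level `n_batch` reader defined above simply forwards to llama.cpp's `llama_n_batch`, just as `n_ctx` forwards to `llama_n_ctx`; both values now go through `UINT2NUM` because the upstream functions return unsigned integers. A minimal C sketch of the underlying calls, assuming a llama.cpp build of this era and a placeholder model path:

```c
// Sketch only: the llama.cpp C API that the binding above wraps.
// "model.gguf" is a placeholder path; error handling is omitted for brevity.
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init(false);

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx   = 2048;
    cparams.n_batch = 512;
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // These are the values surfaced in Ruby as Context#n_ctx and Context#n_batch.
    printf("n_ctx = %u, n_batch = %u\n", llama_n_ctx(ctx), llama_n_batch(ctx));

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

From Ruby, the same values are read as `context.n_ctx` and `context.n_batch` on an initialized `Context`.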
@@ -3146,6 +3156,11 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_ROPE_SCALING_MAX_VALUE", INT2NUM(LLAMA_ROPE_SCALING_MAX_VALUE));

  std::stringstream ss_magic;
+ ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
+
+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));

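As with the existing `LLAMA_FILE_MAGIC_GGSN` constant, the new `LLAMA_FILE_MAGIC_GGLA` is exposed to Ruby as a hex string rather than a raw integer. A sketch of the equivalent formatting in plain C, assuming the upstream definition is `0x67676c61u` (the ASCII bytes 'g', 'g', 'l', 'a'):

```c
// Sketch only: reproduce the "0x..." string the binding registers.
// The macro value below is an assumption mirroring llama.h.
#include <stdio.h>

#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u

int main(void) {
    char buf[16];
    // "%#x" mirrors the std::showbase << std::hex formatting used in the extension code.
    snprintf(buf, sizeof(buf), "%#x", LLAMA_FILE_MAGIC_GGLA);
    printf("%s\n", buf); // prints 0x67676c61
    return 0;
}
```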
data/ext/llama_cpp/src/ggml-alloc.c CHANGED
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
  if (update_backend) {
  view->backend = view->view_src->backend;
  }
- view->buffer = view->view_src->buffer;
+ // views are initialized in the alloc buffer rather than the view_src buffer
+ view->buffer = alloc->buffer;
  view->data = (char *)view->view_src->data + view->view_offs;

- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
  assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

  if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
  }

  void ggml_allocr_free(ggml_allocr_t alloc) {
+ if (alloc == NULL) {
+ return;
+ }
+
  ggml_gallocr_free(alloc->galloc);
  ggml_tallocr_free(alloc->talloc);
  free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }

  if (nbytes == 0) {
- fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+ // all the tensors in the context are already allocated
  return NULL;
  }

@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  } else {
  ggml_backend_view_init(buffer, t);
  }
+ } else {
+ if (t->view_src != NULL) {
+ // view of a pre-allocated tensor
+ ggml_backend_view_init(buffer, t);
+ }
  }
  }

data/ext/llama_cpp/src/ggml-backend-impl.h CHANGED
@@ -20,6 +20,9 @@ extern "C" {
  size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
  size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
  bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ // check if tensor data is in host memory
+ // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+ bool (*is_host) (ggml_backend_buffer_type_t buft);
  };

  struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
  typedef void * ggml_backend_buffer_context_t;

  struct ggml_backend_buffer_i {
- void (*free_buffer)(ggml_backend_buffer_t buffer);
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
  //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
- void * (*get_base) (ggml_backend_buffer_t buffer);
- void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ void * (*get_base) (ggml_backend_buffer_t buffer);
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
  // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
- void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
- void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
  };

  struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
  void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
  void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

- void (*synchronize) (ggml_backend_t backend);
+ void (*synchronize)(ggml_backend_t backend);

  // compute graph with a plan
  ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
data/ext/llama_cpp/src/ggml-backend.c CHANGED
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
  return buft->iface.supports_backend(buft, backend);
  }

+ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+ if (buft->iface.is_host) {
+ return buft->iface.is_host(buft);
+ }
+ return false;
+ }
+
  // backend buffer

  ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
  }

+ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ buffer->iface.clear(buffer, value);
+ }
+
+ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+ return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+ }
+
  ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
  return buffer->buft;
  }
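The two helpers added above give callers a backend-agnostic way to zero a buffer and to ask whether its memory is host-addressable. A minimal sketch of exercising them against the CPU backend, assuming the updated ggml-backend.h is on the include path:

```c
// Sketch only: allocate a small CPU-backend buffer, clear it, and query host residency.
#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024);

    ggml_backend_buffer_clear(buf, 0);                          // new: memset-style fill of the whole buffer
    printf("is_host: %d\n", ggml_backend_buffer_is_host(buf));  // new: true for the CPU buffer type

    ggml_backend_buffer_free(buf);
    return 0;
}
```

For buffer types that do not implement `is_host`, `ggml_backend_buft_is_host` conservatively reports false.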
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

  static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
  free(buffer->context);
- GGML_UNUSED(buffer);
  }

  static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
  GGML_UNUSED(buffer);
  }

+ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ memset(buffer->context, value, buffer->size);
+ }
+
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
  /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
  };

  // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
  /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
  /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .clear = */ ggml_backend_cpu_buffer_clear,
  };

  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
  GGML_UNUSED(buft);
  }

+ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return true;
+
+ GGML_UNUSED(buft);
+ }
+
  ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
- static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
  /* .iface = */ {
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
  /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
  },
  /* .context = */ NULL,
  };

- return &ggml_backend_buffer_type_cpu;
+ return &ggml_backend_cpu_buffer_type;
  }

+ #ifdef GGML_USE_CPU_HBM
+
+ // buffer type HBM
+
+ #include <hbwmalloc.h>
+
+ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ hbw_free(buffer->context);
+ }
+
+ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ //void * ptr = hbw_malloc(size);
+ void * ptr;
+ int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+ if (result != 0) {
+ fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+ return NULL;
+ }
+
+ // FIXME: this is a hack to avoid having to implement a new buffer type
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+ buffer->buft = buft;
+ buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+ return buffer;
+ }
+
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+ /* .iface = */ {
+ /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type_hbm;
+ }
+ #endif
+
  struct ggml_backend_cpu_context {
  int n_threads;
  void * work_data;
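The HBM buffer type above is only compiled when ggml is built with `GGML_USE_CPU_HBM` and linked against memkind's hbwmalloc. A small sketch of how a caller might select it under that assumption, falling back to the regular CPU buffer type otherwise:

```c
// Sketch only: prefer the high-bandwidth host buffer type when the build enables it,
// otherwise use ordinary host memory.
#include "ggml-backend.h"

ggml_backend_buffer_type_t pick_host_buffer_type(void) {
#ifdef GGML_USE_CPU_HBM
    return ggml_backend_cpu_hbm_buffer_type();  // high-bandwidth host memory (memkind)
#else
    return ggml_backend_cpu_buffer_type();      // regular host memory
#endif
}
```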
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

  cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
- cpu_plan->cgraph = *cgraph;
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy

  if (cpu_plan->cplan.work_size > 0) {
  cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
  // utils
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  GGML_ASSERT(tensor->buffer == NULL);
- GGML_ASSERT(tensor->data == NULL);
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
  GGML_ASSERT(tensor->view_src != NULL);
  GGML_ASSERT(tensor->view_src->buffer != NULL);
  GGML_ASSERT(tensor->view_src->data != NULL);
data/ext/llama_cpp/src/ggml-backend.h CHANGED
@@ -21,6 +21,7 @@ extern "C" {
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
  GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
  GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

  // buffer
  GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
  GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
  GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
  GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
  GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

  //
@@ -76,6 +79,10 @@ extern "C" {

  GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

+ #ifdef GGML_USE_CPU_HBM
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+ #endif
+
  //
  // Backend registry
  //