llama_cpp 0.12.3 → 0.12.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
335
335
  }
336
336
 
337
337
  size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
338
- return alloc->max_size;
338
+ // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
339
+ // to avoid this, we add a 10% margin to the buffer size
340
+ return alloc->max_size + alloc->max_size/10;
339
341
  }
340
342
 
341
343
  // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
776
778
  }
777
779
 
778
780
  // utils
779
- ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
780
- GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
781
-
782
- size_t alignment = ggml_backend_buft_get_alignment(buft);
783
-
784
- size_t nbytes = 0;
785
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
786
- if (t->data == NULL && t->view_src == NULL) {
787
- nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
788
- }
789
- }
790
-
791
- if (nbytes == 0) {
792
- // all the tensors in the context are already allocated
793
- #ifndef NDEBUG
794
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
795
- #endif
796
- return NULL;
797
- }
798
781
 
799
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
782
+ static bool alloc_tensor_range(struct ggml_context * ctx,
783
+ struct ggml_tensor * first, struct ggml_tensor * last,
784
+ ggml_backend_buffer_type_t buft, size_t size,
785
+ ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
786
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
800
787
  if (buffer == NULL) {
801
- // failed to allocate buffer
802
788
  #ifndef NDEBUG
803
- fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
789
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
804
790
  #endif
805
- return NULL;
791
+ for (size_t i = 0; i < *n_buffers; i++) {
792
+ ggml_backend_buffer_free(*buffers[i]);
793
+ }
794
+ free(*buffers);
795
+ return false;
806
796
  }
807
797
 
808
798
  ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
809
799
 
810
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
800
+ for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
811
801
  if (t->data == NULL) {
812
802
  if (t->view_src == NULL) {
813
803
  ggml_tallocr_alloc(tallocr, t);
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
824
814
 
825
815
  ggml_tallocr_free(tallocr);
826
816
 
817
+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
818
+ (*buffers)[(*n_buffers)++] = buffer;
819
+
820
+ return true;
821
+ }
822
+
823
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
824
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
825
+
826
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
827
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
828
+
829
+ ggml_backend_buffer_t * buffers = NULL;
830
+ size_t n_buffers = 0;
831
+
832
+ size_t cur_buf_size = 0;
833
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
834
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
835
+ size_t this_size = 0;
836
+ if (t->data == NULL && t->view_src == NULL) {
837
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
838
+ }
839
+
840
+ if (this_size > max_size) {
841
+ // tensor is too large to fit in a single buffer
842
+ fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
843
+ __func__, t->name,
844
+ ggml_backend_buft_name(buft),
845
+ this_size, max_size);
846
+ for (size_t i = 0; i < n_buffers; i++) {
847
+ ggml_backend_buffer_free(buffers[i]);
848
+ }
849
+ free(buffers);
850
+ return NULL;
851
+ }
852
+
853
+ if ((cur_buf_size + this_size) > max_size) {
854
+ // allocate tensors in the current buffer
855
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
856
+ return NULL;
857
+ }
858
+ first = t;
859
+ cur_buf_size = this_size;
860
+ } else {
861
+ cur_buf_size += this_size;
862
+ }
863
+ }
864
+
865
+ // allocate remaining tensors
866
+ if (cur_buf_size > 0) {
867
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
868
+ return NULL;
869
+ }
870
+ }
871
+
872
+ if (n_buffers == 0) {
873
+ // all the tensors in the context are already allocated
874
+ #ifndef NDEBUG
875
+ fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
876
+ #endif
877
+ return NULL;
878
+ }
879
+
880
+ ggml_backend_buffer_t buffer;
881
+ if (n_buffers == 1) {
882
+ buffer = buffers[0];
883
+ } else {
884
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
885
+ }
886
+ free(buffers);
827
887
  return buffer;
828
888
  }
829
889
 
@@ -19,6 +19,7 @@ extern "C" {
19
19
  const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
20
20
  ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
21
21
  size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
22
+ size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
22
23
  size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
23
24
  bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
24
25
  // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
63
64
  // do not use directly, use ggml_backend_tensor_copy instead
64
65
  bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
65
66
 
67
+ // buffer that contains a collection of buffers
68
+ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); // wraps n_buffers buffers in one buffer; the array is copied, the wrapped buffers are freed with the multi-buffer
69
+ GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); // true if buffer was created by ggml_backend_multi_buffer_alloc_buffer
70
+ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); // applies usage to every wrapped buffer; buffer must be a multi-buffer
71
+
66
72
  //
67
73
  // Backend
68
74
  //
@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
27
27
  return buft->iface.get_alignment(buft);
28
28
  }
29
29
 
30
+ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
31
+ // get_max_size is optional, defaults to SIZE_MAX
32
+ if (buft->iface.get_max_size) {
33
+ return buft->iface.get_max_size(buft);
34
+ }
35
+ return SIZE_MAX;
36
+ }
37
+
30
38
  GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
31
39
  // get_alloc_size is optional, defaults to ggml_nbytes
32
40
  if (buft->iface.get_alloc_size) {
33
- return buft->iface.get_alloc_size(buft, tensor);
41
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
42
+ assert(size >= ggml_nbytes(tensor));
43
+ return size;
34
44
  }
35
45
  return ggml_nbytes(tensor);
36
46
  }
@@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
55
65
  size_t size) {
56
66
  ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
57
67
 
58
- GGML_ASSERT(iface.get_base != NULL);
59
-
60
68
  (*buffer) = (struct ggml_backend_buffer) {
61
69
  /* .interface = */ iface,
62
70
  /* .buft = */ buft,
@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
106
114
  return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
107
115
  }
108
116
 
117
+ size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
118
+ return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
119
+ }
120
+
109
121
  size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
110
122
  return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
111
123
  }
@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
120
132
 
121
133
  void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
122
134
  buffer->usage = usage;
135
+
136
+ // FIXME: add a generic callback to the buffer interface
137
+ if (ggml_backend_buffer_is_multi_buffer(buffer)) {
138
+ ggml_backend_multi_buffer_set_usage(buffer, usage);
139
+ }
123
140
  }
124
141
 
125
142
  ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
169
186
  return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
170
187
  }
171
188
 
189
+ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
190
+ return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
191
+ }
192
+
172
193
  void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
173
194
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
174
195
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) {
337
358
  ggml_backend_cuda_reg_devices();
338
359
  #endif
339
360
 
361
+ #ifdef GGML_USE_SYCL
362
+ extern void ggml_backend_sycl_reg_devices(void);
363
+ ggml_backend_sycl_reg_devices();
364
+ #endif
365
+
340
366
  #ifdef GGML_USE_METAL
341
367
  extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
342
368
  extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
343
369
  ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
344
370
  #endif
371
+
372
+ #ifdef GGML_USE_VULKAN
373
+ extern GGML_CALL int ggml_backend_vk_reg_devices(void);
374
+ ggml_backend_vk_reg_devices();
375
+ #endif
376
+
377
+ #ifdef GGML_USE_KOMPUTE
378
+ extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
379
+ ggml_backend_kompute_reg_devices();
380
+ #endif
345
381
  }
346
382
 
347
383
  GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
545
581
  /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
546
582
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
547
583
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
584
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
548
585
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
549
586
  /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
550
587
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
600
637
  /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
601
638
  /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
602
639
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
640
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
603
641
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
604
642
  /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
605
643
  /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@@ -756,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
756
794
  GGML_UNUSED(user_data);
757
795
  }
758
796
 
797
+ // multi-buffer buffer
798
+
799
+ struct ggml_backend_multi_buffer_context { // context of a buffer that wraps a collection of buffers
800
+ ggml_backend_buffer_t * buffers; // heap-allocated array of the wrapped buffers; the array and its buffers are released in free_buffer
801
+ size_t n_buffers; // number of entries in buffers
802
+ };
803
+
804
+ typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t; // pointer handle, stored in the context field of the multi-buffer
805
+
806
+ GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
807
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
808
+
809
+ return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
810
+ }
811
+
812
+ GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
813
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
814
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
815
+ ggml_backend_buffer_free(ctx->buffers[i]);
816
+ }
817
+
818
+ free(ctx->buffers);
819
+ free(ctx);
820
+ }
821
+
822
+ GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
823
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
824
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
825
+ ggml_backend_buffer_clear(ctx->buffers[i], value);
826
+ }
827
+ }
828
+
829
+ static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
830
+ static struct ggml_backend_buffer_i multi_backend_buffer_i = {
831
+ /* .get_name = */ ggml_backend_multi_buffer_get_name,
832
+ /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
833
+ /* .get_base = */ NULL,
834
+ /* .init_tensor = */ NULL,
835
+ /* .set_tensor = */ NULL,
836
+ /* .get_tensor = */ NULL,
837
+ /* .cpy_tensor = */ NULL,
838
+ /* .clear = */ ggml_backend_multi_buffer_clear,
839
+ /* .reset = */ NULL,
840
+ };
841
+
842
+ return multi_backend_buffer_i;
843
+ }
844
+
845
+ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
846
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
847
+ ctx->n_buffers = n_buffers;
848
+ ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
849
+
850
+ size_t total_size = 0;
851
+ for (size_t i = 0; i < n_buffers; i++) {
852
+ ctx->buffers[i] = buffers[i];
853
+ total_size += ggml_backend_buffer_get_size(buffers[i]);
854
+ }
855
+
856
+ return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
857
+ }
858
+
859
+ GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
860
+ return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
861
+ }
862
+
863
+ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
864
+ GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
865
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
866
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
867
+ ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
868
+ }
869
+ }
870
+
759
871
 
760
872
  // scheduler
761
873
 
@@ -20,6 +20,7 @@ extern "C" {
20
20
  GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
21
21
  GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
22
22
  GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
23
+ GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
23
24
  GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
24
25
  GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
25
26
  GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
@@ -36,6 +37,7 @@ extern "C" {
36
37
  GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
37
38
  GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
38
39
  GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
40
+ GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
39
41
  GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
40
42
  GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
41
43
  GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
@@ -54,6 +56,7 @@ extern "C" {
54
56
  GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
55
57
  GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
56
58
  GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
59
+ GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
57
60
 
58
61
  GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
59
62
  GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);