llama_cpp 0.12.2 → 0.12.4

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }
 
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    // get_max_size is optional, defaults to SIZE_MAX
+    if (buft->iface.get_max_size) {
+        return buft->iface.get_max_size(buft);
+    }
+    return SIZE_MAX;
+}
+
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
     }
     return ggml_nbytes(tensor);
 }
@@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                size_t size) {
     ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
 
-    GGML_ASSERT(iface.get_base != NULL);
-
     (*buffer) = (struct ggml_backend_buffer) {
         /* .interface = */ iface,
         /* .buft      = */ buft,
@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
     return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }
 
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
 
 void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
     buffer->usage = usage;
+
+    // FIXME: add a generic callback to the buffer interface
+    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+        ggml_backend_multi_buffer_set_usage(buffer, usage);
+    }
 }
 
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
     return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
 }
 
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
 void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
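
Together, ggml_backend_buft_get_max_size, ggml_backend_buffer_get_max_size and ggml_backend_get_max_size expose the optional per-buffer-type allocation limit, falling back to SIZE_MAX when the backend does not report one. A minimal sketch of how a caller might consult the backend-level wrapper before allocating one large buffer; the helper name and the check itself are assumptions for illustration, not part of this diff:

    #include "ggml.h"
    #include "ggml-backend.h"

    // hypothetical helper: decide whether a single buffer can hold n_bytes on this backend
    static bool fits_in_single_buffer(ggml_backend_t backend, size_t n_bytes) {
        // SIZE_MAX unless the backend's default buffer type reports a limit
        // (e.g. a maximum allocation size on some GPU APIs)
        const size_t max_size = ggml_backend_get_max_size(backend);
        return n_bytes <= max_size;
    }
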
@@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_cuda_reg_devices();
 #endif
 
+#ifdef GGML_USE_SYCL
+    extern void ggml_backend_sycl_reg_devices(void);
+    ggml_backend_sycl_reg_devices();
+#endif
+
 #ifdef GGML_USE_METAL
     extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
     extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
+
+#ifdef GGML_USE_VULKAN
+    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+    ggml_backend_vk_reg_devices();
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+    ggml_backend_kompute_reg_devices();
+#endif
 }
 
 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -692,6 +730,8 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
+        case GGML_OP_CPY:
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -754,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
     GGML_UNUSED(user_data);
 }
 
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+        /* .get_name    = */ ggml_backend_multi_buffer_get_name,
+        /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+        /* .get_base    = */ NULL,
+        /* .init_tensor = */ NULL,
+        /* .set_tensor  = */ NULL,
+        /* .get_tensor  = */ NULL,
+        /* .cpy_tensor  = */ NULL,
+        /* .clear       = */ ggml_backend_multi_buffer_clear,
+        /* .reset       = */ NULL,
+    };
+
+    return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+
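
The multi-buffer added here groups several backend buffers behind a single ggml_backend_buffer_t handle and forwards free, clear and usage changes to each child. A hedged sketch of composing two existing buffers follows; buf_a and buf_b are assumed to be previously allocated buffers of the same buffer type, the helper name is illustrative, and the declaration of ggml_backend_multi_buffer_alloc_buffer is assumed to be visible to the caller (it is not part of the public header hunks shown in this diff):

    #include "ggml-backend.h"

    // buf_a and buf_b: previously allocated buffers sharing a buffer type (assumption)
    static ggml_backend_buffer_t make_combined_buffer(ggml_backend_buffer_t buf_a, ggml_backend_buffer_t buf_b) {
        ggml_backend_buffer_t parts[2] = { buf_a, buf_b };

        // the multi-buffer reports the summed size and takes ownership of the children:
        // freeing the combined buffer frees buf_a and buf_b as well
        ggml_backend_buffer_t combined = ggml_backend_multi_buffer_alloc_buffer(parts, 2);

        // setting the usage on the wrapper is forwarded to every child buffer
        ggml_backend_buffer_set_usage(combined, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        return combined;
    }
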
 
 
 // scheduler
 
@@ -802,6 +916,9 @@ struct ggml_backend_sched {
     __attribute__((aligned(GGML_MEM_ALIGN)))
 #endif
     char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
 };
 
 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
@@ -1186,6 +1303,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             ggml_tallocr_t src_allocr = node_allocr(src);
             GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
             if (src_allocr != node_allocr) {
+                // create a copy of the input in the split's backend
+                size_t id = hash_id(src);
+                if (sched->node_copies[id][cur_backend_id] == NULL) {
+                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                    sched->node_copies[id][cur_backend_id] = tensor_copy;
+                    node_allocr(tensor_copy) = cur_allocr;
+                    SET_CAUSE(tensor_copy, "4.cpy");
+
+                    int n_inputs = sched->splits[cur_split].n_inputs++;
+                    GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                    sched->splits[cur_split].inputs[n_inputs] = src;
+                }
+                node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
                 // check if the input is already in the split
                 bool found = false;
                 for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1201,19 +1336,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                     sched->splits[cur_split].inputs[n_inputs] = src;
                 }
-
-                // create a copy of the input in the split's backend
-                size_t id = hash_id(src);
-                if (sched->node_copies[id][cur_backend_id] == NULL) {
-                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                    sched->node_copies[id][cur_backend_id] = tensor_copy;
-                    node_allocr(tensor_copy) = cur_allocr;
-                    SET_CAUSE(tensor_copy, "4.cpy");
-                }
-                node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
             }
         }
     }
@@ -1324,9 +1447,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_graph_dump_dot(split->graph, NULL, split_filename);
 #endif
 
+
         uint64_t compute_start_us = ggml_time_us();
-        ggml_backend_graph_compute(split_backend, &split->graph);
-        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        if (!sched->callback_eval) {
+            ggml_backend_graph_compute(split_backend, &split->graph);
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        } else {
+            // similar to ggml_backend_compare_graph_backend
+            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                struct ggml_tensor * t = split->graph.nodes[j0];
+
+                // check if the user needs data from this node
+                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+                int j1 = j0;
+
+                // determine the range [j0, j1] of nodes that can be computed together
+                while (!need && j1 < split->graph.n_nodes - 1) {
+                    t = split->graph.nodes[++j1];
+                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+                }
+
+                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+                ggml_backend_graph_compute(split_backend, &gv);
+
+                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                    break;
+                }
+
+                j0 = j1;
+            }
+        }
         uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
     }
@@ -1431,6 +1583,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }
 
+
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+    sched->callback_eval = callback;
+    sched->callback_eval_user_data = user_data;
+}
+
 int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
@@ -20,6 +20,7 @@ extern "C" {
     GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
@@ -36,6 +37,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
     GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
@@ -54,6 +56,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
     GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
     GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
 
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
@@ -148,6 +151,14 @@ extern "C" {
     struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;
 
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
     // Initialize a backend scheduler
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
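
The callback semantics documented above let an application observe intermediate tensors without forcing the scheduler to break the graph at every node: nodes are batched until the callback claims one. A minimal sketch of such a callback, assuming an existing scheduler handle `sched`; the function name observe_soft_max and the logging are illustrative, not part of the API:

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // called synchronously during graph compute; ask == true asks whether this node should be observed
    static bool observe_soft_max(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;

        if (ask) {
            // claim only softmax nodes; everything else stays batched
            return t->op == GGML_OP_SOFT_MAX;
        }

        // ask == false: the node has been computed; its contents could now be copied
        // to the host with ggml_backend_tensor_get() if the data is needed
        printf("observed %s (%s)\n", t->name, ggml_op_name(t->op));

        return true; // returning false cancels the remaining graph compute
    }

    // registration, e.g. right after creating the scheduler:
    //     ggml_backend_sched_set_eval_callback(sched, observe_soft_max, NULL);
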
@@ -168,6 +179,9 @@ extern "C" {
     // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
 
     //
     // Utils