cui-llama.rn 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +2 -0
  2. package/android/src/main/CMakeLists.txt +2 -2
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +39 -0
  5. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +5 -0
  6. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +5 -0
  7. package/cpp/common.cpp +36 -1
  8. package/cpp/common.h +5 -1
  9. package/cpp/ggml-aarch64.c +2 -11
  10. package/cpp/ggml-alloc.h +1 -1
  11. package/cpp/ggml-backend-impl.h +151 -78
  12. package/cpp/{ggml-backend.c → ggml-backend.cpp} +565 -269
  13. package/cpp/ggml-backend.h +147 -62
  14. package/cpp/ggml-impl.h +15 -0
  15. package/cpp/ggml-metal.h +8 -9
  16. package/cpp/ggml-metal.m +2428 -2111
  17. package/cpp/ggml-quants.c +2 -2
  18. package/cpp/ggml-quants.h +0 -4
  19. package/cpp/ggml.c +799 -1121
  20. package/cpp/ggml.h +79 -72
  21. package/cpp/llama-vocab.cpp +189 -106
  22. package/cpp/llama-vocab.h +18 -9
  23. package/cpp/llama.cpp +736 -341
  24. package/cpp/llama.h +9 -4
  25. package/cpp/unicode-data.cpp +6 -4
  26. package/cpp/unicode-data.h +4 -4
  27. package/cpp/unicode.cpp +14 -7
  28. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  29. package/lib/commonjs/index.js +4 -0
  30. package/lib/commonjs/index.js.map +1 -1
  31. package/lib/module/NativeRNLlama.js.map +1 -1
  32. package/lib/module/index.js +3 -0
  33. package/lib/module/index.js.map +1 -1
  34. package/lib/typescript/NativeRNLlama.d.ts +6 -0
  35. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  36. package/lib/typescript/index.d.ts +2 -1
  37. package/lib/typescript/index.d.ts.map +1 -1
  38. package/package.json +1 -1
  39. package/src/NativeRNLlama.ts +7 -0
  40. package/src/index.ts +5 -0
@@ -1,3 +1,13 @@
1
+ // Note: porting this file to C++ is a work in progress
2
+
3
+ #ifdef _WIN32
4
+ #define WIN32_LEAN_AND_MEAN
5
+ #ifndef NOMINMAX
6
+ # define NOMINMAX
7
+ #endif
8
+ #include <windows.h>
9
+ #endif
10
+
1
11
  #include "ggml-backend-impl.h"
2
12
  #include "ggml-alloc.h"
3
13
  #include "ggml-impl.h"
@@ -8,9 +18,14 @@
8
18
  #include <stdio.h>
9
19
  #include <stdlib.h>
10
20
  #include <string.h>
21
+ #include <string>
22
+ #include <vector>
11
23
 
24
+ #ifdef __APPLE__
25
+ #include <sys/types.h>
26
+ #include <sys/sysctl.h>
27
+ #endif
12
28
 
13
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
29
 
15
30
  // backend buffer type
16
31
 
@@ -18,7 +33,7 @@ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
18
33
  return buft->iface.get_name(buft);
19
34
  }
20
35
 
21
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
36
+ lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
22
37
  return buft->iface.alloc_buffer(buft, size);
23
38
  }
24
39
 
@@ -34,7 +49,7 @@ size_t lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_type_t buft) {
34
49
  return SIZE_MAX;
35
50
  }
36
51
 
37
- LM_GGML_CALL size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) {
52
+ size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) {
38
53
  // get_alloc_size is optional, defaults to lm_ggml_nbytes
39
54
  if (buft->iface.get_alloc_size) {
40
55
  size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -51,16 +66,18 @@ bool lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_type_t buft) {
51
66
  return false;
52
67
  }
53
68
 
54
- // backend buffer
69
+ lm_ggml_backend_dev_t lm_ggml_backend_buft_get_device(lm_ggml_backend_buffer_type_t buft) {
70
+ return buft->device;
71
+ }
55
72
 
56
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
57
- lm_ggml_backend_buffer_type_t buft,
58
- struct lm_ggml_backend_buffer_i iface,
59
- lm_ggml_backend_buffer_context_t context,
60
- size_t size) {
61
- lm_ggml_backend_buffer_t buffer = malloc(sizeof(struct lm_ggml_backend_buffer));
73
+ // backend buffer
62
74
 
63
- (*buffer) = (struct lm_ggml_backend_buffer) {
75
+ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
76
+ lm_ggml_backend_buffer_type_t buft,
77
+ struct lm_ggml_backend_buffer_i iface,
78
+ void * context,
79
+ size_t size) {
80
+ lm_ggml_backend_buffer_t buffer = new lm_ggml_backend_buffer {
64
81
  /* .interface = */ iface,
65
82
  /* .buft = */ buft,
66
83
  /* .context = */ context,
@@ -83,7 +100,7 @@ void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
83
100
  if (buffer->iface.free_buffer != NULL) {
84
101
  buffer->iface.free_buffer(buffer);
85
102
  }
86
- free(buffer);
103
+ delete buffer;
87
104
  }
88
105
 
89
106
  size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
@@ -98,14 +115,14 @@ void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
98
115
  return base;
99
116
  }
100
117
 
101
- LM_GGML_CALL void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
118
+ void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
102
119
  // init_tensor is optional
103
120
  if (buffer->iface.init_tensor) {
104
121
  buffer->iface.init_tensor(buffer, tensor);
105
122
  }
106
123
  }
107
124
 
108
- size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer) {
125
+ size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
109
126
  return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
110
127
  }
111
128
 
@@ -218,7 +235,7 @@ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm
218
235
  }
219
236
  }
220
237
 
221
- LM_GGML_CALL void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
238
+ void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
222
239
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
223
240
 
224
241
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
@@ -232,7 +249,7 @@ LM_GGML_CALL void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, con
232
249
  buf->iface.set_tensor(buf, tensor, data, offset, size);
233
250
  }
234
251
 
235
- LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
252
+ void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
236
253
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
237
254
 
238
255
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
@@ -246,7 +263,7 @@ LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tenso
246
263
  buf->iface.get_tensor(buf, tensor, data, offset, size);
247
264
  }
248
265
 
249
- LM_GGML_API LM_GGML_CALL void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
266
+ LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
250
267
  lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
251
268
 
252
269
  LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
@@ -299,20 +316,39 @@ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backen
299
316
  }
300
317
 
301
318
  bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
319
+ // helper to ease transition to device interface
320
+ if (backend->device) {
321
+ return lm_ggml_backend_dev_supports_op(backend->device, op);
322
+ }
323
+
302
324
  return backend->iface.supports_op(backend, op);
303
325
  }
304
326
 
305
327
  bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
328
+ // helper to ease transition to device interface
329
+ if (backend->device) {
330
+ return lm_ggml_backend_dev_supports_buft(backend->device, buft);
331
+ }
332
+
306
333
  return backend->iface.supports_buft(backend, buft);
307
334
  }
308
335
 
309
336
  bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
337
+ // helper to ease transition to device interface
338
+ if (backend->device) {
339
+ return lm_ggml_backend_dev_offload_op(backend->device, op);
340
+ }
341
+
310
342
  if (backend->iface.offload_op != NULL) {
311
343
  return backend->iface.offload_op(backend, op);
312
344
  }
313
345
  return false;
314
346
  }
315
347
 
348
+ lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
349
+ return backend->device;
350
+ }
351
+
316
352
  // backend copy
317
353
 
318
354
  static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) {
@@ -375,30 +411,31 @@ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_ba
375
411
 
376
412
  // events
377
413
 
378
- lm_ggml_backend_event_t lm_ggml_backend_event_new(lm_ggml_backend_t backend) {
379
- if (backend->iface.event_new == NULL) {
414
+ lm_ggml_backend_event_t lm_ggml_backend_event_new(lm_ggml_backend_dev_t device) {
415
+ // null device is allowed for the transition period to the device interface
416
+ if (device == NULL || device->iface.event_new == NULL) {
380
417
  return NULL;
381
418
  }
382
- return backend->iface.event_new(backend);
419
+ return device->iface.event_new(device);
383
420
  }
384
421
 
385
422
  void lm_ggml_backend_event_free(lm_ggml_backend_event_t event) {
386
423
  if (event == NULL) {
387
424
  return;
388
425
  }
389
- event->backend->iface.event_free(event);
426
+ event->device->iface.event_free(event->device, event);
390
427
  }
391
428
 
392
- void lm_ggml_backend_event_record(lm_ggml_backend_event_t event) {
393
- LM_GGML_ASSERT(event->backend->iface.event_record != NULL);
429
+ void lm_ggml_backend_event_record(lm_ggml_backend_event_t event, lm_ggml_backend_t backend) {
430
+ LM_GGML_ASSERT(backend->iface.event_record != NULL);
394
431
 
395
- event->backend->iface.event_record(event);
432
+ backend->iface.event_record(backend, event);
396
433
  }
397
434
 
398
435
  void lm_ggml_backend_event_synchronize(lm_ggml_backend_event_t event) {
399
- LM_GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
436
+ LM_GGML_ASSERT(event->device->iface.event_synchronize);
400
437
 
401
- event->backend->iface.event_synchronize(event);
438
+ event->device->iface.event_synchronize(event->device, event);
402
439
  }
403
440
 
404
441
  void lm_ggml_backend_event_wait(lm_ggml_backend_t backend, lm_ggml_backend_event_t event) {
@@ -407,170 +444,246 @@ void lm_ggml_backend_event_wait(lm_ggml_backend_t backend, lm_ggml_backend_event
407
444
  backend->iface.event_wait(backend, event);
408
445
  }
409
446
 
410
- // backend registry
447
+ // Backend device
411
448
 
412
- #define LM_GGML_REG_MAX_BACKENDS 64
449
+ const char * lm_ggml_backend_dev_name(lm_ggml_backend_dev_t device) {
450
+ return device->iface.get_name(device);
451
+ }
413
452
 
414
- struct lm_ggml_backend_reg {
415
- char name[128];
416
- lm_ggml_backend_init_fn init_fn;
417
- lm_ggml_backend_buffer_type_t default_buffer_type;
418
- void * user_data;
419
- };
453
+ const char * lm_ggml_backend_dev_description(lm_ggml_backend_dev_t device) {
454
+ return device->iface.get_description(device);
455
+ }
456
+
457
+ void lm_ggml_backend_dev_memory(lm_ggml_backend_dev_t device, size_t * free, size_t * total) {
458
+ device->iface.get_memory(device, free, total);
459
+ }
420
460
 
421
- static struct lm_ggml_backend_reg lm_ggml_backend_registry[LM_GGML_REG_MAX_BACKENDS];
422
- static size_t lm_ggml_backend_registry_count = 0;
461
+ enum lm_ggml_backend_dev_type lm_ggml_backend_dev_type(lm_ggml_backend_dev_t device) {
462
+ return device->iface.get_type(device);
463
+ }
423
464
 
424
- LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data);
465
+ void lm_ggml_backend_dev_get_props(lm_ggml_backend_dev_t device, struct lm_ggml_backend_dev_props * props) {
466
+ memset(props, 0, sizeof(*props));
467
+ device->iface.get_props(device, props);
468
+ }
425
469
 
426
- LM_GGML_CALL static void lm_ggml_backend_registry_init(void) {
427
- static bool initialized = false;
470
+ lm_ggml_backend_reg_t lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_t device) {
471
+ return device->reg;
472
+ }
428
473
 
429
- if (initialized) {
430
- return;
474
+ lm_ggml_backend_t lm_ggml_backend_dev_init(lm_ggml_backend_dev_t device, const char * params) {
475
+ return device->iface.init_backend(device, params);
476
+ }
477
+
478
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_dev_buffer_type(lm_ggml_backend_dev_t device) {
479
+ return device->iface.get_buffer_type(device);
480
+ }
481
+
482
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_dev_host_buffer_type(lm_ggml_backend_dev_t device) {
483
+ if (device->iface.get_host_buffer_type == NULL) {
484
+ return NULL;
431
485
  }
432
486
 
433
- initialized = true;
487
+ return device->iface.get_host_buffer_type(device);
488
+ }
434
489
 
435
- lm_ggml_backend_register("CPU", lm_ggml_backend_reg_cpu_init, lm_ggml_backend_cpu_buffer_type(), NULL);
490
+ lm_ggml_backend_buffer_t lm_ggml_backend_dev_buffer_from_host_ptr(lm_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
491
+ return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
492
+ }
436
493
 
437
- // add forward decls here to avoid including the backend headers
438
- #ifdef LM_GGML_USE_CUDA
439
- extern LM_GGML_CALL void lm_ggml_backend_cuda_reg_devices(void);
440
- lm_ggml_backend_cuda_reg_devices();
441
- #endif
494
+ bool lm_ggml_backend_dev_supports_op(lm_ggml_backend_dev_t device, const struct lm_ggml_tensor * op) {
495
+ return device->iface.supports_op(device, op);
496
+ }
497
+
498
+ bool lm_ggml_backend_dev_supports_buft(lm_ggml_backend_dev_t device, lm_ggml_backend_buffer_type_t buft) {
499
+ return device->iface.supports_buft(device, buft);
500
+ }
501
+
502
+ bool lm_ggml_backend_dev_offload_op(lm_ggml_backend_dev_t device, const struct lm_ggml_tensor * op) {
503
+ if (device->iface.offload_op != NULL) {
504
+ return device->iface.offload_op(device, op);
505
+ }
506
+
507
+ return false;
508
+ }
442
509
 
443
- #ifdef LM_GGML_USE_SYCL
444
- extern void lm_ggml_backend_sycl_reg_devices(void);
445
- lm_ggml_backend_sycl_reg_devices();
510
+ // Backend (reg)
511
+
512
+ const char * lm_ggml_backend_reg_name(lm_ggml_backend_reg_t reg) {
513
+ return reg->iface.get_name(reg);
514
+ }
515
+
516
+ size_t lm_ggml_backend_reg_dev_count(lm_ggml_backend_reg_t reg) {
517
+ return reg->iface.get_device_count(reg);
518
+ }
519
+
520
+ lm_ggml_backend_dev_t lm_ggml_backend_reg_dev_get(lm_ggml_backend_reg_t reg, size_t index) {
521
+ return reg->iface.get_device(reg, index);
522
+ }
523
+
524
+ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
525
+ if (!reg->iface.get_proc_address) {
526
+ return NULL;
527
+ }
528
+ return reg->iface.get_proc_address(reg, name);
529
+ }
530
+
531
+ // Backend registry
532
+
533
+ #ifdef LM_GGML_USE_CUDA
534
+ #include "ggml-cuda.h"
446
535
  #endif
447
536
 
448
537
  #ifdef LM_GGML_USE_METAL
449
- extern LM_GGML_CALL lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data);
450
- extern LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void);
451
- lm_ggml_backend_register("Metal", lm_ggml_backend_reg_metal_init, lm_ggml_backend_metal_buffer_type(), NULL);
538
+ #include "ggml-metal.h"
452
539
  #endif
453
540
 
454
- #ifdef LM_GGML_USE_VULKAN
455
- extern LM_GGML_CALL int lm_ggml_backend_vk_reg_devices(void);
456
- lm_ggml_backend_vk_reg_devices();
541
+ #ifdef LM_GGML_USE_BLAS
542
+ #include "ggml-blas.h"
457
543
  #endif
458
544
 
459
- #ifdef LM_GGML_USE_KOMPUTE
460
- extern LM_GGML_CALL void lm_ggml_backend_kompute_reg_devices(void);
461
- lm_ggml_backend_kompute_reg_devices();
462
- #endif
545
+ struct lm_ggml_backend_registry {
546
+ std::vector<lm_ggml_backend_reg_t> backends;
547
+ std::vector<lm_ggml_backend_dev_t> devices;
463
548
 
464
- #ifdef LM_GGML_USE_CANN
465
- extern LM_GGML_CALL int lm_ggml_backend_cann_reg_devices(void);
466
- lm_ggml_backend_cann_reg_devices();
549
+ lm_ggml_backend_registry() {
550
+ #ifdef LM_GGML_USE_CUDA
551
+ register_backend(lm_ggml_backend_cuda_reg());
552
+ #endif
553
+ #ifdef LM_GGML_USE_METAL
554
+ register_backend(lm_ggml_backend_metal_reg());
555
+ #endif
556
+ #ifdef LM_GGML_USE_BLAS
557
+ register_backend(lm_ggml_backend_blas_reg());
467
558
  #endif
468
- }
469
-
470
- LM_GGML_CALL void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
471
- LM_GGML_ASSERT(lm_ggml_backend_registry_count < LM_GGML_REG_MAX_BACKENDS);
472
559
 
473
- size_t id = lm_ggml_backend_registry_count;
560
+ // TODO: sycl, vulkan, kompute, cann
474
561
 
475
- lm_ggml_backend_registry[id] = (struct lm_ggml_backend_reg) {
476
- /* .name = */ {0},
477
- /* .fn = */ init_fn,
478
- /* .default_buffer_type = */ default_buffer_type,
479
- /* .user_data = */ user_data,
480
- };
562
+ register_backend(lm_ggml_backend_cpu_reg());
563
+ }
481
564
 
482
- snprintf(lm_ggml_backend_registry[id].name, sizeof(lm_ggml_backend_registry[id].name), "%s", name);
565
+ void register_backend(lm_ggml_backend_reg_t reg) {
566
+ #ifndef NDEBUG
567
+ fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
568
+ __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
569
+ #endif
570
+ backends.push_back(reg);
571
+ for (size_t i = 0; i < lm_ggml_backend_reg_dev_count(reg); i++) {
572
+ register_device(lm_ggml_backend_reg_dev_get(reg, i));
573
+ }
574
+ }
483
575
 
576
+ void register_device(lm_ggml_backend_dev_t device) {
484
577
  #ifndef NDEBUG
485
- fprintf(stderr, "%s: registered backend %s\n", __func__, name);
578
+ fprintf(stderr, "%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
486
579
  #endif
580
+ devices.push_back(device);
581
+ }
582
+ };
487
583
 
488
- lm_ggml_backend_registry_count++;
584
+ static lm_ggml_backend_registry & get_reg() {
585
+ static lm_ggml_backend_registry reg;
586
+ return reg;
489
587
  }
490
588
 
491
- size_t lm_ggml_backend_reg_get_count(void) {
492
- lm_ggml_backend_registry_init();
589
+ // Internal API
590
+ void lm_ggml_backend_register(lm_ggml_backend_reg_t reg) {
591
+ get_reg().register_backend(reg);
592
+ }
493
593
 
494
- return lm_ggml_backend_registry_count;
594
+ void lm_ggml_backend_device_register(lm_ggml_backend_dev_t device) {
595
+ get_reg().register_device(device);
495
596
  }
496
597
 
497
- size_t lm_ggml_backend_reg_find_by_name(const char * name) {
498
- lm_ggml_backend_registry_init();
598
+ // Backend (reg) enumeration
599
+ size_t lm_ggml_backend_reg_count() {
600
+ return get_reg().backends.size();
601
+ }
499
602
 
500
- for (size_t i = 0; i < lm_ggml_backend_registry_count; i++) {
501
- // TODO: case insensitive in a portable way
502
- if (strcmp(lm_ggml_backend_registry[i].name, name) == 0) {
503
- return i;
603
+ lm_ggml_backend_reg_t lm_ggml_backend_reg_get(size_t index) {
604
+ LM_GGML_ASSERT(index < lm_ggml_backend_reg_count());
605
+ return get_reg().backends[index];
606
+ }
607
+
608
+ lm_ggml_backend_reg_t lm_ggml_backend_reg_by_name(const char * name) {
609
+ for (size_t i = 0; i < lm_ggml_backend_reg_count(); i++) {
610
+ lm_ggml_backend_reg_t reg = lm_ggml_backend_reg_get(i);
611
+ if (strcmp(lm_ggml_backend_reg_name(reg), name) == 0) {
612
+ return reg;
504
613
  }
505
614
  }
506
-
507
- // not found
508
- return SIZE_MAX;
615
+ return NULL;
509
616
  }
510
617
 
511
- // init from backend:params string
512
- lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend_str) {
513
- lm_ggml_backend_registry_init();
514
-
515
- const char * params = strchr(backend_str, ':');
516
- char backend_name[128];
517
- if (params == NULL) {
518
- snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
519
- params = "";
520
- } else {
521
- snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
522
- params++;
523
- }
618
+ // Device enumeration
619
+ size_t lm_ggml_backend_dev_count() {
620
+ return get_reg().devices.size();
621
+ }
524
622
 
525
- size_t backend_i = lm_ggml_backend_reg_find_by_name(backend_name);
623
+ lm_ggml_backend_dev_t lm_ggml_backend_dev_get(size_t index) {
624
+ LM_GGML_ASSERT(index < lm_ggml_backend_dev_count());
625
+ return get_reg().devices[index];
626
+ }
526
627
 
527
- if (backend_i == SIZE_MAX) {
528
- fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
529
- return NULL;
628
+ lm_ggml_backend_dev_t lm_ggml_backend_dev_by_name(const char * name) {
629
+ for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
630
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
631
+ if (strcmp(lm_ggml_backend_dev_name(dev), name) == 0) {
632
+ return dev;
633
+ }
530
634
  }
531
-
532
- return lm_ggml_backend_reg_init_backend(backend_i, params);
635
+ return NULL;
533
636
  }
534
637
 
535
- const char * lm_ggml_backend_reg_get_name(size_t i) {
536
- lm_ggml_backend_registry_init();
537
-
538
- LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
539
- return lm_ggml_backend_registry[i].name;
638
+ lm_ggml_backend_dev_t lm_ggml_backend_dev_by_type(enum lm_ggml_backend_dev_type type) {
639
+ for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
640
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
641
+ if (lm_ggml_backend_dev_type(dev) == type) {
642
+ return dev;
643
+ }
644
+ }
645
+ return NULL;
540
646
  }
541
647
 
542
- lm_ggml_backend_t lm_ggml_backend_reg_init_backend(size_t i, const char * params) {
543
- lm_ggml_backend_registry_init();
544
-
545
- LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
546
- return lm_ggml_backend_registry[i].init_fn(params, lm_ggml_backend_registry[i].user_data);
648
+ // Convenience functions
649
+ lm_ggml_backend_t lm_ggml_backend_init_by_name(const char * name, const char * params) {
650
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_name(name);
651
+ if (!dev) {
652
+ return NULL;
653
+ }
654
+ return lm_ggml_backend_dev_init(dev, params);
547
655
  }
548
656
 
549
- lm_ggml_backend_buffer_type_t lm_ggml_backend_reg_get_default_buffer_type(size_t i) {
550
- lm_ggml_backend_registry_init();
551
-
552
- LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
553
- return lm_ggml_backend_registry[i].default_buffer_type;
657
+ lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type type, const char * params) {
658
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(type);
659
+ if (!dev) {
660
+ return NULL;
661
+ }
662
+ return lm_ggml_backend_dev_init(dev, params);
554
663
  }
555
664
 
556
- lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
557
- lm_ggml_backend_registry_init();
558
-
559
- LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
560
- return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_registry[i].default_buffer_type, size);
665
+ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
666
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
667
+ if (!dev) {
668
+ dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
669
+ }
670
+ if (!dev) {
671
+ return NULL;
672
+ }
673
+ return lm_ggml_backend_dev_init(dev, NULL);
561
674
  }
562
675
 
563
676
  // backend CPU
564
677
 
565
678
  static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
566
679
 
567
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_name(lm_ggml_backend_buffer_t buffer) {
680
+ static const char * lm_ggml_backend_cpu_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
568
681
  return "CPU";
569
682
 
570
683
  LM_GGML_UNUSED(buffer);
571
684
  }
572
685
 
573
- LM_GGML_CALL static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
686
+ static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
574
687
  uintptr_t data = (uintptr_t)buffer->context;
575
688
 
576
689
  // align the buffer
@@ -581,29 +694,29 @@ LM_GGML_CALL static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_b
581
694
  return (void *)data;
582
695
  }
583
696
 
584
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
697
+ static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
585
698
  free(buffer->context);
586
699
  }
587
700
 
588
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
701
+ static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
589
702
  memset((char *)tensor->data + offset, value, size);
590
703
 
591
704
  LM_GGML_UNUSED(buffer);
592
705
  }
593
706
 
594
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
707
+ static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
595
708
  memcpy((char *)tensor->data + offset, data, size);
596
709
 
597
710
  LM_GGML_UNUSED(buffer);
598
711
  }
599
712
 
600
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
713
+ static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
601
714
  memcpy(data, (const char *)tensor->data + offset, size);
602
715
 
603
716
  LM_GGML_UNUSED(buffer);
604
717
  }
605
718
 
606
- LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
719
+ static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
607
720
  if (lm_ggml_backend_buffer_is_host(src->buffer)) {
608
721
  memcpy(dst->data, src->data, lm_ggml_nbytes(src));
609
722
  return true;
@@ -613,12 +726,12 @@ LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_b
613
726
  LM_GGML_UNUSED(buffer);
614
727
  }
615
728
 
616
- LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
729
+ static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
617
730
  memset(buffer->context, value, buffer->size);
618
731
  }
619
732
 
620
- static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = {
621
- /* .get_name = */ lm_ggml_backend_cpu_buffer_name,
733
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
734
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
622
735
  /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
623
736
  /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
624
737
  /* .init_tensor = */ NULL, // no initialization required
@@ -630,9 +743,8 @@ static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = {
630
743
  /* .reset = */ NULL,
631
744
  };
632
745
 
633
- // for buffers from ptr, free is not called
634
- static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
635
- /* .get_name = */ lm_ggml_backend_cpu_buffer_name,
746
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
747
+ /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
636
748
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
637
749
  /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
638
750
  /* .init_tensor = */ NULL, // no initialization required
@@ -644,13 +756,13 @@ static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
644
756
  /* .reset = */ NULL,
645
757
  };
646
758
 
647
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
759
+ static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
648
760
  return "CPU";
649
761
 
650
762
  LM_GGML_UNUSED(buft);
651
763
  }
652
764
 
653
- LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
765
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
654
766
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
655
767
  void * data = malloc(size); // TODO: use LM_GGML_ALIGNED_MALLOC (move to ggml-impl.h)
656
768
  if (data == NULL) {
@@ -658,24 +770,24 @@ LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_all
658
770
  return NULL;
659
771
  }
660
772
 
661
- return lm_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
773
+ return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
662
774
  }
663
775
 
664
- LM_GGML_CALL static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
776
+ static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
665
777
  return TENSOR_ALIGNMENT;
666
778
 
667
779
  LM_GGML_UNUSED(buft);
668
780
  }
669
781
 
670
- LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
782
+ static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
671
783
  return true;
672
784
 
673
785
  LM_GGML_UNUSED(buft);
674
786
  }
675
787
 
676
- LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
788
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
677
789
  static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
678
- /* .iface = */ {
790
+ /* .iface = */ {
679
791
  /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
680
792
  /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
681
793
  /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
@@ -683,6 +795,7 @@ LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void)
683
795
  /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
684
796
  /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
685
797
  },
798
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
686
799
  /* .context = */ NULL,
687
800
  };
688
801
 
@@ -695,23 +808,23 @@ LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void)
695
808
 
696
809
  #include <hbwmalloc.h>
697
810
 
698
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
811
+ static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
699
812
  return "CPU_HBM";
700
813
 
701
814
  LM_GGML_UNUSED(buft);
702
815
  }
703
816
 
704
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
817
+ static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
705
818
  return "CPU_HBM";
706
819
 
707
820
  LM_GGML_UNUSED(buf);
708
821
  }
709
822
 
710
- LM_GGML_CALL static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
823
+ static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
711
824
  hbw_free(buffer->context);
712
825
  }
713
826
 
714
- LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
827
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
715
828
  //void * ptr = hbw_malloc(size);
716
829
  void * ptr;
717
830
  int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -749,27 +862,27 @@ struct lm_ggml_backend_cpu_context {
749
862
  int n_threads;
750
863
  lm_ggml_threadpool_t threadpool;
751
864
 
752
- void * work_data;
865
+ uint8_t * work_data;
753
866
  size_t work_size;
754
867
 
755
868
  lm_ggml_abort_callback abort_callback;
756
869
  void * abort_callback_data;
757
870
  };
758
871
 
759
- LM_GGML_CALL static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) {
872
+ static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
760
873
  return "CPU";
761
874
 
762
875
  LM_GGML_UNUSED(backend);
763
876
  }
764
877
 
765
- LM_GGML_CALL static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
878
+ static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
766
879
  struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
767
- free(cpu_ctx->work_data);
768
- free(cpu_ctx);
769
- free(backend);
880
+ delete[] cpu_ctx->work_data;
881
+ delete cpu_ctx;
882
+ delete backend;
770
883
  }
771
884
 
772
- LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
885
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
773
886
  return lm_ggml_backend_cpu_buffer_type();
774
887
 
775
888
  LM_GGML_UNUSED(backend);
@@ -780,18 +893,18 @@ struct lm_ggml_backend_plan_cpu {
780
893
  struct lm_ggml_cgraph cgraph;
781
894
  };
782
895
 
783
- LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
896
+ static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
784
897
  struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
785
898
 
786
- struct lm_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct lm_ggml_backend_plan_cpu));
899
+ struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
787
900
 
788
901
  cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
789
902
  cpu_plan->cgraph = *cgraph; // FIXME: deep copy
790
903
 
791
904
  if (cpu_plan->cplan.work_size > 0) {
792
- cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
905
+ cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
793
906
  if (cpu_plan->cplan.work_data == NULL) {
794
- free(cpu_plan);
907
+ delete cpu_plan;
795
908
  return NULL;
796
909
  }
797
910
  }
@@ -802,16 +915,16 @@ LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_
802
915
  return cpu_plan;
803
916
  }
804
917
 
805
- LM_GGML_CALL static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
918
+ static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
806
919
  struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
807
920
 
808
- free(cpu_plan->cplan.work_data);
809
- free(cpu_plan);
921
+ delete[] cpu_plan->cplan.work_data;
922
+ delete cpu_plan;
810
923
 
811
924
  LM_GGML_UNUSED(backend);
812
925
  }
813
926
 
814
- LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
927
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
815
928
  struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
816
929
 
817
930
  return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -819,21 +932,21 @@ LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(l
819
932
  LM_GGML_UNUSED(backend);
820
933
  }
821
934
 
822
- LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
935
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
823
936
  struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
824
937
 
825
938
  struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
826
939
 
827
940
  if (cpu_ctx->work_size < cplan.work_size) {
828
- free(cpu_ctx->work_data);
829
- cpu_ctx->work_data = malloc(cplan.work_size);
941
+ delete[] cpu_ctx->work_data;
942
+ cpu_ctx->work_data = new uint8_t[cplan.work_size];
830
943
  if (cpu_ctx->work_data == NULL) {
831
944
  cpu_ctx->work_size = 0;
832
945
  return LM_GGML_STATUS_ALLOC_FAILED;
833
946
  }
834
947
  cpu_ctx->work_size = cplan.work_size;
835
948
  }
836
- cplan.work_data = cpu_ctx->work_data;
949
+ cplan.work_data = (uint8_t *)cpu_ctx->work_data;
837
950
 
838
951
  cplan.abort_callback = cpu_ctx->abort_callback;
839
952
  cplan.abort_callback_data = cpu_ctx->abort_callback_data;
@@ -841,35 +954,8 @@ LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggm
841
954
  return lm_ggml_graph_compute(cgraph, &cplan);
842
955
  }
843
956
 
844
- LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
845
- switch (op->op) {
846
- case LM_GGML_OP_CPY:
847
- return
848
- op->type != LM_GGML_TYPE_IQ2_XXS &&
849
- op->type != LM_GGML_TYPE_IQ2_XS &&
850
- op->type != LM_GGML_TYPE_IQ1_S &&
851
- op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
852
- case LM_GGML_OP_MUL_MAT:
853
- return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
854
- case LM_GGML_OP_ROPE_BACK:
855
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
856
- case LM_GGML_OP_IM2COL_BACK:
857
- return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
858
- default:
859
- return true;
860
- }
861
-
862
- LM_GGML_UNUSED(backend);
863
- }
864
-
865
- LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
866
- return lm_ggml_backend_buft_is_host(buft);
867
-
868
- LM_GGML_UNUSED(backend);
869
- }
870
-
871
- static struct lm_ggml_backend_i cpu_backend_i = {
872
- /* .get_name = */ lm_ggml_backend_cpu_name,
957
+ static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
958
+ /* .get_name = */ lm_ggml_backend_cpu_get_name,
873
959
  /* .free = */ lm_ggml_backend_cpu_free,
874
960
  /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type,
875
961
  /* .set_tensor_async = */ NULL,
@@ -881,14 +967,11 @@ static struct lm_ggml_backend_i cpu_backend_i = {
881
967
  /* .graph_plan_update = */ NULL,
882
968
  /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
883
969
  /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
884
- /* .supports_op = */ lm_ggml_backend_cpu_supports_op,
885
- /* .supports_buft = */ lm_ggml_backend_cpu_supports_buft,
970
+ /* .supports_op = */ NULL,
971
+ /* .supports_buft = */ NULL,
886
972
  /* .offload_op = */ NULL,
887
- /* .event_new = */ NULL,
888
- /* .event_free = */ NULL,
889
973
  /* .event_record = */ NULL,
890
974
  /* .event_wait = */ NULL,
891
- /* .event_synchronize = */ NULL,
892
975
  };
893
976
 
894
977
  static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
@@ -897,7 +980,7 @@ static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
897
980
  }
898
981
 
899
982
  lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
900
- struct lm_ggml_backend_cpu_context * ctx = malloc(sizeof(struct lm_ggml_backend_cpu_context));
983
+ struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
901
984
  if (ctx == NULL) {
902
985
  return NULL;
903
986
  }
@@ -909,21 +992,22 @@ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
909
992
  ctx->abort_callback = NULL;
910
993
  ctx->abort_callback_data = NULL;
911
994
 
912
- lm_ggml_backend_t cpu_backend = malloc(sizeof(struct lm_ggml_backend));
995
+ lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
996
+ /* .guid = */ lm_ggml_backend_cpu_guid(),
997
+ /* .interface = */ lm_ggml_backend_cpu_i,
998
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
999
+ /* .context = */ ctx,
1000
+ };
1001
+
913
1002
  if (cpu_backend == NULL) {
914
- free(ctx);
1003
+ delete ctx;
915
1004
  return NULL;
916
1005
  }
917
1006
 
918
- *cpu_backend = (struct lm_ggml_backend) {
919
- /* .guid = */ lm_ggml_backend_cpu_guid(),
920
- /* .interface = */ cpu_backend_i,
921
- /* .context = */ ctx
922
- };
923
1007
  return cpu_backend;
924
1008
  }
925
1009
 
926
- LM_GGML_CALL bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
1010
+ bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
927
1011
  return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
928
1012
  }
929
1013
 
@@ -954,16 +1038,233 @@ void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_gg
954
1038
  ctx->abort_callback_data = abort_callback_data;
955
1039
  }
956
1040
 
957
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
1041
+ lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
958
1042
  LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
959
- return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
1043
+ return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
960
1044
  }
961
1045
 
962
- LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data) {
1046
+ ////////////////////////
1047
+
1048
+ struct lm_ggml_backend_cpu_device_context {
1049
+ std::string description = "CPU";
1050
+
1051
+ lm_ggml_backend_cpu_device_context() {
1052
+ #ifdef __APPLE__
1053
+ size_t len = 0;
1054
+ if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
1055
+ description.resize(len);
1056
+ sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
1057
+ }
1058
+ #elif defined(__linux__)
1059
+ FILE * f = fopen("/proc/cpuinfo", "r");
1060
+ if (f) {
1061
+ char buf[1024];
1062
+ while (fgets(buf, sizeof(buf), f)) {
1063
+ if (strncmp(buf, "model name", 10) == 0) {
1064
+ char * p = strchr(buf, ':');
1065
+ if (p) {
1066
+ p++;
1067
+ while (std::isspace(*p)) {
1068
+ p++;
1069
+ }
1070
+ while (std::isspace(p[strlen(p) - 1])) {
1071
+ p[strlen(p) - 1] = '\0';
1072
+ }
1073
+ description = p;
1074
+ break;
1075
+ }
1076
+ }
1077
+ }
1078
+ fclose(f);
1079
+ }
1080
+ #elif defined(_WIN32)
1081
+ HKEY hKey;
1082
+ if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
1083
+ TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
1084
+ 0,
1085
+ KEY_READ,
1086
+ &hKey) == ERROR_SUCCESS) {
1087
+ DWORD cpu_brand_size = 0;
1088
+ if (RegQueryValueExA(hKey,
1089
+ TEXT("ProcessorNameString"),
1090
+ NULL,
1091
+ NULL,
1092
+ NULL,
1093
+ &cpu_brand_size) == ERROR_SUCCESS) {
1094
+ description.resize(cpu_brand_size);
1095
+ if (RegQueryValueExA(hKey,
1096
+ TEXT("ProcessorNameString"),
1097
+ NULL,
1098
+ NULL,
1099
+ (LPBYTE)&description[0], // NOLINT
1100
+ &cpu_brand_size) == ERROR_SUCCESS) {
1101
+ if (description.find('\0') != std::string::npos) {
1102
+ description.resize(description.find('\0'));
1103
+ }
1104
+ }
1105
+ }
1106
+ RegCloseKey(hKey);
1107
+ }
1108
+ #endif
1109
+ }
1110
+ };
1111
+
1112
+ static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
1113
+ return "CPU";
1114
+
1115
+ LM_GGML_UNUSED(dev);
1116
+ }
1117
+
1118
+ static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
1119
+ struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
1120
+
1121
+ return ctx->description.c_str();
1122
+ }
1123
+
1124
+ static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
1125
+ // TODO
1126
+ *free = 0;
1127
+ *total = 0;
1128
+
1129
+ LM_GGML_UNUSED(dev);
1130
+ }
1131
+
1132
+ static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
1133
+ return LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
1134
+
1135
+ LM_GGML_UNUSED(dev);
1136
+ }
1137
+
1138
+ static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
1139
+ props->name = lm_ggml_backend_cpu_device_get_name(dev);
1140
+ props->description = lm_ggml_backend_cpu_device_get_description(dev);
1141
+ props->type = lm_ggml_backend_cpu_device_get_type(dev);
1142
+ lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
1143
+ props->caps = {
1144
+ /* .async = */ false,
1145
+ /* .host_buffer = */ false,
1146
+ /* .buffer_from_host_ptr = */ true,
1147
+ /* .events = */ false,
1148
+ };
1149
+ }
1150
+
1151
+ static lm_ggml_backend_t lm_ggml_backend_cpu_device_init(lm_ggml_backend_dev_t dev, const char * params) {
963
1152
  return lm_ggml_backend_cpu_init();
964
1153
 
1154
+ LM_GGML_UNUSED(dev);
965
1155
  LM_GGML_UNUSED(params);
966
- LM_GGML_UNUSED(user_data);
1156
+ }
1157
+
1158
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
1159
+ return lm_ggml_backend_cpu_buffer_type();
1160
+
1161
+ LM_GGML_UNUSED(dev);
1162
+ }
1163
+
1164
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
1165
+ return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
1166
+
1167
+ LM_GGML_UNUSED(dev);
1168
+ LM_GGML_UNUSED(max_tensor_size);
1169
+ }
1170
+
1171
+ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
1172
+ switch (op->op) {
1173
+ case LM_GGML_OP_CPY:
1174
+ return
1175
+ op->type != LM_GGML_TYPE_IQ2_XXS &&
1176
+ op->type != LM_GGML_TYPE_IQ2_XS &&
1177
+ op->type != LM_GGML_TYPE_IQ1_S &&
1178
+ op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
1179
+ case LM_GGML_OP_MUL_MAT:
1180
+ return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
1181
+ case LM_GGML_OP_ROPE_BACK:
1182
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
1183
+ case LM_GGML_OP_IM2COL_BACK:
1184
+ return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
1185
+ case LM_GGML_OP_OUT_PROD:
1186
+ return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
1187
+ default:
1188
+ return true;
1189
+ }
1190
+
1191
+ LM_GGML_UNUSED(dev);
1192
+ }
1193
+
1194
+ static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
1195
+ return lm_ggml_backend_buft_is_host(buft);
1196
+
1197
+ LM_GGML_UNUSED(dev);
1198
+ }
1199
+
1200
+ static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
1201
+ /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
1202
+ /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
1203
+ /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
1204
+ /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
1205
+ /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
1206
+ /* .init_backend = */ lm_ggml_backend_cpu_device_init,
1207
+ /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
1208
+ /* .get_host_buffer_type = */ NULL,
1209
+ /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_ptr,
1210
+ /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
1211
+ /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
1212
+ /* .offload_op = */ NULL,
1213
+ /* .event_new = */ NULL,
1214
+ /* .event_free = */ NULL,
1215
+ /* .event_synchronize = */ NULL,
1216
+ };
1217
+
1218
+ ////////////////////////
1219
+
1220
+ static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
1221
+ return "CPU";
1222
+
1223
+ LM_GGML_UNUSED(reg);
1224
+ }
1225
+
1226
+ static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
1227
+ return 1;
1228
+
1229
+ LM_GGML_UNUSED(reg);
1230
+ }
1231
+
1232
+ static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
1233
+ LM_GGML_ASSERT(index == 0);
1234
+
1235
+ static lm_ggml_backend_cpu_device_context ctx;
1236
+ static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
1237
+ /* .iface = */ lm_ggml_backend_cpu_device_i,
1238
+ /* .reg = */ reg,
1239
+ /* .context = */ &ctx,
1240
+ };
1241
+
1242
+ return &lm_ggml_backend_cpu_device;
1243
+ }
1244
+
1245
+ static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
1246
+ if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
1247
+ return (void *)lm_ggml_backend_cpu_set_n_threads;
1248
+ }
1249
+ return NULL;
1250
+
1251
+ LM_GGML_UNUSED(reg);
1252
+ }
1253
+
1254
+ static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
1255
+ /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
1256
+ /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
1257
+ /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
1258
+ /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
1259
+ };
1260
+
1261
+ lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
1262
+ static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
1263
+ /* .iface = */ lm_ggml_backend_cpu_reg_i,
1264
+ /* .context = */ NULL,
1265
+ };
1266
+
1267
+ return &lm_ggml_backend_cpu_reg;
967
1268
  }
968
1269
 
969
1270
  // multi-buffer buffer
@@ -973,16 +1274,14 @@ struct lm_ggml_backend_multi_buffer_context {
973
1274
  size_t n_buffers;
974
1275
  };
975
1276
 
976
- typedef struct lm_ggml_backend_multi_buffer_context * lm_ggml_backend_multi_buffer_context_t;
977
-
978
- LM_GGML_CALL static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
979
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
1277
+ static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
1278
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
980
1279
 
981
1280
  return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
982
1281
  }
983
1282
 
984
- LM_GGML_CALL static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
985
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
1283
+ static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
1284
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
986
1285
  for (size_t i = 0; i < ctx->n_buffers; i++) {
987
1286
  lm_ggml_backend_buffer_free(ctx->buffers[i]);
988
1287
  }
@@ -991,32 +1290,28 @@ LM_GGML_CALL static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backen
991
1290
  free(ctx);
992
1291
  }
993
1292
 
994
- LM_GGML_CALL static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
995
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
1293
+ static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
1294
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
996
1295
  for (size_t i = 0; i < ctx->n_buffers; i++) {
997
1296
  lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
998
1297
  }
999
1298
  }
1000
1299
 
1001
- static struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_context_interface(void) {
1002
- static struct lm_ggml_backend_buffer_i multi_backend_buffer_i = {
1003
- /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
1004
- /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
1005
- /* .get_base = */ NULL,
1006
- /* .init_tensor = */ NULL,
1007
- /* .memset_tensor = */ NULL,
1008
- /* .set_tensor = */ NULL,
1009
- /* .get_tensor = */ NULL,
1010
- /* .cpy_tensor = */ NULL,
1011
- /* .clear = */ lm_ggml_backend_multi_buffer_clear,
1012
- /* .reset = */ NULL,
1013
- };
1014
-
1015
- return multi_backend_buffer_i;
1016
- }
1300
+ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
1301
+ /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
1302
+ /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
1303
+ /* .get_base = */ NULL,
1304
+ /* .init_tensor = */ NULL,
1305
+ /* .memset_tensor = */ NULL,
1306
+ /* .set_tensor = */ NULL,
1307
+ /* .get_tensor = */ NULL,
1308
+ /* .cpy_tensor = */ NULL,
1309
+ /* .clear = */ lm_ggml_backend_multi_buffer_clear,
1310
+ /* .reset = */ NULL,
1311
+ };
1017
1312
 
1018
- LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
1019
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
1313
+ lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
1314
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
1020
1315
  ctx->n_buffers = n_buffers;
1021
1316
  ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
1022
1317
 
@@ -1028,16 +1323,16 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(
  total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
  }
 
- return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+ return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
  }
 
- LM_GGML_CALL bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
+ bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
  return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name;
  }
 
- LM_GGML_CALL void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
+ void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
  LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
- lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
+ lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
  for (size_t i = 0; i < ctx->n_buffers; i++) {
  lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
  }
@@ -1592,7 +1887,8 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
  i_split++;
  if (i_split >= sched->splits_capacity) {
  sched->splits_capacity *= 2;
- sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
+ sched->splits = (lm_ggml_backend_sched_split *)
+ realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
  LM_GGML_ASSERT(sched->splits != NULL);
  }
  split = &sched->splits[i_split];
@@ -1678,11 +1974,11 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
  sched->prev_leaf_backend_ids = tmp;
  }
 
- int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
+ int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
  if (sched->graph.size < graph_size) {
  sched->graph.size = graph_size;
- sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
- sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
+ sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
+ sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
  LM_GGML_ASSERT(sched->graph.nodes != NULL);
  LM_GGML_ASSERT(sched->graph.leafs != NULL);
  }
@@ -1881,7 +2177,7 @@ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_
  // record the event of this copy
  if (split->n_inputs > 0) {
  if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
- lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+ lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
  }
  }
  }
@@ -1901,7 +2197,7 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
  LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
  LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
- struct lm_ggml_backend_sched * sched = calloc(1, sizeof(struct lm_ggml_backend_sched));
+ struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
 
  sched->debug = getenv("LM_GGML_SCHED_DEBUG") != NULL;
  sched->n_backends = n_backends;
@@ -1910,21 +2206,21 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
  // initialize hash table
  // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
  sched->hash_set = lm_ggml_hash_set_new(graph_size);
- sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
- sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
+ sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+ sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
 
  const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
  const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
- sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
- sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
+ sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+ sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+ sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+ sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
  sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
- sched->context_buffer = malloc(sched->context_buffer_size);
+ sched->context_buffer = (char *) malloc(sched->context_buffer_size);
 
  const int initial_splits_capacity = 16;
- sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
+ sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
  sched->splits_capacity = initial_splits_capacity;
 
  for (int b = 0; b < n_backends; b++) {
@@ -1933,7 +2229,7 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
  LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
  if (sched->n_copies > 1) {
  for (int c = 0; c < sched->n_copies; c++) {
- sched->events[b][c] = lm_ggml_backend_event_new(backends[b]);
+ sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
  }
  }
  }
@@ -2169,8 +2465,8 @@ static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm
 
  struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
  struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
- struct lm_ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
- bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
+ struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+ bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
 
  struct lm_ggml_init_params params = {
  /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
@@ -2188,7 +2484,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
  free(node_init);
  lm_ggml_free(ctx_allocated);
  lm_ggml_free(ctx_unallocated);
- return (struct lm_ggml_backend_graph_copy) {
+ return {
  /* .buffer = */ NULL,
  /* .ctx_allocated = */ NULL,
  /* .ctx_unallocated = */ NULL,
@@ -2211,7 +2507,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
  free(node_init);
  lm_ggml_free(ctx_allocated);
  lm_ggml_free(ctx_unallocated);
- return (struct lm_ggml_backend_graph_copy) {
+ return {
  /* .buffer = */ NULL,
  /* .ctx_allocated = */ NULL,
  /* .ctx_unallocated = */ NULL,
@@ -2240,7 +2536,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
  free(node_copies);
  free(node_init);
 
- return (struct lm_ggml_backend_graph_copy) {
+ return {
  /* .buffer = */ buffer,
  /* .ctx_allocated = */ ctx_allocated,
  /* .ctx_unallocated = */ ctx_unallocated,