cui-llama.rn 1.2.6 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +26 -6
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +228 -40
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/amx/amx.cpp +196 -0
  9. package/cpp/amx/amx.h +20 -0
  10. package/cpp/amx/common.h +101 -0
  11. package/cpp/amx/mmq.cpp +2524 -0
  12. package/cpp/amx/mmq.h +16 -0
  13. package/cpp/common.cpp +118 -251
  14. package/cpp/common.h +53 -30
  15. package/cpp/ggml-aarch64.c +46 -3395
  16. package/cpp/ggml-aarch64.h +0 -20
  17. package/cpp/ggml-alloc.c +6 -8
  18. package/cpp/ggml-backend-impl.h +33 -11
  19. package/cpp/ggml-backend-reg.cpp +423 -0
  20. package/cpp/ggml-backend.cpp +14 -676
  21. package/cpp/ggml-backend.h +46 -9
  22. package/cpp/ggml-common.h +6 -0
  23. package/cpp/ggml-cpu-aarch64.c +3823 -0
  24. package/cpp/ggml-cpu-aarch64.h +32 -0
  25. package/cpp/ggml-cpu-impl.h +14 -242
  26. package/cpp/ggml-cpu-quants.c +10835 -0
  27. package/cpp/ggml-cpu-quants.h +63 -0
  28. package/cpp/ggml-cpu.c +13971 -13720
  29. package/cpp/ggml-cpu.cpp +715 -0
  30. package/cpp/ggml-cpu.h +65 -63
  31. package/cpp/ggml-impl.h +285 -25
  32. package/cpp/ggml-metal.h +8 -8
  33. package/cpp/ggml-metal.m +1221 -728
  34. package/cpp/ggml-quants.c +189 -10681
  35. package/cpp/ggml-quants.h +78 -125
  36. package/cpp/ggml-threading.cpp +12 -0
  37. package/cpp/ggml-threading.h +12 -0
  38. package/cpp/ggml.c +688 -1460
  39. package/cpp/ggml.h +58 -244
  40. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  41. package/cpp/json.hpp +24766 -24766
  42. package/cpp/llama-sampling.cpp +5 -2
  43. package/cpp/llama.cpp +409 -123
  44. package/cpp/llama.h +8 -4
  45. package/cpp/rn-llama.hpp +89 -25
  46. package/cpp/sampling.cpp +42 -3
  47. package/cpp/sampling.h +22 -1
  48. package/cpp/sgemm.cpp +608 -0
  49. package/cpp/speculative.cpp +270 -0
  50. package/cpp/speculative.h +28 -0
  51. package/cpp/unicode.cpp +11 -0
  52. package/ios/RNLlama.mm +43 -20
  53. package/ios/RNLlamaContext.h +9 -3
  54. package/ios/RNLlamaContext.mm +146 -33
  55. package/jest/mock.js +0 -1
  56. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  57. package/lib/commonjs/grammar.js +4 -2
  58. package/lib/commonjs/grammar.js.map +1 -1
  59. package/lib/commonjs/index.js +52 -15
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/module/NativeRNLlama.js.map +1 -1
  62. package/lib/module/grammar.js +2 -1
  63. package/lib/module/grammar.js.map +1 -1
  64. package/lib/module/index.js +51 -15
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/typescript/NativeRNLlama.d.ts +122 -8
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  68. package/lib/typescript/grammar.d.ts +5 -6
  69. package/lib/typescript/grammar.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +15 -6
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +2 -1
  73. package/src/NativeRNLlama.ts +135 -13
  74. package/src/grammar.ts +10 -8
  75. package/src/index.ts +104 -28
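
The bulk of this release is structural: the backend registry and the CPU backend move out of ggml-backend.cpp into the new ggml-backend-reg.cpp and ggml-cpu.cpp / ggml-cpu-*.c translation units, CPU quantization and AArch64 kernels get their own files (ggml-cpu-quants.c, ggml-cpu-aarch64.c), and new Intel AMX (cpp/amx/*) and speculative-decoding (speculative.cpp/h) sources are added. The hunks reproduced below come from package/cpp/ggml-backend.cpp (the +14 -676 entry in the list above).

The registry API that moves keeps its public shape. As orientation, here is an editor's sketch (illustrative only, not code from the package) of how a consumer enumerates devices with it; the function names are taken from the removed block quoted later in this diff, while the main() wrapper is an assumption:

    // Sketch: enumerate registered devices, then pick a backend the same way
    // lm_ggml_backend_init_best does (GPU first, CPU as fallback).
    #include <stdio.h>
    #include "ggml-backend.h"

    int main(void) {
        for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
            lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
            printf("device %zu: %s (%s)\n", i,
                   lm_ggml_backend_dev_name(dev),
                   lm_ggml_backend_dev_description(dev));
        }
        lm_ggml_backend_t backend = lm_ggml_backend_init_best();
        if (backend == NULL) {
            return 1;
        }
        lm_ggml_backend_free(backend);
        return 0;
    }
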
@@ -252,6 +252,7 @@ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm
 }
 
 void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    LM_GGML_ASSERT(tensor);
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
@@ -266,6 +267,7 @@ void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * dat
 }
 
 void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    LM_GGML_ASSERT(tensor);
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
@@ -279,7 +281,7 @@ void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * dat
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
-LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     if (size == 0) {
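
Editor's note on the three hunks above: lm_ggml_backend_tensor_set and lm_ggml_backend_tensor_get now assert on a NULL tensor instead of crashing on the tensor->view_src dereference, and lm_ggml_backend_tensor_memset drops a spurious LM_GGML_API qualifier from its definition. A minimal sketch of the call pattern the new asserts guard (illustrative; assumes t is a valid F32 tensor with at least four elements, allocated in some backend buffer):

    // Round-trip host data through a backend tensor; both calls now abort
    // with a clear assertion failure if t is NULL.
    static void roundtrip(struct lm_ggml_tensor * t) {
        float in[4]  = {1.0f, 2.0f, 3.0f, 4.0f};
        float out[4] = {0};
        lm_ggml_backend_tensor_set(t, in,  0, sizeof(in));
        lm_ggml_backend_tensor_get(t, out, 0, sizeof(out));
    }
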
@@ -525,197 +527,6 @@ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const cha
     return reg->iface.get_proc_address(reg, name);
 }
 
-// Backend registry
-
-#ifdef LM_GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef LM_GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef LM_GGML_USE_SYCL
-#include "ggml-sycl.h"
-#endif
-
-#ifdef LM_GGML_USE_VULKAN
-#include "ggml-vulkan.h"
-#endif
-
-#ifdef LM_GGML_USE_BLAS
-#include "ggml-blas.h"
-#endif
-
-#ifdef LM_GGML_USE_RPC
-#include "ggml-rpc.h"
-#endif
-
-#ifndef __AMX_INT8__
-#undef LM_GGML_USE_AMX
-#endif
-
-#ifdef LM_GGML_USE_AMX
-# include "ggml-amx.h"
-#endif
-
-#ifdef LM_GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
-
-#ifdef LM_GGML_USE_KOMPUTE
-#include "ggml-kompute.h"
-#endif
-
-#include "ggml-cpu.h"
-
-struct lm_ggml_backend_registry {
-    std::vector<lm_ggml_backend_reg_t> backends;
-    std::vector<lm_ggml_backend_dev_t> devices;
-
-    lm_ggml_backend_registry() {
-#ifdef LM_GGML_USE_CUDA
-        register_backend(lm_ggml_backend_cuda_reg());
-#endif
-#ifdef LM_GGML_USE_METAL
-        register_backend(lm_ggml_backend_metal_reg());
-#endif
-#ifdef LM_GGML_USE_SYCL
-        register_backend(lm_ggml_backend_sycl_reg());
-#endif
-#ifdef LM_GGML_USE_VULKAN
-        register_backend(lm_ggml_backend_vk_reg());
-#endif
-#ifdef LM_GGML_USE_CANN
-        register_backend(lm_ggml_backend_cann_reg());
-#endif
-#ifdef LM_GGML_USE_BLAS
-        register_backend(lm_ggml_backend_blas_reg());
-#endif
-#ifdef LM_GGML_USE_RPC
-        register_backend(lm_ggml_backend_rpc_reg());
-#endif
-#ifdef LM_GGML_USE_AMX
-        register_backend(lm_ggml_backend_amx_reg());
-#endif
-#ifdef LM_GGML_USE_KOMPUTE
-        register_backend(lm_ggml_backend_kompute_reg());
-#endif
-
-        register_backend(lm_ggml_backend_cpu_reg());
-    }
-
-    void register_backend(lm_ggml_backend_reg_t reg) {
-#ifndef NDEBUG
-        LM_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
-            __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
-#endif
-        backends.push_back(reg);
-        for (size_t i = 0; i < lm_ggml_backend_reg_dev_count(reg); i++) {
-            register_device(lm_ggml_backend_reg_dev_get(reg, i));
-        }
-    }
-
-    void register_device(lm_ggml_backend_dev_t device) {
-#ifndef NDEBUG
-        LM_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
-#endif
-        devices.push_back(device);
-    }
-};
-
-static lm_ggml_backend_registry & get_reg() {
-    static lm_ggml_backend_registry reg;
-    return reg;
-}
-
-// Internal API
-void lm_ggml_backend_register(lm_ggml_backend_reg_t reg) {
-    get_reg().register_backend(reg);
-}
-
-void lm_ggml_backend_device_register(lm_ggml_backend_dev_t device) {
-    get_reg().register_device(device);
-}
-
-// Backend (reg) enumeration
-size_t lm_ggml_backend_reg_count() {
-    return get_reg().backends.size();
-}
-
-lm_ggml_backend_reg_t lm_ggml_backend_reg_get(size_t index) {
-    LM_GGML_ASSERT(index < lm_ggml_backend_reg_count());
-    return get_reg().backends[index];
-}
-
-lm_ggml_backend_reg_t lm_ggml_backend_reg_by_name(const char * name) {
-    for (size_t i = 0; i < lm_ggml_backend_reg_count(); i++) {
-        lm_ggml_backend_reg_t reg = lm_ggml_backend_reg_get(i);
-        if (strcmp(lm_ggml_backend_reg_name(reg), name) == 0) {
-            return reg;
-        }
-    }
-    return NULL;
-}
-
-// Device enumeration
-size_t lm_ggml_backend_dev_count() {
-    return get_reg().devices.size();
-}
-
-lm_ggml_backend_dev_t lm_ggml_backend_dev_get(size_t index) {
-    LM_GGML_ASSERT(index < lm_ggml_backend_dev_count());
-    return get_reg().devices[index];
-}
-
-lm_ggml_backend_dev_t lm_ggml_backend_dev_by_name(const char * name) {
-    for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
-        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
-        if (strcmp(lm_ggml_backend_dev_name(dev), name) == 0) {
-            return dev;
-        }
-    }
-    return NULL;
-}
-
-lm_ggml_backend_dev_t lm_ggml_backend_dev_by_type(enum lm_ggml_backend_dev_type type) {
-    for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
-        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
-        if (lm_ggml_backend_dev_type(dev) == type) {
-            return dev;
-        }
-    }
-    return NULL;
-}
-
-// Convenience functions
-lm_ggml_backend_t lm_ggml_backend_init_by_name(const char * name, const char * params) {
-    lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_name(name);
-    if (!dev) {
-        return NULL;
-    }
-    return lm_ggml_backend_dev_init(dev, params);
-}
-
-lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type type, const char * params) {
-    lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(type);
-    if (!dev) {
-        return NULL;
-    }
-    return lm_ggml_backend_dev_init(dev, params);
-}
-
-lm_ggml_backend_t lm_ggml_backend_init_best(void) {
-    lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU);
-    if (!dev) {
-        dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
-    }
-    if (!dev) {
-        return NULL;
-    }
-    return lm_ggml_backend_dev_init(dev, NULL);
-}
-
 // multi-buffer buffer
 
 struct lm_ggml_backend_multi_buffer_context {
@@ -880,7 +691,7 @@ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_gg
 }
 
 static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
-    lm_ggml_backend_buffer_t buffer = tensor->buffer;
+    lm_ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
@@ -913,8 +724,6 @@ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBU
 
 // returns the backend that should be used for the node based on the current locations
 static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
-    // TODO: use supports_op to check if the backend supports the op
-
     // assign pre-allocated nodes to their backend
     int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
@@ -933,7 +742,8 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
 
     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
-        LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+        lm_ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        LM_GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, lm_ggml_backend_buffer_name(buffer), lm_ggml_op_name(tensor->op));
    }
 
     // graph input
@@ -1640,7 +1450,7 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
         bool parallel) {
     LM_GGML_ASSERT(n_backends > 0);
     LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
-    LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+    LM_GGML_ASSERT(lm_ggml_backend_dev_type(lm_ggml_backend_get_device(backends[n_backends - 1])) == LM_GGML_BACKEND_DEVICE_TYPE_CPU);
 
     struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
 
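Editor's note: the assertion above relaxes the scheduler's fallback requirement; the last backend no longer has to be the CPU backend object recognized by lm_ggml_backend_is_cpu (removed from this file further down), only a backend whose device reports LM_GGML_BACKEND_DEVICE_TYPE_CPU. A hedged sketch of a conforming call, assuming the usual lm_ggml_backend_sched_new(backends, bufts, n_backends, graph_size, parallel) parameter order, of which only the trailing bool parallel is visible in the hunk; sizes are illustrative:

    // One CPU backend, serving both as compute backend and as the required
    // CPU-typed fallback in the last slot.
    lm_ggml_backend_t cpu = lm_ggml_backend_cpu_init();
    lm_ggml_backend_t backends[1] = { cpu };
    lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(
        backends, NULL /* default buffer types */, 1,
        2048 /* graph size */, false /* parallel */);
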
@@ -1729,12 +1539,13 @@ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml
 
     lm_ggml_backend_sched_split_graph(sched, measure_graph);
 
+    lm_ggml_backend_sched_synchronize(sched);
+
     if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
     lm_ggml_backend_sched_reset(sched);
-    lm_ggml_backend_sched_synchronize(sched);
 
     return true;
 }
@@ -2036,17 +1847,6 @@ bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_b
     return true;
 }
 
-
-
-#include "ggml-backend.h"
-#include "ggml-backend-impl.h"
-#include "ggml-cpu.h"
-#include "ggml-impl.h"
-#include <cctype>
-#include <string>
-
-// ggml-backend interface
-
 // CPU backend - buffer
 
 static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
@@ -2120,7 +1920,9 @@ static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr
     /* .reset = */ NULL,
 };
 
-// CPU backend - buffer type
+// CPU backend buffer type
+
+// this buffer type is defined here to make it available to all backends
 
 static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
     return "CPU";
@@ -2161,7 +1963,7 @@ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
             /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
            /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
         },
-        /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
+        /* .device = */ NULL, // FIXME lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
         /* .context = */ NULL,
     };
 
@@ -2184,478 +1986,14 @@ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_from_ptr_type(vo
             /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
             /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
         },
-        /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
+        /* .device = */ NULL, // FIXME lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
         /* .context = */ NULL,
     };
 
     return &lm_ggml_backend_cpu_buffer_type;
 }
 
-#ifdef LM_GGML_USE_CPU_HBM
-
-// buffer type HBM
-
-#include <hbwmalloc.h>
-
-static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
-    return "CPU_HBM";
-
-    LM_GGML_UNUSED(buft);
-}
-
-static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
-    hbw_free(buffer->context);
-}
-
-static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr;
-    int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
-    if (result != 0) {
-        LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
-        return NULL;
-    }
-
-    lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
-
-    return buffer;
-}
-
-lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
-    static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface = */ {
-            /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
-            /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-            /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
-            /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context = */ NULL,
-    };
-
-    return &lm_ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
-
-static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_get_extra_bufts(lm_ggml_backend_dev_t device) {
-    static lm_ggml_backend_buffer_type_t bufts[] = {
-#ifdef LM_GGML_USE_CPU_HBM
-        lm_ggml_backend_cpu_hbm_buffer_type(),
-#endif
-        NULL
-    };
-
-    return bufts;
-
-    LM_GGML_UNUSED(device);
-}
-
-// CPU backend - backend (stream)
-
-struct lm_ggml_backend_cpu_context {
-    int n_threads;
-    lm_ggml_threadpool_t threadpool;
-
-    uint8_t * work_data;
-    size_t work_size;
-
-    lm_ggml_abort_callback abort_callback;
-    void * abort_callback_data;
-};
-
-static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
-    return "CPU";
-
-    LM_GGML_UNUSED(backend);
-}
-
-static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
-    struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
-    delete[] cpu_ctx->work_data;
-    delete cpu_ctx;
-    delete backend;
-}
-
-struct lm_ggml_backend_plan_cpu {
-    struct lm_ggml_cplan cplan;
-    struct lm_ggml_cgraph cgraph;
-};
-
-static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
-    struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
-
-    struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
-
-    cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
-
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
-        if (cpu_plan->cplan.work_data == NULL) {
-            delete cpu_plan;
-            return NULL;
-        }
-    }
-
-    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return cpu_plan;
-}
-
-static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
-    struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
-
-    delete[] cpu_plan->cplan.work_data;
-    delete cpu_plan;
-
-    LM_GGML_UNUSED(backend);
-}
-
-static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
-    struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
-
-    return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
-
-    LM_GGML_UNUSED(backend);
-}
-
-static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
-    struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
-
-    struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
-
-    if (cpu_ctx->work_size < cplan.work_size) {
-        delete[] cpu_ctx->work_data;
-        cpu_ctx->work_data = new uint8_t[cplan.work_size];
-        if (cpu_ctx->work_data == NULL) {
-            cpu_ctx->work_size = 0;
-            return LM_GGML_STATUS_ALLOC_FAILED;
-        }
-        cpu_ctx->work_size = cplan.work_size;
-    }
-    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
-
-    cplan.abort_callback = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
-    return lm_ggml_graph_compute(cgraph, &cplan);
-}
-
-static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
-    /* .get_name = */ lm_ggml_backend_cpu_get_name,
-    /* .free = */ lm_ggml_backend_cpu_free,
-    /* .set_tensor_async = */ NULL,
-    /* .get_tensor_async = */ NULL,
-    /* .cpy_tensor_async = */ NULL,
-    /* .synchronize = */ NULL,
-    /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
-    /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
-    /* .graph_plan_update = */ NULL,
-    /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
-    /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
-    /* .event_record = */ NULL,
-    /* .event_wait = */ NULL,
-};
-
-static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
-    static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
-    return &guid;
-}
-
-lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
-    // initialize CPU backend now to avoid slowing the first graph computation
-    lm_ggml_cpu_init();
-
-    struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
-    if (ctx == NULL) {
-        return NULL;
-    }
-
-    ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
-    ctx->threadpool = NULL;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
-    ctx->abort_callback = NULL;
-    ctx->abort_callback_data = NULL;
-
-    lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
-        /* .guid = */ lm_ggml_backend_cpu_guid(),
-        /* .interface = */ lm_ggml_backend_cpu_i,
-        /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
-        /* .context = */ ctx,
-    };
-
-    if (cpu_backend == NULL) {
-        delete ctx;
-        return NULL;
-    }
-
-    return cpu_backend;
-}
-
-bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
-    return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
-}
-
-void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
-    LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
-
-    struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->n_threads = n_threads;
-}
-
-void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
-    LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
-
-    struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
-
-    if (ctx->threadpool && ctx->threadpool != threadpool) {
-        // already had a different threadpool, pause/suspend it before switching
-        lm_ggml_threadpool_pause(ctx->threadpool);
-    }
-    ctx->threadpool = threadpool;
-}
-
-void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
-    LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
-
-    struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->abort_callback = abort_callback;
-    ctx->abort_callback_data = abort_callback_data;
-}
-
 lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
     return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_from_ptr_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
 }
-
-// CPU backend - device
-
-struct lm_ggml_backend_cpu_device_context {
-    std::string description = "CPU";
-
-    lm_ggml_backend_cpu_device_context() {
-#ifdef __APPLE__
-        size_t len = 0;
-        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
-            description.resize(len);
-            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
-        }
-#elif defined(__linux__)
-        FILE * f = fopen("/proc/cpuinfo", "r");
-        if (f) {
-            char buf[1024];
-            while (fgets(buf, sizeof(buf), f)) {
-                if (strncmp(buf, "model name", 10) == 0) {
-                    char * p = strchr(buf, ':');
-                    if (p) {
-                        p++;
-                        while (std::isspace(*p)) {
-                            p++;
-                        }
-                        while (std::isspace(p[strlen(p) - 1])) {
-                            p[strlen(p) - 1] = '\0';
-                        }
-                        description = p;
-                        break;
-                    }
-                }
-            }
-            fclose(f);
-        }
-#elif defined(_WIN32)
-        HKEY hKey;
-        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                        0,
-                        KEY_READ,
-                        &hKey) == ERROR_SUCCESS) {
-            DWORD cpu_brand_size = 0;
-            if (RegQueryValueExA(hKey,
-                                TEXT("ProcessorNameString"),
-                                NULL,
-                                NULL,
-                                NULL,
-                                &cpu_brand_size) == ERROR_SUCCESS) {
-                description.resize(cpu_brand_size);
-                if (RegQueryValueExA(hKey,
-                                    TEXT("ProcessorNameString"),
-                                    NULL,
-                                    NULL,
-                                    (LPBYTE)&description[0], // NOLINT
-                                    &cpu_brand_size) == ERROR_SUCCESS) {
-                    if (description.find('\0') != std::string::npos) {
-                        description.resize(description.find('\0'));
-                    }
-                }
-            }
-            RegCloseKey(hKey);
-        }
-#endif
-    }
-};
-
-static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
-    return "CPU";
-
-    LM_GGML_UNUSED(dev);
-}
-
-static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
-    struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
-
-    return ctx->description.c_str();
-}
-
-static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
-    LM_GGML_UNUSED(dev);
-}
-
-static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
-    return LM_GGML_BACKEND_DEVICE_TYPE_CPU;
-
-    LM_GGML_UNUSED(dev);
-}
-
-static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
-    props->name = lm_ggml_backend_cpu_device_get_name(dev);
-    props->description = lm_ggml_backend_cpu_device_get_description(dev);
-    props->type = lm_ggml_backend_cpu_device_get_type(dev);
-    lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
-    props->caps = {
-        /* .async = */ false,
-        /* .host_buffer = */ false,
-        /* .buffer_from_host_ptr = */ true,
-        /* .events = */ false,
-    };
-}
-
-static lm_ggml_backend_t lm_ggml_backend_cpu_device_init_backend(lm_ggml_backend_dev_t dev, const char * params) {
-    return lm_ggml_backend_cpu_init();
-
-    LM_GGML_UNUSED(dev);
-    LM_GGML_UNUSED(params);
-}
-
-static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
-    return lm_ggml_backend_cpu_buffer_type();
-
-    LM_GGML_UNUSED(dev);
-}
-
-static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_host_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
-
-    LM_GGML_UNUSED(dev);
-    LM_GGML_UNUSED(max_tensor_size);
-}
-
-static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
-    switch (op->op) {
-        case LM_GGML_OP_CPY:
-            return
-                op->type != LM_GGML_TYPE_IQ2_XXS &&
-                op->type != LM_GGML_TYPE_IQ2_XS &&
-                op->type != LM_GGML_TYPE_IQ1_S &&
-                op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
-        case LM_GGML_OP_MUL_MAT:
-            return op->src[1]->type == LM_GGML_TYPE_F32;// FIXME || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
-        case LM_GGML_OP_ROPE_BACK:
-            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
-        case LM_GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
-        case LM_GGML_OP_OUT_PROD:
-            return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
-        default:
-            return true;
-    }
-
-    LM_GGML_UNUSED(dev);
-}
-
-static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
-    return lm_ggml_backend_buft_is_host(buft);
-
-    LM_GGML_UNUSED(dev);
-}
-
-static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
-    /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
-    /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
-    /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
-    /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
-    /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
-    /* .init_backend = */ lm_ggml_backend_cpu_device_init_backend,
-    /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_host_ptr,
-    /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
-    /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
-    /* .offload_op = */ NULL,
-    /* .event_new = */ NULL,
-    /* .event_free = */ NULL,
-    /* .event_synchronize = */ NULL,
-};
-
-// CPU backend - backend (reg)
-
-static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
-    return "CPU";
-
-    LM_GGML_UNUSED(reg);
-}
-
-static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
-    return 1;
-
-    LM_GGML_UNUSED(reg);
-}
-
-static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
-    LM_GGML_ASSERT(index == 0);
-
-    static lm_ggml_backend_cpu_device_context ctx;
-    static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
-        /* .iface = */ lm_ggml_backend_cpu_device_i,
-        /* .reg = */ reg,
-        /* .context = */ &ctx,
-    };
-
-    return &lm_ggml_backend_cpu_device;
-}
-
-static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
-    if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
-        return (void *)lm_ggml_backend_cpu_set_n_threads;
-    }
-    if (strcmp(name, "lm_ggml_backend_dev_get_extra_bufts") == 0) {
-        return (void *)lm_ggml_backend_cpu_get_extra_bufts;
-    }
-
-    return NULL;
-
-    LM_GGML_UNUSED(reg);
-}
-
-static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
-    /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
-    /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
-    /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
-    /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
-};
-
-lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
-    static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
-        /* .iface = */ lm_ggml_backend_cpu_reg_i,
-        /* .context = */ NULL,
-    };
-
-    return &lm_ggml_backend_cpu_reg;
-}
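
Editor's note: the deletion above removes the entire CPU backend (stream, device, and registration objects) from ggml-backend.cpp; per the file list at the top, ggml-cpu.cpp (+715) now provides these entry points. Assuming the public API survives the move unchanged, typical use still looks like this sketch (illustrative only):

    // Create a CPU backend and set its thread count; free it with
    // lm_ggml_backend_free when done. The declarations are assumed to come
    // from the new ggml-cpu.h.
    #include "ggml-cpu.h"

    lm_ggml_backend_t make_cpu_backend(int n_threads) {
        lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
        if (backend != NULL) {
            lm_ggml_backend_cpu_set_n_threads(backend, n_threads);
        }
        return backend;
    }
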