@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/ggml/src/ggml-alloc.c
@@ -294,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->free_blocks[0].offset = 0;
     alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     alloc->max_size = 0;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    for (int i = 0; i < 1024; i++) {
+        alloc->allocated_tensors[i].tensor = NULL;
+    }
+#endif
 }
 
 static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
package/src/llama.cpp/ggml/src/ggml-backend-impl.h
@@ -9,144 +9,226 @@ extern "C" {
 #endif
 
     //
-    // Backend buffer
+    // Backend buffer type
     //
 
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
     struct ggml_backend_buffer_type_i {
-        const char *          (*GGML_CALL get_name)      (ggml_backend_buffer_type_t buft);
+        const char *          (*get_name)      (ggml_backend_buffer_type_t buft);
         // allocate a buffer of this type
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
+        ggml_backend_buffer_t (*alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
         // tensor alignment
-        size_t                (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
-        // max buffer size that can be allocated
-        size_t                (*GGML_CALL get_max_size)  (ggml_backend_buffer_type_t buft);
-        // data size needed to allocate the tensor, including padding
-        size_t                (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // check if tensor data is in host memory
-        bool                  (*GGML_CALL is_host)       (ggml_backend_buffer_type_t buft);
+        size_t                (*get_alignment) (ggml_backend_buffer_type_t buft);
+        // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
+        size_t                (*get_max_size)  (ggml_backend_buffer_type_t buft);
+        // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
+        size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
+        // (optional) check if tensor data is in host memory (defaults to false)
+        bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
         struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
+        ggml_backend_dev_t device;
+        void * context;
     };
 
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
+    //
+    // Backend buffer
+    //
 
     struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        const char * (*get_name)     (ggml_backend_buffer_t buffer);
+        // (optional) free the buffer
+        void         (*free_buffer)  (ggml_backend_buffer_t buffer);
+        // base address of the buffer
+        void *       (*get_base)     (ggml_backend_buffer_t buffer);
+        // (optional) initialize a tensor in the buffer (eg. add tensor extras)
+        void         (*init_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        // tensor data access
+        void         (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+        void         (*set_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*get_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
+        bool         (*cpy_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        // clear the entire buffer
+        void         (*clear)        (ggml_backend_buffer_t buffer, uint8_t value);
+        // (optional) reset any internal state due to tensor initialization, such as tensor extras
+        void         (*reset)        (ggml_backend_buffer_t buffer);
     };
 
     struct ggml_backend_buffer {
         struct ggml_backend_buffer_i  iface;
         ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
+        void * context;
         size_t size;
         enum ggml_backend_buffer_usage usage;
     };
 
-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t    buft,
-            struct ggml_backend_buffer_i         iface,
-                   ggml_backend_buffer_context_t context,
-                   size_t                        size);
+    ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t buft,
+            struct ggml_backend_buffer_i      iface,
+                   void *                     context,
+                   size_t                     size);
 
     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+    // multi-buffer
     // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
 
     //
-    // Backend
+    // Backend (stream)
    //
 
-    typedef void * ggml_backend_context_t;
-
     struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
+        const char * (*get_name)(ggml_backend_t backend);
 
-        void (*GGML_CALL free)(ggml_backend_t backend);
+        void (*free)(ggml_backend_t backend);
 
         // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
 
         // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
         // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
+        void (*synchronize)(ggml_backend_t backend);
 
-        // compute graph with a plan (not used currently)
-        // create a new plan for a graph
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // (optional) compute graph with a plan (not used currently)
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
-        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        void                      (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
         // compute the graph with the plan
-        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        enum ggml_status          (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph (always async if supported by the backend)
+        enum ggml_status          (*graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
+        //            new backends should implement the device interface instead
 
+        // These functions are being moved to the device interface
         // check if the backend can compute an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
 
         // check if the backend can use tensors allocated in a buffer type
-        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+        bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
 
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
-        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*offload_op)  (ggml_backend_t backend, const struct ggml_tensor * op);
 
         // (optional) event synchronization
-        // create a new event that can record events on this backend instance
-        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
-        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
-        // record an event on the backend instance that created it
-        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
-        // wait for an event on on a different backend instance
-        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
-        // block until an event is recorded
-        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
+        // record an event on this stream
+        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
+        // wait for an event on on a different stream
+        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
     };
 
     struct ggml_backend {
         ggml_guid_t guid;
-
         struct ggml_backend_i iface;
-        ggml_backend_context_t context;
+        ggml_backend_dev_t device;
+        void * context;
     };
 
     struct ggml_backend_event {
-        ggml_backend_t backend;
+        struct ggml_backend_device * device;
+        void * context;
+    };
+
+    //
+    // Backend device
+    //
+
+    // Note: if additional properties are needed, we should add a struct with all of them
+    //       the current functions to obtain the properties can remain, since they are more convenient for often used properties
+    struct ggml_backend_device_i {
+        // device name: short identifier for this device, such as "CPU" or "CUDA0"
+        const char * (*get_name)(ggml_backend_dev_t dev);
+
+        // device description: short informative description of the device, could be the model name
+        const char * (*get_description)(ggml_backend_dev_t dev);
+
+        // device memory in bytes
+        void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
+
+        // device type
+        enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
+
+        // device properties
+        void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
+
+        // backend (stream) initialization
+        ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
+
+        // preferred buffer type
+        ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
+
+        // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
+        ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
+
+        // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
+        ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
+
+        // check if the backend can compute an operation
+        bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
+
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
+
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
+        void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
+        void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+    };
+
+    struct ggml_backend_device {
+        struct ggml_backend_device_i iface;
+        ggml_backend_reg_t           reg;
         void * context;
     };
 
     //
-    // Backend registry
+    // Backend (reg)
     //
 
-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
+    struct ggml_backend_reg_i {
+        const char * (*get_name)(ggml_backend_reg_t reg);
+
+        // enumerate available devices
+        size_t             (*get_device_count)(ggml_backend_reg_t reg);
+        ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
+
+        // (optional) get a pointer to a function in the backend
+        // backends can add custom functions that are not part of the standard ggml-backend interface
+        void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
+    };
+
+    struct ggml_backend_reg {
+        // int api_version; // TODO: for dynamic loading
+        struct ggml_backend_reg_i iface;
+        void * context;
+    };
+
 
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+    // Internal backend registry API
+    void ggml_backend_register(ggml_backend_reg_t reg);
+    void ggml_backend_device_register(ggml_backend_dev_t device);
+    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
+    //       typedef ggml_backend_register_t * (*ggml_backend_init)(void);
 
 #ifdef __cplusplus
 }
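
To make the reworked registry interface in the ggml-backend-impl.h hunk above concrete, the sketch below shows how a hypothetical out-of-tree backend could describe itself through struct ggml_backend_reg_i and hand the result to ggml_backend_register(). Everything prefixed with my_ (and the "MYBACKEND" name) is invented for illustration only; the sketch assumes ggml-backend-impl.h declares the structs shown in the hunk and that ggml_backend_reg_t / ggml_backend_dev_t are the usual pointer typedefs from ggml-backend.h. This is not the registration code of any real backend in this release.

#include "ggml-backend-impl.h"

#include <stddef.h>

// registry callbacks for a hypothetical "MYBACKEND"
static const char * my_reg_get_name(ggml_backend_reg_t reg) {
    (void) reg;
    return "MYBACKEND";
}

static size_t my_reg_get_device_count(ggml_backend_reg_t reg) {
    (void) reg;
    return 0; // this toy registry does not expose any devices yet
}

static ggml_backend_dev_t my_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    (void) reg; (void) index;
    // a real backend would return a pointer to a struct ggml_backend_device whose
    // iface implements get_name, get_description, init_backend, ... from
    // struct ggml_backend_device_i above
    return NULL;
}

static void * my_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    (void) reg; (void) name;
    return NULL; // no custom entry points beyond the standard interface
}

static struct ggml_backend_reg my_reg = {
    /* .iface   = */ {
        /* .get_name         = */ my_reg_get_name,
        /* .get_device_count = */ my_reg_get_device_count,
        /* .get_device       = */ my_reg_get_device,
        /* .get_proc_address = */ my_reg_get_proc_address,
    },
    /* .context = */ NULL,
};

// called once at startup; each device the registry exposes would additionally
// be passed to ggml_backend_device_register()
void my_backend_register(void) {
    ggml_backend_register(&my_reg);
}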