@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
@@ -26,20 +26,25 @@ struct seq_draft {
26
26
  std::vector<llama_token> tokens;
27
27
  std::vector<std::vector<llama_token_data>> dists;
28
28
 
29
- struct gpt_sampler * smpl = nullptr;
29
+ struct common_sampler * smpl = nullptr;
30
30
  };
31
31
 
32
32
  int main(int argc, char ** argv) {
33
- gpt_params params;
33
+ common_params params;
34
34
 
35
35
  // needed to get candidate probs even for temp <= 0.0
36
36
  params.sparams.n_probs = 128;
37
37
 
38
- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
38
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
39
39
  return 1;
40
40
  }
41
41
 
42
- gpt_init();
42
+ if (params.n_predict < -1) {
43
+ LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
44
+ return 1;
45
+ }
46
+
47
+ common_init();
43
48
 
44
49
  if (params.model_draft.empty()) {
45
50
  LOG_ERR("%s: --model-draft is required\n", __func__);
@@ -66,7 +71,7 @@ int main(int argc, char ** argv) {
66
71
  llama_context * ctx_dft = NULL;
67
72
 
68
73
  // load the target model
69
- llama_init_result llama_init_tgt = llama_init_from_gpt_params(params);
74
+ common_init_result llama_init_tgt = common_init_from_params(params);
70
75
  model_tgt = llama_init_tgt.model;
71
76
  ctx_tgt = llama_init_tgt.context;
72
77
 
@@ -78,7 +83,7 @@ int main(int argc, char ** argv) {
78
83
  }
79
84
 
80
85
  params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
81
- llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
86
+ common_init_result llama_init_dft = common_init_from_params(params);
82
87
  model_dft = llama_init_dft.model;
83
88
  ctx_dft = llama_init_dft.context;
84
89
 
@@ -124,8 +129,8 @@ int main(int argc, char ** argv) {
124
129
  if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
125
130
  LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
126
131
  LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
127
- llama_token_to_piece(ctx_tgt, i).c_str(),
128
- llama_token_to_piece(ctx_dft, i).c_str());
132
+ common_token_to_piece(ctx_tgt, i).c_str(),
133
+ common_token_to_piece(ctx_dft, i).c_str());
129
134
  return 1;
130
135
  }
131
136
  }
@@ -134,7 +139,7 @@ int main(int argc, char ** argv) {
134
139
 
135
140
  // Tokenize the prompt
136
141
  std::vector<llama_token> inp;
137
- inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
142
+ inp = common_tokenize(ctx_tgt, params.prompt, true, true);
138
143
 
139
144
  const int max_context_size = llama_n_ctx(ctx_tgt);
140
145
  const int max_tokens_list_size = max_context_size - 4;
@@ -147,7 +152,7 @@ int main(int argc, char ** argv) {
147
152
  LOG("\n\n");
148
153
 
149
154
  for (auto id : inp) {
150
- LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
155
+ LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
151
156
  }
152
157
 
153
158
  const int n_input = inp.size();
@@ -155,9 +160,9 @@ int main(int argc, char ** argv) {
155
160
  const auto t_enc_start = ggml_time_us();
156
161
 
157
162
  // eval the prompt with both models
158
- llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
159
- llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
160
- llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0));
163
+ llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
164
+ llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1));
165
+ llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));
161
166
 
162
167
  const auto t_enc_end = ggml_time_us();
163
168
 
@@ -178,20 +183,18 @@ int main(int argc, char ** argv) {
178
183
  bool has_eos = false;
179
184
 
180
185
  // target model sampling context (reuse the llama_context's sampling instance)
181
- struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
182
-
183
- struct llama_sampler * softmax = llama_sampler_init_softmax();
186
+ struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
184
187
 
185
188
  // draft sequence data
186
189
  std::vector<seq_draft> drafts(n_seq_dft);
187
190
 
188
191
  for (int s = 0; s < n_seq_dft; ++s) {
189
- // allocate gpt_sampler for each draft sequence
190
- drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
192
+ // allocate llama_sampler for each draft sequence
193
+ drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
191
194
  }
192
195
 
193
- llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
194
- llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
196
+ llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
197
+ llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft);
195
198
 
196
199
  const auto t_dec_start = ggml_time_us();
197
200
 
@@ -229,9 +232,9 @@ int main(int argc, char ** argv) {
229
232
  bool accept = false;
230
233
  if (params.sparams.temp > 0) {
231
234
  // stochastic verification
232
- gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
235
+ common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
233
236
 
234
- auto & dist_tgt = *gpt_sampler_get_candidates(smpl);
237
+ auto & dist_tgt = *common_sampler_get_candidates(smpl);
235
238
 
236
239
  float p_tgt = 0.0f;
237
240
  float p_dft = 0.0f;
@@ -264,11 +267,12 @@ int main(int argc, char ** argv) {
264
267
  for (size_t i = 0; i < dist_tgt.size; i++) {
265
268
  if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
266
269
  p_tgt = dist_tgt.data[i].p;
270
+ break;
267
271
  }
272
+ }
273
+ for (size_t i = 0; i < dist_dft.size; i++) {
268
274
  if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
269
275
  p_dft = dist_dft.data[i].p;
270
- }
271
- if (p_tgt && p_dft) {
272
276
  break;
273
277
  }
274
278
  }
@@ -277,13 +281,13 @@ int main(int argc, char ** argv) {
277
281
  s_keep = s;
278
282
  accept = true;
279
283
  token_id = drafts[s].tokens[i_dft];
280
- token_str = llama_token_to_piece(ctx_tgt, token_id);
281
- gpt_sampler_accept(smpl, token_id, true);
284
+ token_str = common_token_to_piece(ctx_tgt, token_id);
285
+ common_sampler_accept(smpl, token_id, true);
282
286
 
283
287
  LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
284
288
  break;
285
289
  } else {
286
- LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
290
+ LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
287
291
  drafts[s].active = false;
288
292
 
289
293
  // calculate residual probability
@@ -349,19 +353,19 @@ int main(int argc, char ** argv) {
349
353
  const int idx = dist(rng);
350
354
 
351
355
  token_id = dist_tgt.data[idx].id;
352
- gpt_sampler_accept(smpl, token_id, true);
353
- token_str = llama_token_to_piece(ctx_tgt, token_id);
356
+ common_sampler_accept(smpl, token_id, true);
357
+ token_str = common_token_to_piece(ctx_tgt, token_id);
354
358
  }
355
359
  } else {
356
360
  // greedy verification
357
361
 
358
362
  // sample from the target model
359
363
  LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
360
- token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
364
+ token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
361
365
 
362
- gpt_sampler_accept(smpl, token_id, true);
366
+ common_sampler_accept(smpl, token_id, true);
363
367
 
364
- token_str = llama_token_to_piece(ctx_tgt, token_id);
368
+ token_str = common_token_to_piece(ctx_tgt, token_id);
365
369
 
366
370
  for (int s = 0; s < n_seq_dft; ++s) {
367
371
  if (!drafts[s].active) {
@@ -431,8 +435,8 @@ int main(int argc, char ** argv) {
431
435
  drafts[0].dists.push_back(std::vector<llama_token_data>());
432
436
  drafts[0].i_batch_tgt.push_back(0);
433
437
 
434
- llama_batch_clear(batch_dft);
435
- llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
438
+ common_batch_clear(batch_dft);
439
+ common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
436
440
 
437
441
  llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
438
442
  // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
@@ -441,14 +445,14 @@ int main(int argc, char ** argv) {
441
445
  ++n_past_dft;
442
446
  }
443
447
 
444
- if (n_predict > params.n_predict || has_eos) {
448
+ if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
445
449
  break;
446
450
  }
447
451
 
448
452
  if (drafts[0].smpl) {
449
- gpt_sampler_free(drafts[0].smpl);
453
+ common_sampler_free(drafts[0].smpl);
450
454
  }
451
- drafts[0].smpl = gpt_sampler_clone(smpl);
455
+ drafts[0].smpl = common_sampler_clone(smpl);
452
456
 
453
457
  int n_seq_cur = 1;
454
458
  int n_past_cur = n_past_dft;
@@ -461,8 +465,8 @@ int main(int argc, char ** argv) {
461
465
  drafts[0].drafting = true;
462
466
  drafts[0].i_batch_dft = 0;
463
467
 
464
- llama_batch_clear(batch_tgt);
465
- llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
468
+ common_batch_clear(batch_tgt);
469
+ common_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
466
470
 
467
471
  // sample n_draft tokens from the draft model using tree-based sampling
468
472
  for (int i = 0; i < n_draft; ++i) {
@@ -477,13 +481,13 @@ int main(int argc, char ** argv) {
477
481
  continue;
478
482
  }
479
483
 
480
- gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
484
+ common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
481
485
 
482
- const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
486
+ const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
483
487
 
484
488
  for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
485
489
  LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
486
- k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
490
+ k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
487
491
  }
488
492
 
489
493
  std::vector<int> sa(1, s);
@@ -518,9 +522,9 @@ int main(int argc, char ** argv) {
518
522
  drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
519
523
 
520
524
  if (drafts[n_seq_cur].smpl) {
521
- gpt_sampler_free(drafts[n_seq_cur].smpl);
525
+ common_sampler_free(drafts[n_seq_cur].smpl);
522
526
  }
523
- drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);
527
+ drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl);
524
528
 
525
529
  sa.push_back(n_seq_cur);
526
530
 
@@ -536,7 +540,7 @@ int main(int argc, char ** argv) {
536
540
 
537
541
  const int s = sa[is];
538
542
 
539
- gpt_sampler_accept(drafts[s].smpl, id, true);
543
+ common_sampler_accept(drafts[s].smpl, id, true);
540
544
 
541
545
  drafts[s].tokens.push_back(id);
542
546
  // save cur_p.data into drafts[s].dists
@@ -545,12 +549,12 @@ int main(int argc, char ** argv) {
545
549
  // add unique drafted tokens to the target batch
546
550
  drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
547
551
 
548
- llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
552
+ common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
549
553
 
550
554
  // add the token to the batch for batched decoding with the draft model
551
555
  drafts[s].i_batch_dft = batch_dft.n_tokens;
552
556
 
553
- llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
557
+ common_batch_add(batch_dft, id, n_past_cur, { s }, true);
554
558
 
555
559
  if (batch_tgt.n_tokens > n_draft) {
556
560
  drafts[s].drafting = false;
@@ -617,14 +621,13 @@ int main(int argc, char ** argv) {
617
621
 
618
622
  LOG_INF("\n");
619
623
  LOG_INF("target:\n\n");
620
- gpt_perf_print(ctx_tgt, smpl);
624
+ common_perf_print(ctx_tgt, smpl);
621
625
 
622
- gpt_sampler_free(smpl);
626
+ common_sampler_free(smpl);
623
627
  for (int s = 0; s < n_seq_dft; ++s) {
624
- gpt_sampler_free(drafts[s].smpl);
628
+ common_sampler_free(drafts[s].smpl);
625
629
  }
626
630
 
627
- llama_sampler_free(softmax);
628
631
  llama_batch_free(batch_dft);
629
632
 
630
633
  llama_free(ctx_tgt);
@@ -365,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) {
365
365
  const bool parse_special = !no_parse_special;
366
366
 
367
367
  std::vector<llama_token> tokens;
368
- tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
368
+ tokens = common_tokenize(model, prompt, add_bos, parse_special);
369
369
 
370
370
  if (printing_ids) {
371
371
  printf("[");
@@ -380,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
380
380
  } else {
381
381
  bool invalid_utf8 = false;
382
382
  printf("%6d -> '", tokens[i]);
383
- write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
383
+ write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
384
384
  if (invalid_utf8) {
385
385
  printf("' (utf-8 decode failure)\n");
386
386
  } else {
@@ -92,6 +92,7 @@ else()
92
92
  endif()
93
93
 
94
94
  option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
95
+ option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
95
96
 
96
97
  option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
97
98
  option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
@@ -99,6 +100,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
99
100
  option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
100
101
  option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
101
102
  option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
103
+ option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
104
+ option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
105
+ option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
102
106
  option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
103
107
  if (NOT MSVC)
104
108
  option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@@ -113,6 +117,7 @@ endif()
113
117
 
114
118
  # ggml core
115
119
  set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
120
+ option(GGML_CPU "ggml: enable CPU backend" ON)
116
121
 
117
122
  # 3rd party libs / backends
118
123
  option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
@@ -123,14 +128,9 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
123
128
 
124
129
  option(GGML_CUDA "ggml: use CUDA" OFF)
125
130
  option(GGML_MUSA "ggml: use MUSA" OFF)
126
- option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
127
131
  option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
128
132
  option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
129
- set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
130
- set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
131
133
  option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
132
- set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
133
- "ggml: iters./thread per block for Q2_K/Q6_K")
134
134
  set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
135
135
  "ggml: max. batch size for using peer access")
136
136
  option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
@@ -138,7 +138,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
138
138
  option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
139
139
  option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
140
140
 
141
- option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
141
+ option(GGML_HIP "ggml: use HIP" OFF)
142
142
  option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
143
143
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
144
144
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
@@ -150,6 +150,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
150
150
  option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
151
151
  option(GGML_KOMPUTE "ggml: use Kompute" OFF)
152
152
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
153
+ option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
153
154
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
154
155
  option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
155
156
  option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@@ -158,6 +159,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
158
159
  set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
159
160
  option(GGML_OPENMP "ggml: use OpenMP" ON)
160
161
  option(GGML_RPC "ggml: use RPC" OFF)
162
+ option(GGML_AMX "ggml: use AMX" OFF)
161
163
  option(GGML_SYCL "ggml: use SYCL" OFF)
162
164
  option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
163
165
  set (GGML_SYCL_TARGET "INTEL" CACHE STRING
@@ -214,13 +216,14 @@ include(CMakePackageConfigHelpers)
214
216
  # all public headers
215
217
  set(GGML_PUBLIC_HEADERS
216
218
  include/ggml.h
219
+ include/ggml-cpu.h
217
220
  include/ggml-alloc.h
218
221
  include/ggml-backend.h
219
222
  include/ggml-blas.h
220
223
  include/ggml-cann.h
221
224
  include/ggml-cuda.h
222
- include/ggml.h
223
225
  include/ggml-kompute.h
226
+ include/ggml-opt.h
224
227
  include/ggml-metal.h
225
228
  include/ggml-rpc.h
226
229
  include/ggml-sycl.h
@@ -233,12 +236,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
233
236
  install(TARGETS ggml PUBLIC_HEADER)
234
237
 
235
238
  if (BUILD_SHARED_LIBS)
236
- install(TARGETS ggml LIBRARY)
239
+ install(TARGETS ggml LIBRARY)
240
+ install(TARGETS ggml-base LIBRARY)
237
241
  endif()
238
242
 
243
+ # FIXME: this should be done in the backend cmake files
239
244
  if (GGML_METAL)
245
+ # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
240
246
  install(
241
- FILES src/ggml-metal.metal
247
+ FILES src/ggml-metal/ggml-metal.metal
242
248
  PERMISSIONS
243
249
  OWNER_READ
244
250
  OWNER_WRITE
@@ -0,0 +1,25 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-backend.h"
5
+
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ // buffer_type API
12
+ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
13
+
14
+ GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
15
+
16
+ // backend API
17
+ GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
18
+
19
+ GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
20
+
21
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
22
+
23
+ #ifdef __cplusplus
24
+ }
25
+ #endif
@@ -3,6 +3,20 @@
3
3
  #include "ggml.h"
4
4
  #include "ggml-alloc.h"
5
5
 
6
+ #ifdef GGML_BACKEND_SHARED
7
+ # if defined(_WIN32) && !defined(__MINGW32__)
8
+ # ifdef GGML_BACKEND_BUILD
9
+ # define GGML_BACKEND_API __declspec(dllexport) extern
10
+ # else
11
+ # define GGML_BACKEND_API __declspec(dllimport) extern
12
+ # endif
13
+ # else
14
+ # define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
15
+ # endif
16
+ #else
17
+ # define GGML_BACKEND_API extern
18
+ #endif
19
+
6
20
  #ifdef __cplusplus
7
21
  extern "C" {
8
22
  #endif
@@ -72,7 +86,7 @@ extern "C" {
72
86
  GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
73
87
  GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
74
88
 
75
- // "offset" refers to the offset of the tensor data for setting/getting data
89
+ // "offset" refers to the offset in tensor->data for setting/getting data
76
90
  GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
77
91
  GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
78
92
  GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
@@ -114,11 +128,12 @@ extern "C" {
114
128
  //
115
129
 
116
130
  enum ggml_backend_dev_type {
131
+ // CPU device using system memory
117
132
  GGML_BACKEND_DEVICE_TYPE_CPU,
133
+ // GPU device using dedicated memory
118
134
  GGML_BACKEND_DEVICE_TYPE_GPU,
119
- // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
120
- GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
121
- GGML_BACKEND_DEVICE_TYPE_GPU_FULL
135
+ // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
136
+ GGML_BACKEND_DEVICE_TYPE_ACCEL
122
137
  };
123
138
 
124
139
  // functionality supported by the device
@@ -127,6 +142,8 @@ extern "C" {
127
142
  bool async;
128
143
  // pinned host buffer
129
144
  bool host_buffer;
145
+ // creating buffers from host ptr
146
+ bool buffer_from_host_ptr;
130
147
  // event synchronization
131
148
  bool events;
132
149
  };
@@ -165,9 +182,14 @@ extern "C" {
165
182
  GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
166
183
  GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
167
184
 
185
+ // Common functions that may be obtained using ggml_backend_reg_get_proc_address
168
186
 
169
- // Functions that may be obtained using ggml_backend_reg_get_proc_address
170
- typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
187
+ // Split buffer type for tensor parallelism
188
+ typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
189
+ // Set the number of threads for the backend
190
+ typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
191
+ // Get additional buffer types provided by the device (returns a NULL-terminated array)
192
+ typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
171
193
 
172
194
  //
173
195
  // Backend registry
@@ -189,7 +211,7 @@ extern "C" {
189
211
  GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
190
212
  // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
191
213
  GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
192
- // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
214
+ // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
193
215
  GGML_API ggml_backend_t ggml_backend_init_best(void);
194
216
 
195
217
  //
@@ -220,14 +242,20 @@ extern "C" {
220
242
  ggml_backend_sched_reserve(sched, reserve_graph);
221
243
 
222
244
  // compute
223
- graph = build_graph(sched);
224
- ggml_backend_sched_graph_compute(sched, graph);
245
+ graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
246
+ for (int i = 0; i < 10; ++i) {
247
+ ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
248
+ }
225
249
 
226
250
  // if there are graph inputs:
227
- ggml_backend_sched_reset(sched);
228
- ggml_backend_sched_alloc_graph(sched, graph);
229
- ggml_backend_tensor_set(input_tensor, ...);
230
- ggml_backend_sched_graph_compute(sched, graph);
251
+ graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
252
+ ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
253
+ ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
254
+ ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
255
+ ggml_backend_sched_graph_compute(sched, graph); // execute the graph
256
+
257
+ // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
258
+ // allocate them statically via ggml_backend_alloc_ctx_tensors
231
259
  }
232
260
  */
233
261
 
@@ -242,7 +270,7 @@ extern "C" {
242
270
  //
243
271
  typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
244
272
 
245
- // Initialize a backend scheduler
273
+ // Initialize a backend scheduler, backends with low index are given priority over backends with high index
246
274
  GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
247
275
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
248
276
 
@@ -267,7 +295,9 @@ extern "C" {
267
295
  GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
268
296
  GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
269
297
 
270
- // Reset all assignments and allocators - must be called before changing the node backends
298
+ // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
299
+ // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
300
+ // The correct way to use this API is to discard the deallocated tensors and create new ones.
271
301
  GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
272
302
 
273
303
  // Set a callback to be called for each resulting node during graph compute
@@ -297,27 +327,10 @@ extern "C" {
297
327
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
298
328
  GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
299
329
 
300
- //
301
- // CPU backend
302
- //
303
-
304
- GGML_API ggml_backend_t ggml_backend_cpu_init(void);
305
-
306
- GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
307
- GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
308
- GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
309
- GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
310
-
311
- // Create a backend buffer from an existing pointer
330
+ // CPU buffer types are always available
312
331
  GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
313
332
  GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
314
333
 
315
- GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
316
-
317
- #ifdef GGML_USE_CPU_HBM
318
- GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
319
- #endif
320
-
321
334
  #ifdef __cplusplus
322
335
  }
323
336
  #endif
@@ -9,13 +9,15 @@ extern "C" {
9
9
  #endif
10
10
 
11
11
  // backend API
12
- GGML_API ggml_backend_t ggml_backend_blas_init(void);
12
+ GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
13
13
 
14
- GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
14
+ GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
15
15
 
16
16
  // number of threads used for conversion to float
17
17
  // for openblas and blis, this will also set the number of threads used for blas operations
18
- GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
18
+ GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
19
+
20
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
19
21
 
20
22
 
21
23
  #ifdef __cplusplus