@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
|
@@ -26,20 +26,25 @@ struct seq_draft {
|
|
|
26
26
|
std::vector<llama_token> tokens;
|
|
27
27
|
std::vector<std::vector<llama_token_data>> dists;
|
|
28
28
|
|
|
29
|
-
struct
|
|
29
|
+
struct common_sampler * smpl = nullptr;
|
|
30
30
|
};
|
|
31
31
|
|
|
32
32
|
int main(int argc, char ** argv) {
|
|
33
|
-
|
|
33
|
+
common_params params;
|
|
34
34
|
|
|
35
35
|
// needed to get candidate probs even for temp <= 0.0
|
|
36
36
|
params.sparams.n_probs = 128;
|
|
37
37
|
|
|
38
|
-
if (!
|
|
38
|
+
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
|
39
39
|
return 1;
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
-
|
|
42
|
+
if (params.n_predict < -1) {
|
|
43
|
+
LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
|
|
44
|
+
return 1;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
common_init();
|
|
43
48
|
|
|
44
49
|
if (params.model_draft.empty()) {
|
|
45
50
|
LOG_ERR("%s: --model-draft is required\n", __func__);
|
|
@@ -66,7 +71,7 @@ int main(int argc, char ** argv) {
|
|
|
66
71
|
llama_context * ctx_dft = NULL;
|
|
67
72
|
|
|
68
73
|
// load the target model
|
|
69
|
-
|
|
74
|
+
common_init_result llama_init_tgt = common_init_from_params(params);
|
|
70
75
|
model_tgt = llama_init_tgt.model;
|
|
71
76
|
ctx_tgt = llama_init_tgt.context;
|
|
72
77
|
|
|
@@ -78,7 +83,7 @@ int main(int argc, char ** argv) {
|
|
|
78
83
|
}
|
|
79
84
|
|
|
80
85
|
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
|
|
81
|
-
|
|
86
|
+
common_init_result llama_init_dft = common_init_from_params(params);
|
|
82
87
|
model_dft = llama_init_dft.model;
|
|
83
88
|
ctx_dft = llama_init_dft.context;
|
|
84
89
|
|
|
@@ -124,8 +129,8 @@ int main(int argc, char ** argv) {
|
|
|
124
129
|
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
|
125
130
|
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
|
|
126
131
|
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
|
|
127
|
-
|
|
128
|
-
|
|
132
|
+
common_token_to_piece(ctx_tgt, i).c_str(),
|
|
133
|
+
common_token_to_piece(ctx_dft, i).c_str());
|
|
129
134
|
return 1;
|
|
130
135
|
}
|
|
131
136
|
}
|
|
@@ -134,7 +139,7 @@ int main(int argc, char ** argv) {
|
|
|
134
139
|
|
|
135
140
|
// Tokenize the prompt
|
|
136
141
|
std::vector<llama_token> inp;
|
|
137
|
-
inp =
|
|
142
|
+
inp = common_tokenize(ctx_tgt, params.prompt, true, true);
|
|
138
143
|
|
|
139
144
|
const int max_context_size = llama_n_ctx(ctx_tgt);
|
|
140
145
|
const int max_tokens_list_size = max_context_size - 4;
|
|
@@ -147,7 +152,7 @@ int main(int argc, char ** argv) {
|
|
|
147
152
|
LOG("\n\n");
|
|
148
153
|
|
|
149
154
|
for (auto id : inp) {
|
|
150
|
-
LOG("%s",
|
|
155
|
+
LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
|
|
151
156
|
}
|
|
152
157
|
|
|
153
158
|
const int n_input = inp.size();
|
|
@@ -155,9 +160,9 @@ int main(int argc, char ** argv) {
|
|
|
155
160
|
const auto t_enc_start = ggml_time_us();
|
|
156
161
|
|
|
157
162
|
// eval the prompt with both models
|
|
158
|
-
llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1
|
|
159
|
-
llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1
|
|
160
|
-
llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input
|
|
163
|
+
llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
|
|
164
|
+
llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1));
|
|
165
|
+
llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));
|
|
161
166
|
|
|
162
167
|
const auto t_enc_end = ggml_time_us();
|
|
163
168
|
|
|
@@ -178,20 +183,18 @@ int main(int argc, char ** argv) {
|
|
|
178
183
|
bool has_eos = false;
|
|
179
184
|
|
|
180
185
|
// target model sampling context (reuse the llama_context's sampling instance)
|
|
181
|
-
struct
|
|
182
|
-
|
|
183
|
-
struct llama_sampler * softmax = llama_sampler_init_softmax();
|
|
186
|
+
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
|
|
184
187
|
|
|
185
188
|
// draft sequence data
|
|
186
189
|
std::vector<seq_draft> drafts(n_seq_dft);
|
|
187
190
|
|
|
188
191
|
for (int s = 0; s < n_seq_dft; ++s) {
|
|
189
|
-
// allocate
|
|
190
|
-
drafts[s].smpl =
|
|
192
|
+
// allocate llama_sampler for each draft sequence
|
|
193
|
+
drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
|
|
191
194
|
}
|
|
192
195
|
|
|
193
|
-
llama_batch batch_dft = llama_batch_init(
|
|
194
|
-
llama_batch batch_tgt = llama_batch_init(
|
|
196
|
+
llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
|
197
|
+
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft);
|
|
195
198
|
|
|
196
199
|
const auto t_dec_start = ggml_time_us();
|
|
197
200
|
|
|
@@ -229,9 +232,9 @@ int main(int argc, char ** argv) {
|
|
|
229
232
|
bool accept = false;
|
|
230
233
|
if (params.sparams.temp > 0) {
|
|
231
234
|
// stochastic verification
|
|
232
|
-
|
|
235
|
+
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
|
233
236
|
|
|
234
|
-
auto & dist_tgt = *
|
|
237
|
+
auto & dist_tgt = *common_sampler_get_candidates(smpl);
|
|
235
238
|
|
|
236
239
|
float p_tgt = 0.0f;
|
|
237
240
|
float p_dft = 0.0f;
|
|
@@ -264,11 +267,12 @@ int main(int argc, char ** argv) {
|
|
|
264
267
|
for (size_t i = 0; i < dist_tgt.size; i++) {
|
|
265
268
|
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
|
266
269
|
p_tgt = dist_tgt.data[i].p;
|
|
270
|
+
break;
|
|
267
271
|
}
|
|
272
|
+
}
|
|
273
|
+
for (size_t i = 0; i < dist_dft.size; i++) {
|
|
268
274
|
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
|
|
269
275
|
p_dft = dist_dft.data[i].p;
|
|
270
|
-
}
|
|
271
|
-
if (p_tgt && p_dft) {
|
|
272
276
|
break;
|
|
273
277
|
}
|
|
274
278
|
}
|
|
@@ -277,13 +281,13 @@ int main(int argc, char ** argv) {
|
|
|
277
281
|
s_keep = s;
|
|
278
282
|
accept = true;
|
|
279
283
|
token_id = drafts[s].tokens[i_dft];
|
|
280
|
-
token_str =
|
|
281
|
-
|
|
284
|
+
token_str = common_token_to_piece(ctx_tgt, token_id);
|
|
285
|
+
common_sampler_accept(smpl, token_id, true);
|
|
282
286
|
|
|
283
287
|
LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
|
284
288
|
break;
|
|
285
289
|
} else {
|
|
286
|
-
LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft],
|
|
290
|
+
LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
|
|
287
291
|
drafts[s].active = false;
|
|
288
292
|
|
|
289
293
|
// calculate residual probability
|
|
@@ -349,19 +353,19 @@ int main(int argc, char ** argv) {
|
|
|
349
353
|
const int idx = dist(rng);
|
|
350
354
|
|
|
351
355
|
token_id = dist_tgt.data[idx].id;
|
|
352
|
-
|
|
353
|
-
token_str =
|
|
356
|
+
common_sampler_accept(smpl, token_id, true);
|
|
357
|
+
token_str = common_token_to_piece(ctx_tgt, token_id);
|
|
354
358
|
}
|
|
355
359
|
} else {
|
|
356
360
|
// greedy verification
|
|
357
361
|
|
|
358
362
|
// sample from the target model
|
|
359
363
|
LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
|
360
|
-
token_id =
|
|
364
|
+
token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
|
|
361
365
|
|
|
362
|
-
|
|
366
|
+
common_sampler_accept(smpl, token_id, true);
|
|
363
367
|
|
|
364
|
-
token_str =
|
|
368
|
+
token_str = common_token_to_piece(ctx_tgt, token_id);
|
|
365
369
|
|
|
366
370
|
for (int s = 0; s < n_seq_dft; ++s) {
|
|
367
371
|
if (!drafts[s].active) {
|
|
@@ -431,8 +435,8 @@ int main(int argc, char ** argv) {
|
|
|
431
435
|
drafts[0].dists.push_back(std::vector<llama_token_data>());
|
|
432
436
|
drafts[0].i_batch_tgt.push_back(0);
|
|
433
437
|
|
|
434
|
-
|
|
435
|
-
|
|
438
|
+
common_batch_clear(batch_dft);
|
|
439
|
+
common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
|
|
436
440
|
|
|
437
441
|
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
|
438
442
|
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
|
@@ -441,14 +445,14 @@ int main(int argc, char ** argv) {
|
|
|
441
445
|
++n_past_dft;
|
|
442
446
|
}
|
|
443
447
|
|
|
444
|
-
if (n_predict > params.n_predict || has_eos) {
|
|
448
|
+
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
|
445
449
|
break;
|
|
446
450
|
}
|
|
447
451
|
|
|
448
452
|
if (drafts[0].smpl) {
|
|
449
|
-
|
|
453
|
+
common_sampler_free(drafts[0].smpl);
|
|
450
454
|
}
|
|
451
|
-
drafts[0].smpl =
|
|
455
|
+
drafts[0].smpl = common_sampler_clone(smpl);
|
|
452
456
|
|
|
453
457
|
int n_seq_cur = 1;
|
|
454
458
|
int n_past_cur = n_past_dft;
|
|
@@ -461,8 +465,8 @@ int main(int argc, char ** argv) {
|
|
|
461
465
|
drafts[0].drafting = true;
|
|
462
466
|
drafts[0].i_batch_dft = 0;
|
|
463
467
|
|
|
464
|
-
|
|
465
|
-
|
|
468
|
+
common_batch_clear(batch_tgt);
|
|
469
|
+
common_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
|
|
466
470
|
|
|
467
471
|
// sample n_draft tokens from the draft model using tree-based sampling
|
|
468
472
|
for (int i = 0; i < n_draft; ++i) {
|
|
@@ -477,13 +481,13 @@ int main(int argc, char ** argv) {
|
|
|
477
481
|
continue;
|
|
478
482
|
}
|
|
479
483
|
|
|
480
|
-
|
|
484
|
+
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
|
|
481
485
|
|
|
482
|
-
const auto * cur_p =
|
|
486
|
+
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
|
|
483
487
|
|
|
484
488
|
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
|
485
489
|
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
|
486
|
-
k, s, i, cur_p->data[k].id, cur_p->data[k].p,
|
|
490
|
+
k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
|
487
491
|
}
|
|
488
492
|
|
|
489
493
|
std::vector<int> sa(1, s);
|
|
@@ -518,9 +522,9 @@ int main(int argc, char ** argv) {
|
|
|
518
522
|
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
|
|
519
523
|
|
|
520
524
|
if (drafts[n_seq_cur].smpl) {
|
|
521
|
-
|
|
525
|
+
common_sampler_free(drafts[n_seq_cur].smpl);
|
|
522
526
|
}
|
|
523
|
-
drafts[n_seq_cur].smpl =
|
|
527
|
+
drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl);
|
|
524
528
|
|
|
525
529
|
sa.push_back(n_seq_cur);
|
|
526
530
|
|
|
@@ -536,7 +540,7 @@ int main(int argc, char ** argv) {
|
|
|
536
540
|
|
|
537
541
|
const int s = sa[is];
|
|
538
542
|
|
|
539
|
-
|
|
543
|
+
common_sampler_accept(drafts[s].smpl, id, true);
|
|
540
544
|
|
|
541
545
|
drafts[s].tokens.push_back(id);
|
|
542
546
|
// save cur_p.data into drafts[s].dists
|
|
@@ -545,12 +549,12 @@ int main(int argc, char ** argv) {
|
|
|
545
549
|
// add unique drafted tokens to the target batch
|
|
546
550
|
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
|
547
551
|
|
|
548
|
-
|
|
552
|
+
common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
|
|
549
553
|
|
|
550
554
|
// add the token to the batch for batched decoding with the draft model
|
|
551
555
|
drafts[s].i_batch_dft = batch_dft.n_tokens;
|
|
552
556
|
|
|
553
|
-
|
|
557
|
+
common_batch_add(batch_dft, id, n_past_cur, { s }, true);
|
|
554
558
|
|
|
555
559
|
if (batch_tgt.n_tokens > n_draft) {
|
|
556
560
|
drafts[s].drafting = false;
|
|
@@ -617,14 +621,13 @@ int main(int argc, char ** argv) {
|
|
|
617
621
|
|
|
618
622
|
LOG_INF("\n");
|
|
619
623
|
LOG_INF("target:\n\n");
|
|
620
|
-
|
|
624
|
+
common_perf_print(ctx_tgt, smpl);
|
|
621
625
|
|
|
622
|
-
|
|
626
|
+
common_sampler_free(smpl);
|
|
623
627
|
for (int s = 0; s < n_seq_dft; ++s) {
|
|
624
|
-
|
|
628
|
+
common_sampler_free(drafts[s].smpl);
|
|
625
629
|
}
|
|
626
630
|
|
|
627
|
-
llama_sampler_free(softmax);
|
|
628
631
|
llama_batch_free(batch_dft);
|
|
629
632
|
|
|
630
633
|
llama_free(ctx_tgt);
|
|
@@ -365,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) {
|
|
|
365
365
|
const bool parse_special = !no_parse_special;
|
|
366
366
|
|
|
367
367
|
std::vector<llama_token> tokens;
|
|
368
|
-
tokens =
|
|
368
|
+
tokens = common_tokenize(model, prompt, add_bos, parse_special);
|
|
369
369
|
|
|
370
370
|
if (printing_ids) {
|
|
371
371
|
printf("[");
|
|
@@ -380,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
|
|
|
380
380
|
} else {
|
|
381
381
|
bool invalid_utf8 = false;
|
|
382
382
|
printf("%6d -> '", tokens[i]);
|
|
383
|
-
write_utf8_cstr_to_stdout(
|
|
383
|
+
write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
|
|
384
384
|
if (invalid_utf8) {
|
|
385
385
|
printf("' (utf-8 decode failure)\n");
|
|
386
386
|
} else {
|
|
@@ -92,6 +92,7 @@ else()
|
|
|
92
92
|
endif()
|
|
93
93
|
|
|
94
94
|
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
|
95
|
+
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
|
95
96
|
|
|
96
97
|
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
|
97
98
|
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
|
@@ -99,6 +100,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
|
|
|
99
100
|
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
|
100
101
|
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
|
|
101
102
|
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
|
|
103
|
+
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
|
|
104
|
+
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
|
|
105
|
+
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
|
|
102
106
|
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
|
|
103
107
|
if (NOT MSVC)
|
|
104
108
|
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
|
|
@@ -113,6 +117,7 @@ endif()
|
|
|
113
117
|
|
|
114
118
|
# ggml core
|
|
115
119
|
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
|
|
120
|
+
option(GGML_CPU "ggml: enable CPU backend" ON)
|
|
116
121
|
|
|
117
122
|
# 3rd party libs / backends
|
|
118
123
|
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
|
|
@@ -123,14 +128,9 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
|
|
|
123
128
|
|
|
124
129
|
option(GGML_CUDA "ggml: use CUDA" OFF)
|
|
125
130
|
option(GGML_MUSA "ggml: use MUSA" OFF)
|
|
126
|
-
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
|
127
131
|
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
|
128
132
|
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
|
|
129
|
-
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
|
|
130
|
-
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
|
|
131
133
|
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
|
132
|
-
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
|
|
133
|
-
"ggml: iters./thread per block for Q2_K/Q6_K")
|
|
134
134
|
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
|
135
135
|
"ggml: max. batch size for using peer access")
|
|
136
136
|
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
|
@@ -138,7 +138,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
|
|
|
138
138
|
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
|
139
139
|
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
|
140
140
|
|
|
141
|
-
option(
|
|
141
|
+
option(GGML_HIP "ggml: use HIP" OFF)
|
|
142
142
|
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
|
143
143
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
|
144
144
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
|
@@ -150,6 +150,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
|
|
|
150
150
|
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
|
151
151
|
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
|
152
152
|
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
|
153
|
+
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
|
153
154
|
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
|
154
155
|
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
|
|
155
156
|
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
|
|
@@ -158,6 +159,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
|
|
158
159
|
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
|
|
159
160
|
option(GGML_OPENMP "ggml: use OpenMP" ON)
|
|
160
161
|
option(GGML_RPC "ggml: use RPC" OFF)
|
|
162
|
+
option(GGML_AMX "ggml: use AMX" OFF)
|
|
161
163
|
option(GGML_SYCL "ggml: use SYCL" OFF)
|
|
162
164
|
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
|
163
165
|
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
|
@@ -214,13 +216,14 @@ include(CMakePackageConfigHelpers)
|
|
|
214
216
|
# all public headers
|
|
215
217
|
set(GGML_PUBLIC_HEADERS
|
|
216
218
|
include/ggml.h
|
|
219
|
+
include/ggml-cpu.h
|
|
217
220
|
include/ggml-alloc.h
|
|
218
221
|
include/ggml-backend.h
|
|
219
222
|
include/ggml-blas.h
|
|
220
223
|
include/ggml-cann.h
|
|
221
224
|
include/ggml-cuda.h
|
|
222
|
-
include/ggml.h
|
|
223
225
|
include/ggml-kompute.h
|
|
226
|
+
include/ggml-opt.h
|
|
224
227
|
include/ggml-metal.h
|
|
225
228
|
include/ggml-rpc.h
|
|
226
229
|
include/ggml-sycl.h
|
|
@@ -233,12 +236,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
|
|
233
236
|
install(TARGETS ggml PUBLIC_HEADER)
|
|
234
237
|
|
|
235
238
|
if (BUILD_SHARED_LIBS)
|
|
236
|
-
install(TARGETS ggml
|
|
239
|
+
install(TARGETS ggml LIBRARY)
|
|
240
|
+
install(TARGETS ggml-base LIBRARY)
|
|
237
241
|
endif()
|
|
238
242
|
|
|
243
|
+
# FIXME: this should be done in the backend cmake files
|
|
239
244
|
if (GGML_METAL)
|
|
245
|
+
# FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
|
|
240
246
|
install(
|
|
241
|
-
FILES src/ggml-metal.metal
|
|
247
|
+
FILES src/ggml-metal/ggml-metal.metal
|
|
242
248
|
PERMISSIONS
|
|
243
249
|
OWNER_READ
|
|
244
250
|
OWNER_WRITE
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "ggml.h"
|
|
4
|
+
#include "ggml-backend.h"
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
#ifdef __cplusplus
|
|
8
|
+
extern "C" {
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
// buffer_type API
|
|
12
|
+
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
|
13
|
+
|
|
14
|
+
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
|
|
15
|
+
|
|
16
|
+
// backend API
|
|
17
|
+
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
|
|
18
|
+
|
|
19
|
+
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
|
|
20
|
+
|
|
21
|
+
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
|
|
22
|
+
|
|
23
|
+
#ifdef __cplusplus
|
|
24
|
+
}
|
|
25
|
+
#endif
|
|
@@ -3,6 +3,20 @@
|
|
|
3
3
|
#include "ggml.h"
|
|
4
4
|
#include "ggml-alloc.h"
|
|
5
5
|
|
|
6
|
+
#ifdef GGML_BACKEND_SHARED
|
|
7
|
+
# if defined(_WIN32) && !defined(__MINGW32__)
|
|
8
|
+
# ifdef GGML_BACKEND_BUILD
|
|
9
|
+
# define GGML_BACKEND_API __declspec(dllexport) extern
|
|
10
|
+
# else
|
|
11
|
+
# define GGML_BACKEND_API __declspec(dllimport) extern
|
|
12
|
+
# endif
|
|
13
|
+
# else
|
|
14
|
+
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
|
|
15
|
+
# endif
|
|
16
|
+
#else
|
|
17
|
+
# define GGML_BACKEND_API extern
|
|
18
|
+
#endif
|
|
19
|
+
|
|
6
20
|
#ifdef __cplusplus
|
|
7
21
|
extern "C" {
|
|
8
22
|
#endif
|
|
@@ -72,7 +86,7 @@ extern "C" {
|
|
|
72
86
|
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
73
87
|
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
74
88
|
|
|
75
|
-
// "offset" refers to the offset
|
|
89
|
+
// "offset" refers to the offset in tensor->data for setting/getting data
|
|
76
90
|
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
|
77
91
|
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
|
78
92
|
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
|
@@ -114,11 +128,12 @@ extern "C" {
|
|
|
114
128
|
//
|
|
115
129
|
|
|
116
130
|
enum ggml_backend_dev_type {
|
|
131
|
+
// CPU device using system memory
|
|
117
132
|
GGML_BACKEND_DEVICE_TYPE_CPU,
|
|
133
|
+
// GPU device using dedicated memory
|
|
118
134
|
GGML_BACKEND_DEVICE_TYPE_GPU,
|
|
119
|
-
// devices
|
|
120
|
-
|
|
121
|
-
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
|
|
135
|
+
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
|
136
|
+
GGML_BACKEND_DEVICE_TYPE_ACCEL
|
|
122
137
|
};
|
|
123
138
|
|
|
124
139
|
// functionality supported by the device
|
|
@@ -127,6 +142,8 @@ extern "C" {
|
|
|
127
142
|
bool async;
|
|
128
143
|
// pinned host buffer
|
|
129
144
|
bool host_buffer;
|
|
145
|
+
// creating buffers from host ptr
|
|
146
|
+
bool buffer_from_host_ptr;
|
|
130
147
|
// event synchronization
|
|
131
148
|
bool events;
|
|
132
149
|
};
|
|
@@ -165,9 +182,14 @@ extern "C" {
|
|
|
165
182
|
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
|
|
166
183
|
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
|
|
167
184
|
|
|
185
|
+
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
|
|
168
186
|
|
|
169
|
-
//
|
|
170
|
-
typedef ggml_backend_buffer_type_t
|
|
187
|
+
// Split buffer type for tensor parallelism
|
|
188
|
+
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
|
|
189
|
+
// Set the number of threads for the backend
|
|
190
|
+
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
|
191
|
+
// Get additional buffer types provided by the device (returns a NULL-terminated array)
|
|
192
|
+
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
|
|
171
193
|
|
|
172
194
|
//
|
|
173
195
|
// Backend registry
|
|
@@ -189,7 +211,7 @@ extern "C" {
|
|
|
189
211
|
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
|
|
190
212
|
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
|
|
191
213
|
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
|
|
192
|
-
// = ggml_backend_dev_init(ggml_backend_dev_by_type(
|
|
214
|
+
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
|
|
193
215
|
GGML_API ggml_backend_t ggml_backend_init_best(void);
|
|
194
216
|
|
|
195
217
|
//
|
|
@@ -220,14 +242,20 @@ extern "C" {
|
|
|
220
242
|
ggml_backend_sched_reserve(sched, reserve_graph);
|
|
221
243
|
|
|
222
244
|
// compute
|
|
223
|
-
graph = build_graph(sched);
|
|
224
|
-
|
|
245
|
+
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
|
|
246
|
+
for (int i = 0; i < 10; ++i) {
|
|
247
|
+
ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
|
|
248
|
+
}
|
|
225
249
|
|
|
226
250
|
// if there are graph inputs:
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
251
|
+
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
|
|
252
|
+
ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
|
|
253
|
+
ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
|
|
254
|
+
ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
|
|
255
|
+
ggml_backend_sched_graph_compute(sched, graph); // execute the graph
|
|
256
|
+
|
|
257
|
+
// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
|
|
258
|
+
// allocate them statically via ggml_backend_alloc_ctx_tensors
|
|
231
259
|
}
|
|
232
260
|
*/
|
|
233
261
|
|
|
@@ -242,7 +270,7 @@ extern "C" {
|
|
|
242
270
|
//
|
|
243
271
|
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
|
244
272
|
|
|
245
|
-
// Initialize a backend scheduler
|
|
273
|
+
// Initialize a backend scheduler; backends with a lower index are given priority over backends with a higher index
|
|
246
274
|
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
|
247
275
|
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
|
248
276
|
|
|
@@ -267,7 +295,9 @@ extern "C" {
|
|
|
267
295
|
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
|
268
296
|
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
|
|
269
297
|
|
|
270
|
-
// Reset all assignments and allocators - must be called before changing the node backends
|
|
298
|
+
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
|
|
299
|
+
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
|
|
300
|
+
// The correct way to use this API is to discard the deallocated tensors and create new ones.
|
|
271
301
|
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
|
272
302
|
|
|
273
303
|
// Set a callback to be called for each resulting node during graph compute
|
|
@@ -297,27 +327,10 @@ extern "C" {
|
|
|
297
327
|
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
|
298
328
|
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
|
299
329
|
|
|
300
|
-
//
|
|
301
|
-
// CPU backend
|
|
302
|
-
//
|
|
303
|
-
|
|
304
|
-
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
|
305
|
-
|
|
306
|
-
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
|
307
|
-
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
|
308
|
-
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
|
309
|
-
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
|
310
|
-
|
|
311
|
-
// Create a backend buffer from an existing pointer
|
|
330
|
+
// CPU buffer types are always available
|
|
312
331
|
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
|
313
332
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
|
314
333
|
|
|
315
|
-
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
316
|
-
|
|
317
|
-
#ifdef GGML_USE_CPU_HBM
|
|
318
|
-
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
|
319
|
-
#endif
|
|
320
|
-
|
|
321
334
|
#ifdef __cplusplus
|
|
322
335
|
}
|
|
323
336
|
#endif
|
|
@@ -9,13 +9,15 @@ extern "C" {
|
|
|
9
9
|
#endif
|
|
10
10
|
|
|
11
11
|
// backend API
|
|
12
|
-
|
|
12
|
+
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
|
15
15
|
|
|
16
16
|
// number of threads used for conversion to float
|
|
17
17
|
// for OpenBLAS and BLIS, this will also set the number of threads used for BLAS operations
|
|
18
|
-
|
|
18
|
+
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
|
19
|
+
|
|
20
|
+
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
#ifdef __cplusplus
|