npm - @fugood/llama.node - Versions diffs - 0.3.2 → 0.3.3 - Mend

@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (190)

package/CMakeLists.txt +2 -0
package/bin/darwin/arm64/llama-node.node +0 -0
package/bin/darwin/x64/llama-node.node +0 -0
package/bin/linux/arm64/llama-node.node +0 -0
package/bin/linux/x64/llama-node.node +0 -0
package/bin/linux-vulkan/arm64/llama-node.node +0 -0
package/bin/linux-vulkan/x64/llama-node.node +0 -0
package/bin/win32/arm64/llama-node.node +0 -0
package/bin/win32/arm64/node.lib +0 -0
package/bin/win32/x64/llama-node.node +0 -0
package/bin/win32/x64/node.lib +0 -0
package/bin/win32-vulkan/arm64/llama-node.node +0 -0
package/bin/win32-vulkan/arm64/node.lib +0 -0
package/bin/win32-vulkan/x64/llama-node.node +0 -0
package/bin/win32-vulkan/x64/node.lib +0 -0
package/package.json +1 -1
package/src/DetokenizeWorker.cpp +1 -1
package/src/EmbeddingWorker.cpp +2 -2
package/src/LlamaCompletionWorker.cpp +8 -8
package/src/LlamaCompletionWorker.h +2 -2
package/src/LlamaContext.cpp +8 -9
package/src/TokenizeWorker.cpp +1 -1
package/src/common.hpp +4 -4
package/src/llama.cpp/.github/workflows/build.yml +43 -9
package/src/llama.cpp/.github/workflows/docker.yml +3 -0
package/src/llama.cpp/CMakeLists.txt +7 -4
package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
package/src/llama.cpp/common/CMakeLists.txt +0 -2
package/src/llama.cpp/common/arg.cpp +642 -607
package/src/llama.cpp/common/arg.h +22 -22
package/src/llama.cpp/common/common.cpp +79 -281
package/src/llama.cpp/common/common.h +130 -100
package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
package/src/llama.cpp/common/log.cpp +50 -50
package/src/llama.cpp/common/log.h +18 -18
package/src/llama.cpp/common/ngram-cache.cpp +36 -36
package/src/llama.cpp/common/ngram-cache.h +19 -19
package/src/llama.cpp/common/sampling.cpp +116 -108
package/src/llama.cpp/common/sampling.h +20 -20
package/src/llama.cpp/docs/build.md +37 -17
package/src/llama.cpp/examples/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/batched/batched.cpp +14 -14
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
package/src/llama.cpp/examples/infill/infill.cpp +40 -86
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
package/src/llama.cpp/examples/llava/clip.cpp +1 -0
package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
package/src/llama.cpp/examples/llava/llava.cpp +37 -3
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
package/src/llama.cpp/examples/main/main.cpp +64 -109
package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
package/src/llama.cpp/examples/server/server.cpp +553 -691
package/src/llama.cpp/examples/server/utils.hpp +312 -25
package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/simple/simple.cpp +128 -96
package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
package/src/llama.cpp/ggml/include/ggml.h +53 -393
package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
package/src/llama.cpp/include/llama.h +67 -33
package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
package/src/llama.cpp/src/CMakeLists.txt +2 -1
package/src/llama.cpp/src/llama-sampling.cpp +745 -105
package/src/llama.cpp/src/llama-sampling.h +21 -2
package/src/llama.cpp/src/llama-vocab.cpp +49 -9
package/src/llama.cpp/src/llama-vocab.h +35 -11
package/src/llama.cpp/src/llama.cpp +2636 -2406
package/src/llama.cpp/src/unicode-data.cpp +2 -2
package/src/llama.cpp/tests/CMakeLists.txt +1 -2
package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
package/src/llama.cpp/tests/test-barrier.cpp +1 -0
package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
package/src/llama.cpp/tests/test-log.cpp +2 -2
package/src/llama.cpp/tests/test-opt.cpp +853 -142
package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
package/src/llama.cpp/tests/test-rope.cpp +1 -0
package/src/llama.cpp/tests/test-sampling.cpp +162 -137
package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
package/src/llama.cpp/common/train.cpp +0 -1515
package/src/llama.cpp/common/train.h +0 -233
package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0

package/src/llama.cpp/examples/speculative/speculative.cpp CHANGED Viewed

@@ -26,20 +26,25 @@ struct seq_draft {
     std::vector<llama_token> tokens;
     std::vector<std::vector<llama_token_data>> dists;
-    struct gpt_sampler * smpl = nullptr;
+    struct common_sampler * smpl = nullptr;
 };
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
     // needed to get candidate probs even for temp <= 0.0
     params.sparams.n_probs = 128;
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
         return 1;
     }
-    gpt_init();
+    if (params.n_predict < -1) {
+        LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
+        return 1;
+    }
+    common_init();
     if (params.model_draft.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
@@ -66,7 +71,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;
     // load the target model
-    llama_init_result llama_init_tgt = llama_init_from_gpt_params(params);
+    common_init_result llama_init_tgt = common_init_from_params(params);
     model_tgt = llama_init_tgt.model;
     ctx_tgt = llama_init_tgt.context;
@@ -78,7 +83,7 @@ int main(int argc, char ** argv) {
     }
     params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
-    llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
+    common_init_result llama_init_dft = common_init_from_params(params);
     model_dft = llama_init_dft.model;
     ctx_dft = llama_init_dft.context;
@@ -124,8 +129,8 @@ int main(int argc, char ** argv) {
             if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
                 LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
                 LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
-                        llama_token_to_piece(ctx_tgt, i).c_str(),
-                        llama_token_to_piece(ctx_dft, i).c_str());
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
                 return 1;
             }
         }
@@ -134,7 +139,7 @@ int main(int argc, char ** argv) {
     // Tokenize the prompt
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
+    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
     const int max_context_size     = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;
@@ -147,7 +152,7 @@ int main(int argc, char ** argv) {
     LOG("\n\n");
     for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
+        LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
     }
     const int n_input = inp.size();
@@ -155,9 +160,9 @@ int main(int argc, char ** argv) {
     const auto t_enc_start = ggml_time_us();
     // eval the prompt with both models
-    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
-    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input,     0,           0));
+    llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(),           1));
+    llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));
     const auto t_enc_end = ggml_time_us();
@@ -178,20 +183,18 @@ int main(int argc, char ** argv) {
     bool has_eos = false;
     // target model sampling context (reuse the llama_context's sampling instance)
-    struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
-    struct llama_sampler * softmax = llama_sampler_init_softmax();
+    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
     // draft sequence data
     std::vector<seq_draft> drafts(n_seq_dft);
     for (int s = 0; s < n_seq_dft; ++s) {
-        // allocate gpt_sampler for each draft sequence
-        drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
+        // allocate llama_sampler for each draft sequence
+        drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
     }
-    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
-    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
+    llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
+    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft);
     const auto t_dec_start = ggml_time_us();
@@ -229,9 +232,9 @@ int main(int argc, char ** argv) {
                 bool accept = false;
                 if (params.sparams.temp > 0) {
                     // stochastic verification
-                    gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
+                    common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
-                    auto & dist_tgt = *gpt_sampler_get_candidates(smpl);
+                    auto & dist_tgt = *common_sampler_get_candidates(smpl);
                     float p_tgt = 0.0f;
                     float p_dft = 0.0f;
@@ -264,11 +267,12 @@ int main(int argc, char ** argv) {
                         for (size_t i = 0; i < dist_tgt.size; i++) {
                             if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
                                 p_tgt = dist_tgt.data[i].p;
+                                break;
                             }
+                        }
+                        for (size_t i = 0; i < dist_dft.size; i++) {
                             if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
                                 p_dft = dist_dft.data[i].p;
-                            }
-                            if (p_tgt && p_dft) {
                                 break;
                             }
                         }
@@ -277,13 +281,13 @@ int main(int argc, char ** argv) {
                             s_keep = s;
                             accept = true;
                             token_id = drafts[s].tokens[i_dft];
-                            token_str = llama_token_to_piece(ctx_tgt, token_id);
-                            gpt_sampler_accept(smpl, token_id, true);
+                            token_str = common_token_to_piece(ctx_tgt, token_id);
+                            common_sampler_accept(smpl, token_id, true);
                             LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
                             break;
                         } else {
-                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
                             drafts[s].active = false;
                             // calculate residual probability
@@ -349,19 +353,19 @@ int main(int argc, char ** argv) {
                         const int idx = dist(rng);
                         token_id = dist_tgt.data[idx].id;
-                        gpt_sampler_accept(smpl, token_id, true);
-                        token_str = llama_token_to_piece(ctx_tgt, token_id);
+                        common_sampler_accept(smpl, token_id, true);
+                        token_str = common_token_to_piece(ctx_tgt, token_id);
                     }
                 } else {
                     // greedy verification
                     // sample from the target model
                     LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-                    token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
+                    token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
-                    gpt_sampler_accept(smpl, token_id, true);
+                    common_sampler_accept(smpl, token_id, true);
-                    token_str = llama_token_to_piece(ctx_tgt, token_id);
+                    token_str = common_token_to_piece(ctx_tgt, token_id);
                     for (int s = 0; s < n_seq_dft; ++s) {
                         if (!drafts[s].active) {
@@ -431,8 +435,8 @@ int main(int argc, char ** argv) {
             drafts[0].dists.push_back(std::vector<llama_token_data>());
             drafts[0].i_batch_tgt.push_back(0);
-            llama_batch_clear(batch_dft);
-            llama_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
+            common_batch_clear(batch_dft);
+            common_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
             llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
             // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
@@ -441,14 +445,14 @@ int main(int argc, char ** argv) {
             ++n_past_dft;
         }
-        if (n_predict > params.n_predict || has_eos) {
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
             break;
         }
         if (drafts[0].smpl) {
-            gpt_sampler_free(drafts[0].smpl);
+            common_sampler_free(drafts[0].smpl);
         }
-        drafts[0].smpl = gpt_sampler_clone(smpl);
+        drafts[0].smpl = common_sampler_clone(smpl);
         int n_seq_cur  = 1;
         int n_past_cur = n_past_dft;
@@ -461,8 +465,8 @@ int main(int argc, char ** argv) {
         drafts[0].drafting    = true;
         drafts[0].i_batch_dft = 0;
-        llama_batch_clear(batch_tgt);
-        llama_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
+        common_batch_clear(batch_tgt);
+        common_batch_add  (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
         // sample n_draft tokens from the draft model using tree-based sampling
         for (int i = 0; i < n_draft; ++i) {
@@ -477,13 +481,13 @@ int main(int argc, char ** argv) {
                     continue;
                 }
-                gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
+                common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
-                const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
+                const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
                 for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
                     LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
-                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                 }
                 std::vector<int> sa(1, s);
@@ -518,9 +522,9 @@ int main(int argc, char ** argv) {
                         drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
                         if (drafts[n_seq_cur].smpl) {
-                            gpt_sampler_free(drafts[n_seq_cur].smpl);
+                            common_sampler_free(drafts[n_seq_cur].smpl);
                         }
-                        drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);
+                        drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl);
                         sa.push_back(n_seq_cur);
@@ -536,7 +540,7 @@ int main(int argc, char ** argv) {
                     const int s = sa[is];
-                    gpt_sampler_accept(drafts[s].smpl, id, true);
+                    common_sampler_accept(drafts[s].smpl, id, true);
                     drafts[s].tokens.push_back(id);
                     // save cur_p.data into drafts[s].dists
@@ -545,12 +549,12 @@ int main(int argc, char ** argv) {
                     // add unique drafted tokens to the target batch
                     drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
-                    llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
+                    common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
                     // add the token to the batch for batched decoding with the draft model
                     drafts[s].i_batch_dft = batch_dft.n_tokens;
-                    llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
+                    common_batch_add(batch_dft, id, n_past_cur, { s }, true);
                     if (batch_tgt.n_tokens > n_draft) {
                         drafts[s].drafting = false;
@@ -617,14 +621,13 @@ int main(int argc, char ** argv) {
     LOG_INF("\n");
     LOG_INF("target:\n\n");
-    gpt_perf_print(ctx_tgt, smpl);
+    common_perf_print(ctx_tgt, smpl);
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
     for (int s = 0; s < n_seq_dft; ++s) {
-        gpt_sampler_free(drafts[s].smpl);
+        common_sampler_free(drafts[s].smpl);
     }
-    llama_sampler_free(softmax);
     llama_batch_free(batch_dft);
     llama_free(ctx_tgt);

package/src/llama.cpp/examples/tokenize/tokenize.cpp CHANGED Viewed

@@ -365,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) {
     const bool parse_special = !no_parse_special;
     std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
+    tokens = common_tokenize(model, prompt, add_bos, parse_special);
     if (printing_ids) {
         printf("[");
@@ -380,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
         } else {
             bool invalid_utf8 = false;
             printf("%6d -> '", tokens[i]);
-            write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
+            write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
             if (invalid_utf8) {
                 printf("' (utf-8 decode failure)\n");
             } else {

package/src/llama.cpp/ggml/CMakeLists.txt CHANGED Viewed

@@ -92,6 +92,7 @@ else()
 endif()
 option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
 option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
@@ -99,6 +100,9 @@ option(GGML_AVX512      "ggml: enable AVX512"           OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
 option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI"      OFF)
 option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16"      OFF)
+option(GGML_AMX_TILE    "ggml: enable AMX-TILE"         OFF)
+option(GGML_AMX_INT8    "ggml: enable AMX-INT8"         OFF)
+option(GGML_AMX_BF16    "ggml: enable AMX-BF16"         OFF)
 option(GGML_FMA         "ggml: enable FMA"              ${INS_ENB})
 if (NOT MSVC)
     option(GGML_F16C    "ggml: enable F16C"             ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@@ -113,6 +117,7 @@ endif()
 # ggml core
 set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
 # 3rd party libs / backends
 option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
@@ -123,14 +128,9 @@ option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"
 option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
 option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
-option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
 option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
-set   (GGML_CUDA_DMMV_X   "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set   (GGML_CUDA_MMV_Y     "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
-set   (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
-                                            "ggml: iters./thread per block for Q2_K/Q6_K")
 set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
@@ -138,7 +138,7 @@ option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
-option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
+option(GGML_HIP                             "ggml: use HIP"                                   OFF)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
@@ -150,6 +150,7 @@ option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
 option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
 option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
 option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
@@ -158,6 +159,7 @@ set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
+option(GGML_AMX                             "ggml: use AMX"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
@@ -214,13 +216,14 @@ include(CMakePackageConfigHelpers)
 # all public headers
 set(GGML_PUBLIC_HEADERS
     include/ggml.h
+    include/ggml-cpu.h
     include/ggml-alloc.h
     include/ggml-backend.h
     include/ggml-blas.h
     include/ggml-cann.h
     include/ggml-cuda.h
-    include/ggml.h
     include/ggml-kompute.h
+    include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
@@ -233,12 +236,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)
 if (BUILD_SHARED_LIBS)
-    install(TARGETS ggml LIBRARY)
+    install(TARGETS ggml      LIBRARY)
+    install(TARGETS ggml-base LIBRARY)
 endif()
+# FIXME: this should be done in the backend cmake files
 if (GGML_METAL)
+    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
     install(
-        FILES src/ggml-metal.metal
+        FILES src/ggml-metal/ggml-metal.metal
         PERMISSIONS
             OWNER_READ
             OWNER_WRITE

package/src/llama.cpp/ggml/include/ggml-amx.h ADDED Viewed

@@ -0,0 +1,25 @@
+#pragma once
+#include "ggml.h"
+#include "ggml-backend.h"
+#ifdef  __cplusplus
+extern "C" {
+#endif
+// buffer_type API
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
+GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
+#ifdef  __cplusplus
+}
+#endif

package/src/llama.cpp/ggml/include/ggml-backend.h CHANGED Viewed

@@ -3,6 +3,20 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -72,7 +86,7 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-    // "offset" refers to the offset of the tensor data for setting/getting data
+    // "offset" refers to the offset in tensor->data for setting/getting data
     GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
@@ -114,11 +128,12 @@ extern "C" {
     //
     enum ggml_backend_dev_type {
+        // CPU device using system memory
         GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
-        // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
-        GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
-        GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
     // functionality supported by the device
@@ -127,6 +142,8 @@ extern "C" {
         bool async;
         // pinned host buffer
         bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
         // event synchronization
         bool events;
     };
@@ -165,9 +182,14 @@ extern "C" {
     GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
     GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
-    // Functions that may be obtained using ggml_backend_reg_get_proc_address
-    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
     //
     // Backend registry
@@ -189,7 +211,7 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
     GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
-    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
     GGML_API ggml_backend_t ggml_backend_init_best(void);
     //
@@ -220,14 +242,20 @@ extern "C" {
         ggml_backend_sched_reserve(sched, reserve_graph);
         // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }
         // if there are graph inputs:
-        ggml_backend_sched_reset(sched);
-        ggml_backend_sched_alloc_graph(sched, graph);
-        ggml_backend_tensor_set(input_tensor, ...);
-        ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via ggml_backend_alloc_ctx_tensors
     }
     */
@@ -242,7 +270,7 @@ extern "C" {
     //
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-    // Initialize a backend scheduler
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
@@ -267,7 +295,9 @@ extern "C" {
     GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-    // Reset all assignments and allocators - must be called before changing the node backends
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
     GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
     // Set a callback to be called for each resulting node during graph compute
@@ -297,27 +327,10 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
-    //
-    // CPU backend
-    //
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-    GGML_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-    // Create a backend buffer from an existing pointer
+    // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-    GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
 #ifdef  __cplusplus
 }
 #endif

package/src/llama.cpp/ggml/include/ggml-blas.h CHANGED Viewed

@@ -9,13 +9,15 @@ extern "C" {
 #endif
 // backend API
-GGML_API ggml_backend_t ggml_backend_blas_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
-GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
 #ifdef  __cplusplus