npm - @fugood/llama.node - Versions diffs - 0.3.2 → 0.3.3 - Mend

@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (190) hide show

package/CMakeLists.txt +2 -0
package/bin/darwin/arm64/llama-node.node +0 -0
package/bin/darwin/x64/llama-node.node +0 -0
package/bin/linux/arm64/llama-node.node +0 -0
package/bin/linux/x64/llama-node.node +0 -0
package/bin/linux-vulkan/arm64/llama-node.node +0 -0
package/bin/linux-vulkan/x64/llama-node.node +0 -0
package/bin/win32/arm64/llama-node.node +0 -0
package/bin/win32/arm64/node.lib +0 -0
package/bin/win32/x64/llama-node.node +0 -0
package/bin/win32/x64/node.lib +0 -0
package/bin/win32-vulkan/arm64/llama-node.node +0 -0
package/bin/win32-vulkan/arm64/node.lib +0 -0
package/bin/win32-vulkan/x64/llama-node.node +0 -0
package/bin/win32-vulkan/x64/node.lib +0 -0
package/package.json +1 -1
package/src/DetokenizeWorker.cpp +1 -1
package/src/EmbeddingWorker.cpp +2 -2
package/src/LlamaCompletionWorker.cpp +8 -8
package/src/LlamaCompletionWorker.h +2 -2
package/src/LlamaContext.cpp +8 -9
package/src/TokenizeWorker.cpp +1 -1
package/src/common.hpp +4 -4
package/src/llama.cpp/.github/workflows/build.yml +43 -9
package/src/llama.cpp/.github/workflows/docker.yml +3 -0
package/src/llama.cpp/CMakeLists.txt +7 -4
package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
package/src/llama.cpp/common/CMakeLists.txt +0 -2
package/src/llama.cpp/common/arg.cpp +642 -607
package/src/llama.cpp/common/arg.h +22 -22
package/src/llama.cpp/common/common.cpp +79 -281
package/src/llama.cpp/common/common.h +130 -100
package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
package/src/llama.cpp/common/log.cpp +50 -50
package/src/llama.cpp/common/log.h +18 -18
package/src/llama.cpp/common/ngram-cache.cpp +36 -36
package/src/llama.cpp/common/ngram-cache.h +19 -19
package/src/llama.cpp/common/sampling.cpp +116 -108
package/src/llama.cpp/common/sampling.h +20 -20
package/src/llama.cpp/docs/build.md +37 -17
package/src/llama.cpp/examples/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/batched/batched.cpp +14 -14
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
package/src/llama.cpp/examples/infill/infill.cpp +40 -86
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
package/src/llama.cpp/examples/llava/clip.cpp +1 -0
package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
package/src/llama.cpp/examples/llava/llava.cpp +37 -3
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
package/src/llama.cpp/examples/main/main.cpp +64 -109
package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
package/src/llama.cpp/examples/server/server.cpp +553 -691
package/src/llama.cpp/examples/server/utils.hpp +312 -25
package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
package/src/llama.cpp/examples/simple/simple.cpp +128 -96
package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
package/src/llama.cpp/ggml/include/ggml.h +53 -393
package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
package/src/llama.cpp/include/llama.h +67 -33
package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
package/src/llama.cpp/src/CMakeLists.txt +2 -1
package/src/llama.cpp/src/llama-sampling.cpp +745 -105
package/src/llama.cpp/src/llama-sampling.h +21 -2
package/src/llama.cpp/src/llama-vocab.cpp +49 -9
package/src/llama.cpp/src/llama-vocab.h +35 -11
package/src/llama.cpp/src/llama.cpp +2636 -2406
package/src/llama.cpp/src/unicode-data.cpp +2 -2
package/src/llama.cpp/tests/CMakeLists.txt +1 -2
package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
package/src/llama.cpp/tests/test-barrier.cpp +1 -0
package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
package/src/llama.cpp/tests/test-log.cpp +2 -2
package/src/llama.cpp/tests/test-opt.cpp +853 -142
package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
package/src/llama.cpp/tests/test-rope.cpp +1 -0
package/src/llama.cpp/tests/test-sampling.cpp +162 -137
package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
package/src/llama.cpp/common/train.cpp +0 -1515
package/src/llama.cpp/common/train.h +0 -233
package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
/package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
/package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
/package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0

package/src/llama.cpp/examples/main/main.cpp CHANGED Viewed

@@ -33,8 +33,8 @@
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_sampler             ** g_smpl;
-static gpt_params               * g_params;
+static common_sampler          ** g_smpl;
+static common_params            * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
@@ -62,49 +62,6 @@ static bool file_is_empty(const std::string & path) {
     return f.tellg() == 0;
 }
-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-    const std::string timestamp = string_get_sortable_timestamp();
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
-        return;
-    }
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-    fprintf(logfile, "binary: main\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-    llama_perf_dump_yaml(logfile, ctx);
-    fclose(logfile);
-}
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
@@ -114,12 +71,11 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             LOG("\n");
-            gpt_perf_print(*g_ctx, *g_smpl);
-            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+            common_perf_print(*g_ctx, *g_smpl);
             // make sure all logs are flushed
             LOG("Interrupted by user\n");
-            gpt_log_pause(gpt_log_main());
+            common_log_pause(common_log_main());
             _exit(130);
         }
@@ -127,22 +83,22 @@ static void sigint_handler(int signo) {
 }
 #endif
-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
-    llama_chat_msg new_msg{role, content};
-    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+    common_chat_msg new_msg{role, content};
+    auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
     LOG_DBG("formatted: '%s'\n", formatted.c_str());
     return formatted;
 }
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
     g_params = &params;
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
-    gpt_init();
+    common_init();
     auto & sparams = params.sparams;
@@ -187,9 +143,9 @@ int main(int argc, char ** argv) {
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    gpt_sampler * smpl = nullptr;
+    common_sampler * smpl = nullptr;
-    std::vector<llama_chat_msg> chat_msgs;
+    std::vector<common_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;
@@ -197,7 +153,7 @@ int main(int argc, char ** argv) {
     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
     model = llama_init.model;
     ctx = llama_init.context;
@@ -246,7 +202,7 @@ int main(int argc, char ** argv) {
     // print chat template example in conversation mode
     if (params.conversation) {
         if (params.enable_chat_template) {
-            LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
         } else {
             LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
         }
@@ -255,7 +211,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
         LOG_INF("\n");
     }
@@ -296,7 +252,7 @@ int main(int argc, char ** argv) {
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
             LOG_DBG("tokenize the prompt\n");
-            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+            embd_inp = common_tokenize(ctx, prompt, true, true);
         } else {
             LOG_DBG("use session tokens\n");
             embd_inp = session_tokens;
@@ -379,13 +335,13 @@ int main(int argc, char ** argv) {
         LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
         }
         if (params.n_keep > add_bos) {
             LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
             }
             LOG_CNT("'\n");
         }
@@ -415,9 +371,9 @@ int main(int argc, char ** argv) {
             for (const auto & antiprompt : params.antiprompt) {
                 LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
                 if (params.verbose_prompt) {
-                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                    auto tmp = common_tokenize(ctx, antiprompt, false, true);
                     for (int i = 0; i < (int) tmp.size(); i++) {
-                        LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                        LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                     }
                 }
             }
@@ -430,9 +386,9 @@ int main(int argc, char ** argv) {
         if (!params.input_prefix.empty()) {
             LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
@@ -440,23 +396,23 @@ int main(int argc, char ** argv) {
         if (!params.input_suffix.empty()) {
             LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
             if (params.verbose_prompt) {
-                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
     }
-    smpl = gpt_sampler_init(model, sparams);
+    smpl = common_sampler_init(model, sparams);
     if (!smpl) {
         LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         return 1;
     }
-    LOG_INF("sampler seed: %u\n",     gpt_sampler_get_seed(smpl));
+    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
     LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",    gpt_sampler_print(smpl).c_str());
+    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
     LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
@@ -521,14 +477,14 @@ int main(int argc, char ** argv) {
     antiprompt_ids.reserve(params.antiprompt.size());
     for (const std::string & antiprompt : params.antiprompt) {
-        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+        antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
     }
     if (llama_model_has_encoder(model)) {
         int enc_input_size = embd_inp.size();
         llama_token * enc_input_buf = embd_inp.data();
-        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return 1;
         }
@@ -569,30 +525,30 @@ int main(int argc, char ** argv) {
                     if (!params.ctx_shift){
                         LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
                         break;
-                    } else {
-                        if (params.n_predict == -2) {
-                            LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-                            break;
-                        }
+                    }
-                        const int n_left    = n_past - params.n_keep;
-                        const int n_discard = n_left/2;
+                    if (params.n_predict == -2) {
+                        LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                        break;
+                    }
-                        LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                                n_past, n_left, n_ctx, params.n_keep, n_discard);
+                    const int n_left    = n_past - params.n_keep;
+                    const int n_discard = n_left/2;
-                        llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                        llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+                    LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                            n_past, n_left, n_ctx, params.n_keep, n_discard);
-                        n_past -= n_discard;
+                    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
-                        LOG_DBG("after swap: n_past = %d\n", n_past);
+                    n_past -= n_discard;
-                        LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+                    LOG_DBG("after swap: n_past = %d\n", n_past);
-                        LOG_DBG("clear session path\n");
-                        path_session.clear();
-                    }
+                    LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+                    LOG_DBG("clear session path\n");
+                    path_session.clear();
                 }
             } else {
                 // context extension via Self-Extend
@@ -648,7 +604,7 @@ int main(int argc, char ** argv) {
                 LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                     LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
                 }
@@ -679,9 +635,9 @@ int main(int argc, char ** argv) {
                 LOG_DBG("saved session to %s\n", path_session.c_str());
             }
-            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);
-            gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
+            common_sampler_accept(smpl, id, /* accept_grammar= */ true);
             // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
@@ -702,7 +658,7 @@ int main(int argc, char ** argv) {
                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
+                common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -714,7 +670,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo && display) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+                const std::string token_str = common_token_to_piece(ctx, id, params.special);
                 // Console/Stream Output
                 LOG("%s", token_str.c_str());
@@ -743,7 +699,7 @@ int main(int argc, char ** argv) {
             // check for reverse prompt in the last n_prev tokens
             if (!params.antiprompt.empty()) {
                 const int n_prev = 32;
-                const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
+                const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
                 is_antiprompt = false;
                 // Check if each of the reverse prompts appears at the end of the output.
@@ -765,7 +721,7 @@ int main(int argc, char ** argv) {
                 }
                 // check for reverse prompt using special tokens
-                llama_token last_token = gpt_sampler_last(smpl);
+                llama_token last_token = common_sampler_last(smpl);
                 for (std::vector<llama_token> ids : antiprompt_ids) {
                     if (ids.size() == 1 && last_token == ids[0]) {
                         if (params.interactive) {
@@ -782,13 +738,13 @@ int main(int argc, char ** argv) {
             }
             // deal with end of generation tokens in interactive mode
-            if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+            if (llama_token_is_eog(model, common_sampler_last(smpl))) {
                 LOG_DBG("found an EOG token\n");
                 if (params.interactive) {
                     if (!params.antiprompt.empty()) {
                         // tokenize and inject first reverse prompt
-                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+                        const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
                         embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                         is_antiprompt = true;
                     }
@@ -803,8 +759,8 @@ int main(int argc, char ** argv) {
             // if current token is not EOG, we add it to current assistant message
             if (params.conversation) {
-                const auto id = gpt_sampler_last(smpl);
-                assistant_ss << llama_token_to_piece(ctx, id, false);
+                const auto id = common_sampler_last(smpl);
+                assistant_ss << common_token_to_piece(ctx, id, false);
             }
             if (n_past > 0 && is_interacting) {
@@ -862,9 +818,9 @@ int main(int argc, char ** argv) {
                         ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                         : std::move(buffer);
                     // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
-                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, format_chat);
-                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                    const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
+                    const auto line_inp = common_tokenize(ctx, user_inp,            false, format_chat);
+                    const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
                     LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
@@ -882,7 +838,7 @@ int main(int argc, char ** argv) {
                     for (size_t i = original_size; i < embd_inp.size(); ++i) {
                         const llama_token token = embd_inp[i];
                         output_tokens.push_back(token);
-                        output_ss << llama_token_to_piece(ctx, token);
+                        output_ss << common_token_to_piece(ctx, token);
                     }
                     // reset assistant message
@@ -899,7 +855,7 @@ int main(int argc, char ** argv) {
             if (n_past > 0) {
                 if (is_interacting) {
-                    gpt_sampler_reset(smpl);
+                    common_sampler_reset(smpl);
                 }
                 is_interacting = false;
             }
@@ -925,10 +881,9 @@ int main(int argc, char ** argv) {
     }
     LOG("\n\n");
-    gpt_perf_print(ctx, smpl);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    common_perf_print(ctx, smpl);
-    gpt_sampler_free(smpl);
+    common_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);

package/src/llama.cpp/examples/parallel/parallel.cpp CHANGED Viewed

@@ -54,7 +54,7 @@ static std::vector<std::string> k_prompts = {
 struct client {
     ~client() {
         if (smpl) {
-            gpt_sampler_free(smpl);
+            common_sampler_free(smpl);
         }
     }
@@ -75,7 +75,7 @@ struct client {
     std::string prompt;
     std::string response;
-    struct gpt_sampler * smpl = nullptr;
+    struct common_sampler * smpl = nullptr;
 };
 static void print_date_time() {
@@ -103,13 +103,13 @@ static std::vector<std::string> split_string(const std::string& input, char deli
 int main(int argc, char ** argv) {
     srand(1234);
-    gpt_params params;
+    common_params params;
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
     }
-    gpt_init();
+    common_init();
     // number of simultaneous "clients" to simulate
     const int32_t n_clients = params.n_parallel;
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
     // load the target model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -160,11 +160,11 @@ int main(int argc, char ** argv) {
     for (size_t i = 0; i < clients.size(); ++i) {
         auto & client = clients[i];
         client.id = i;
-        client.smpl = gpt_sampler_init(model, params.sparams);
+        client.smpl = common_sampler_init(model, params.sparams);
     }
     std::vector<llama_token> tokens_system;
-    tokens_system = ::llama_tokenize(ctx, k_system, true);
+    tokens_system = common_tokenize(ctx, k_system, true);
     const int32_t n_tokens_system = tokens_system.size();
     llama_seq_id g_seq_id = 0;
@@ -189,7 +189,7 @@ int main(int argc, char ** argv) {
         LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
         for (int32_t i = 0; i < n_tokens_system; ++i) {
-            llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
+            common_batch_add(batch, tokens_system[i], i, { 0 }, false);
         }
         if (llama_decode(ctx, batch) != 0) {
@@ -210,10 +210,10 @@ int main(int argc, char ** argv) {
     while (true) {
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-            llama_kv_cache_dump_view_seqs(kvc_view, 40);
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
         }
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
         // decode any currently ongoing sequences
         for (auto & client : clients) {
@@ -223,7 +223,7 @@ int main(int argc, char ** argv) {
             client.i_batch = batch.n_tokens;
-            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
             client.n_decoded += 1;
         }
@@ -252,14 +252,14 @@ int main(int argc, char ** argv) {
                     client.prompt   = client.input + "\nAssistant:";
                     client.response = "";
-                    gpt_sampler_reset(client.smpl);
+                    common_sampler_reset(client.smpl);
                     // do not prepend BOS because we have a system prompt!
                     std::vector<llama_token> tokens_prompt;
-                    tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+                    tokens_prompt = common_tokenize(ctx, client.prompt, false);
                     for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+                        common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
                     }
                     // extract the logits only for the last token
@@ -308,7 +308,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };
             const int ret = llama_decode(ctx, batch_view);
@@ -340,9 +339,9 @@ int main(int argc, char ** argv) {
                 //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                 //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
-                const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
+                const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
-                gpt_sampler_accept(client.smpl, id, true);
+                common_sampler_accept(client.smpl, id, true);
                 if (client.n_decoded == 1) {
                     // start measuring generation time after the first token to make sure all concurrent clients
@@ -350,7 +349,7 @@ int main(int argc, char ** argv) {
                     client.t_start_gen = ggml_time_us();
                 }
-                const std::string token_str = llama_token_to_piece(ctx, id);
+                const std::string token_str = common_token_to_piece(ctx, id);
                 client.response += token_str;
                 client.sampled = id;

package/src/llama.cpp/examples/passkey/passkey.cpp CHANGED Viewed

@@ -15,17 +15,17 @@ static void print_usage(int, char ** argv) {
 }
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
     params.n_junk = 250;
     params.n_keep = 32;
     params.i_pos  = -1;
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
         return 1;
     }
-    gpt_init();
+    common_init();
     int n_junk = params.n_junk;
     int n_keep = params.n_keep;
@@ -61,7 +61,7 @@ int main(int argc, char ** argv) {
     // initialize the model
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = common_model_params_to_llama(params);
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -72,7 +72,7 @@ int main(int argc, char ** argv) {
     // initialize the context
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = common_context_params_to_llama(params);
     ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
@@ -92,10 +92,10 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    tokens_list = common_tokenize(ctx, params.prompt, true);
     // tokenize the prefix and use it as a sink
-    const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+    const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size();
     const int n_tokens_all = tokens_list.size();
@@ -137,10 +137,10 @@ int main(int argc, char ** argv) {
             n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
         }
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
         for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-            llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+            common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
         }
         if (i + n_batch >= n_tokens_all) {
@@ -171,10 +171,10 @@ int main(int argc, char ** argv) {
         n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
         for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-            llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+            common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
         }
         if (i + n_batch >= n_tokens_all) {
@@ -229,15 +229,15 @@ int main(int argc, char ** argv) {
                 break;
             }
-            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
             n_decode += 1;
             // prepare the next batch
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
             // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_past++, { 0 }, true);
+            common_batch_add(batch, new_token_id, n_past++, { 0 }, true);
         }
         n_cur += 1;