@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/examples/batched/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET batched)
+ add_executable(${TARGET} batched.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
package/src/llama.cpp/examples/batched/batched.cpp
@@ -0,0 +1,262 @@
+ #include "common.h"
+ #include "llama.h"
+
+ #include <algorithm>
+ #include <cmath>
+ #include <cstdio>
+ #include <string>
+ #include <vector>
+
+ int main(int argc, char ** argv) {
+     gpt_params params;
+
+     if (argc == 1 || argv[1][0] == '-') {
+         printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
+         return 1 ;
+     }
+
+     // number of parallel batches
+     int n_parallel = 1;
+
+     // total length of the sequences including the prompt
+     int n_len = 32;
+
+     // number of layers to offload to the GPU
+     int n_gpu_layers = 0;
+
+     if (argc >= 2) {
+         params.model = argv[1];
+     }
+
+     if (argc >= 3) {
+         params.prompt = argv[2];
+     }
+
+     if (argc >= 4) {
+         n_parallel = std::atoi(argv[3]);
+     }
+
+     if (argc >= 5) {
+         n_len = std::atoi(argv[4]);
+     }
+
+     if (argc >= 6) {
+         n_gpu_layers = std::atoi(argv[5]);
+     }
+
+     if (params.prompt.empty()) {
+         params.prompt = "Hello my name is";
+     }
+
+     process_escapes(params.prompt);
+
+     // init LLM
+
+     llama_backend_init();
+     llama_numa_init(params.numa);
+
+     // initialize the model
+
+     llama_model_params model_params = llama_model_default_params();
+
+     model_params.n_gpu_layers = n_gpu_layers;
+
+     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+     if (model == NULL) {
+         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+         return 1;
+     }
+
+     // tokenize the prompt
+
+     std::vector<llama_token> tokens_list;
+     tokens_list = ::llama_tokenize(model, params.prompt, true);
+
+     const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
+
+     // initialize the context
+
+     llama_context_params ctx_params = llama_context_default_params();
+
+     ctx_params.seed = 1234;
+     ctx_params.n_ctx = n_kv_req;
+     ctx_params.n_batch = std::max(n_len, n_parallel);
+     ctx_params.n_seq_max = n_parallel;
+     ctx_params.n_threads = params.n_threads;
+     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+
+     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+     if (ctx == NULL) {
+         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+         return 1;
+     }
+
+     const int n_ctx = llama_n_ctx(ctx);
+
+     LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+
+     // make sure the KV cache is big enough to hold all the prompt and generated tokens
+     if (n_kv_req > n_ctx) {
+         LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+         LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+         return 1;
+     }
+
+     // print the prompt token-by-token
+
+     fprintf(stderr, "\n");
+
+     for (auto id : tokens_list) {
+         fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+     }
+
+     fflush(stderr);
+
+     // create a llama_batch
+     // we use this object to submit token data for decoding
+     llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
+
+     // evaluate the initial prompt
+     for (size_t i = 0; i < tokens_list.size(); ++i) {
+         llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+     }
+     GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
+
+     // llama_decode will output logits only for the last token of the prompt
+     batch.logits[batch.n_tokens - 1] = true;
+
+     if (llama_decode(ctx, batch) != 0) {
+         LOG_TEE("%s: llama_decode() failed\n", __func__);
+         return 1;
+     }
+
+     // assign the system KV cache to all parallel sequences
+     // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
+     for (int32_t i = 1; i < n_parallel; ++i) {
+         llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+     }
+
+     if (n_parallel > 1) {
+         LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+     }
+
+     // main loop
+
+     // we will store the parallel decoded sequences in this vector
+     std::vector<std::string> streams(n_parallel);
+
+     // remember the batch index of the last token for each parallel sequence
+     // we need this to determine which logits to sample from
+     std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
+
+     int n_cur = batch.n_tokens;
+     int n_decode = 0;
+
+     const auto t_main_start = ggml_time_us();
+
+     while (n_cur <= n_len) {
+         // prepare the next batch
+         llama_batch_clear(batch);
+
+         // sample the next token for each parallel sequence / stream
+         for (int32_t i = 0; i < n_parallel; ++i) {
+             if (i_batch[i] < 0) {
+                 // the stream has already finished
+                 continue;
+             }
+
+             auto n_vocab = llama_n_vocab(model);
+             auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
+
+             std::vector<llama_token_data> candidates;
+             candidates.reserve(n_vocab);
+
+             for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                 candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+             }
+
+             llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+             const int top_k = 40;
+             const float top_p = 0.9f;
+             const float temp = 0.4f;
+
+             llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+             llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+             llama_sample_temp (ctx, &candidates_p, temp);
+
+             const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
+
+             //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+
+             // is it an end of generation? -> mark the stream as finished
+             if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+                 i_batch[i] = -1;
+                 LOG_TEE("\n");
+                 if (n_parallel > 1) {
+                     LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                 }
+
+                 continue;
+             }
+
+             // if there is only one stream, we print immediately to stdout
+             if (n_parallel == 1) {
+                 LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+                 fflush(stdout);
+             }
+
+             streams[i] += llama_token_to_piece(ctx, new_token_id);
+
+             i_batch[i] = batch.n_tokens;
+
+             // push this new token for next evaluation
+             llama_batch_add(batch, new_token_id, n_cur, { i }, true);
+
+             n_decode += 1;
+         }
+
+         // all streams are finished
+         if (batch.n_tokens == 0) {
+             break;
+         }
+
+         n_cur += 1;
+
+         // evaluate the current batch with the transformer model
+         if (llama_decode(ctx, batch)) {
+             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+             return 1;
+         }
+     }
+
+     LOG_TEE("\n");
+
+     if (n_parallel > 1) {
+         LOG_TEE("\n");
+
+         for (int32_t i = 0; i < n_parallel; ++i) {
+             LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+         }
+     }
+
+     const auto t_main_end = ggml_time_us();
+
+     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
+
+     llama_print_timings(ctx);
+
+     fprintf(stderr, "\n");
+
+     llama_batch_free(batch);
+
+     llama_free(ctx);
+     llama_free_model(model);
+
+     llama_backend_free();
+
+     return 0;
+ }
package/src/llama.cpp/examples/batched-bench/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET batched-bench)
+ add_executable(${TARGET} batched-bench.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp
@@ -0,0 +1,261 @@
+ #include "common.h"
+ #include "llama.h"
+
+ #include <algorithm>
+ #include <cmath>
+ #include <cstdio>
+ #include <string>
+ #include <vector>
+
+ // mutates the input string
+ static std::vector<int> parse_list(char * p) {
+     std::vector<int> ret;
+
+     char * q = p;
+
+     while (*p) {
+         if (*p == ',') {
+             *p = '\0';
+             ret.push_back(std::atoi(q));
+             q = p + 1;
+         }
+
+         ++p;
+     }
+
+     ret.push_back(std::atoi(q));
+
+     return ret;
+ }
+
+ int main(int argc, char ** argv) {
+     gpt_params params;
+
+     if (argc == 1 || argv[1][0] == '-') {
+         printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+         printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
+         printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+         return 1 ;
+     }
+
+     int n_kv_max = 2048;
+     int n_batch = 2048;
+     int n_ubatch = 512;
+     int is_pp_shared = 0;
+     int n_gpu_layers = 0;
+
+     std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
+     std::vector<int> n_tg = { 128, 256, };
+     std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
+     //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
+
+     if (argc >= 2) {
+         params.model = argv[1];
+     }
+
+     if (argc >= 3) {
+         n_kv_max = std::atoi(argv[2]);
+     }
+
+     if (argc >= 4) {
+         n_batch = std::atoi(argv[3]);
+     }
+
+     if (argc >= 5) {
+         n_ubatch = std::atoi(argv[4]);
+     }
+
+     if (argc >= 6) {
+         is_pp_shared = std::atoi(argv[5]);
+     }
+
+     if (argc >= 7) {
+         n_gpu_layers = std::atoi(argv[6]);
+     }
+
+     if (argc >= 8) {
+         n_pp = parse_list(argv[7]);
+     }
+
+     if (argc >= 9) {
+         n_tg = parse_list(argv[8]);
+     }
+
+     if (argc >= 10) {
+         n_pl = parse_list(argv[9]);
+     }
+
+     // init LLM
+
+     llama_backend_init();
+     llama_numa_init(params.numa);
+
+     // initialize the model
+
+     llama_model_params model_params = llama_model_default_params();
+
+     const std::vector<float> t_split(llama_max_devices(), 0.0f);
+
+     model_params.n_gpu_layers = n_gpu_layers;
+     model_params.tensor_split = t_split.data();
+
+     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+     if (model == NULL) {
+         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+         return 1;
+     }
+
+     llama_context_params ctx_params = llama_context_default_params();
+
+     ctx_params.seed = 1234;
+     ctx_params.n_ctx = n_kv_max;
+     ctx_params.n_batch = n_batch;
+     ctx_params.n_ubatch = n_ubatch;
+
+     ctx_params.n_threads = params.n_threads;
+     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+
+     // ensure enough sequences are available
+     ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+
+     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+     if (ctx == NULL) {
+         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+         return 1;
+     }
+
+     llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
+
+     // decode in batches of ctx_params.n_batch tokens
+     auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
+         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+
+             llama_batch batch_view = {
+                 n_tokens,
+                 batch.token + i,
+                 nullptr,
+                 batch.pos + i,
+                 batch.n_seq_id + i,
+                 batch.seq_id + i,
+                 batch.logits + i,
+                 0, 0, 0, // unused
+             };
+
+             const int ret = llama_decode(ctx, batch_view);
+             if (ret != 0) {
+                 LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                 return false;
+             }
+
+             llama_synchronize(ctx);
+         }
+
+         return true;
+     };
+
+     // warm up
+     {
+         for (int i = 0; i < 16; ++i) {
+             llama_batch_add(batch, 0, i, { 0 }, false);
+         }
+
+         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+             LOG_TEE("%s: llama_decode() failed\n", __func__);
+             return 1;
+         }
+     }
+
+     LOG_TEE("\n");
+     LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+     LOG_TEE("\n");
+
+     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+     LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+
+     for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
+         for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
+             for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
+                 const int pp = n_pp[i_pp];
+                 const int tg = n_tg[i_tg];
+                 const int pl = n_pl[i_pl];
+
+                 const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
+
+                 if (n_ctx_req > n_kv_max) {
+                     continue;
+                 }
+
+                 llama_batch_clear(batch);
+
+                 for (int i = 0; i < pp; ++i) {
+                     for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
+                         llama_batch_add(batch, 0, i, { j }, false);
+                     }
+                 }
+                 batch.logits[batch.n_tokens - 1] = true;
+
+                 const auto t_pp_start = ggml_time_us();
+
+                 llama_kv_cache_clear(ctx);
+
+                 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+                     LOG_TEE("%s: llama_decode() failed\n", __func__);
+                     return 1;
+                 }
+
+                 if (is_pp_shared) {
+                     for (int32_t i = 1; i < pl; ++i) {
+                         llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+                     }
+                 }
+
+                 const auto t_pp_end = ggml_time_us();
+
+                 const auto t_tg_start = ggml_time_us();
+
+                 for (int i = 0; i < tg; ++i) {
+                     llama_batch_clear(batch);
+
+                     for (int j = 0; j < pl; ++j) {
+                         llama_batch_add(batch, 0, pp + i, { j }, true);
+                     }
+
+                     if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
+                         LOG_TEE("%s: llama_decode() failed\n", __func__);
+                         return 1;
+                     }
+                 }
+
+                 const auto t_tg_end = ggml_time_us();
+
+                 const int32_t n_kv = n_ctx_req;
+
+                 const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
+                 const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
+                 const float t = t_pp + t_tg;
+
+                 const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
+                 const float speed_tg = pl*tg / t_tg;
+                 const float speed = n_kv / t;
+
+                 LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+             }
+         }
+     }
+
+     llama_print_timings(ctx);
+
+     llama_batch_free(batch);
+
+     llama_free(ctx);
+     llama_free_model(model);
+
+     llama_backend_free();
+
+     fprintf(stderr, "\n\n");
+
+     return 0;
+ }
package/src/llama.cpp/examples/beam-search/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET beam-search)
+ add_executable(${TARGET} beam-search.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)