@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/examples/retrieval/retrieval.cpp
@@ -0,0 +1,350 @@
+ #include "common.h"
+ #include "llama.h"
+
+ #include <algorithm>
+ #include <fstream>
+
+ struct retrieval_params {
+     std::vector<std::string> context_files; // context files to embed
+     int32_t chunk_size = 64; // chunk size for context embedding
+     std::string chunk_separator = "\n"; // chunk separator for context embedding
+ };
+
+ static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
+     gpt_print_usage(argc, argv, gpt_params);
+     printf("retrieval options:\n");
+     printf("  --context-file FNAME  file containing context to embed.\n");
+     printf("                        specify multiple files by providing --context-file option multiple times.\n");
+     printf("  --chunk-size N        minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
+     printf("  --chunk-separator STRING\n");
+     printf("                        string to separate chunks (default: \"\\n\")\n");
+     printf("\n");
+ }
+
+ static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
+     int i = 1;
+     std::string arg;
+     while (i < argc) {
+         arg = argv[i];
+         bool invalid_gpt_param = false;
+         if (gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
+             if (invalid_gpt_param) {
+                 fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             // option was parsed by gpt_params_find_arg
+         } else if (arg == "--context-file") {
+             if (++i >= argc) {
+                 fprintf(stderr, "error: missing argument for --context-file\n");
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             std::ifstream file(argv[i]);
+             if (!file) {
+                 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             // store the external file name in params
+             retrieval_params.context_files.push_back(argv[i]);
+         } else if (arg == "--chunk-size") {
+             if (++i >= argc) {
+                 fprintf(stderr, "error: missing argument for --chunk-size\n");
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             retrieval_params.chunk_size = std::stoi(argv[i]);
+         } else if (arg == "--chunk-separator") {
+             if (++i >= argc) {
+                 fprintf(stderr, "error: missing argument for --chunk-separator\n");
+                 retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+                 exit(1);
+             }
+             retrieval_params.chunk_separator = argv[i];
+         } else {
+             // unknown argument
+             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+             retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+             exit(1);
+         }
+         i++;
+     }
+ }
+
+ struct chunk {
+     // filename
+     std::string filename;
+     // original file position
+     size_t filepos;
+     // original text data
+     std::string textdata = "";
+     // tokenized text data
+     std::vector<llama_token> tokens;
+     // embedding
+     std::vector<float> embedding;
+ };
+
+ // chunk file data to chunks of size >= chunk_size
+ // chunk_separator is the separator between chunks
+ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_size, const std::string & chunk_separator) {
+     std::vector<chunk> chunks;
+     std::ifstream f(filename.c_str());
+
+     if (!f.is_open()) {
+         fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
+         return chunks;
+     }
+
+     chunk current_chunk;
+     char buffer[1024];
+     int64_t filepos = 0;
+     std::string current = "";
+     while (f.read(buffer, 1024)) {
+         current += std::string(buffer, f.gcount());
+         size_t pos;
+         while ((pos = current.find(chunk_separator)) != std::string::npos) {
+             current_chunk.textdata += current.substr(0, pos + chunk_separator.size());
+             if ((int) current_chunk.textdata.size() > chunk_size) {
+                 // save chunk
+                 current_chunk.filepos = filepos;
+                 current_chunk.filename = filename;
+                 chunks.push_back(current_chunk);
+                 // update filepos
+                 filepos += (int) current_chunk.textdata.size();
+                 // reset current_chunk
+                 current_chunk = chunk();
+             }
+             current = current.substr(pos + chunk_separator.size());
+         }
+
+     }
+     // add leftover data to last chunk
+     if (current_chunk.textdata.size() > 0) {
+         if (chunks.empty()) {
+             current_chunk.filepos = filepos;
+             current_chunk.filename = filename;
+             chunks.push_back(current_chunk);
+         } else {
+             chunks.back().textdata += current_chunk.textdata;
+         }
+     }
+     f.close();
+     return chunks;
+ }
+
+ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+     for (size_t i = 0; i < tokens.size(); i++) {
+         llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+     }
+ }
+
+ static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+     // clear previous kv_cache values (irrelevant for embeddings)
+     llama_kv_cache_clear(ctx);
+
+     // run model
+     fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+     if (llama_decode(ctx, batch) < 0) {
+         fprintf(stderr, "%s : failed to decode\n", __func__);
+     }
+
+     for (int i = 0; i < batch.n_tokens; i++) {
+         if (!batch.logits[i]) {
+             continue;
+         }
+
+         // try to get sequence embeddings - supported only when pooling_type is not NONE
+         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+         if (embd == NULL) {
+             embd = llama_get_embeddings_ith(ctx, i);
+             if (embd == NULL) {
+                 fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+                 continue;
+             }
+         }
+
+         float * out = output + batch.seq_id[i][0] * n_embd;
+         llama_embd_normalize(embd, out, n_embd);
+     }
+ }
+
+ int main(int argc, char ** argv) {
+     gpt_params params;
+     retrieval_params retrieval_params;
+
+     retrieval_params_parse(argc, argv, params, retrieval_params);
+
+     // For BERT models, batch size must be equal to ubatch size
+     params.n_ubatch = params.n_batch;
+
+     if (retrieval_params.chunk_size <= 0) {
+         fprintf(stderr, "chunk_size must be positive\n");
+         return 1;
+     }
+     if (retrieval_params.context_files.empty()) {
+         fprintf(stderr, "context_files must be specified\n");
+         return 1;
+     }
+     params.embedding = true;
+
+     print_build_info();
+
+     printf("processing files:\n");
+     for (auto & context_file : retrieval_params.context_files) {
+         printf("%s\n", context_file.c_str());
+     }
+
+     std::vector<chunk> chunks;
+     for (auto & context_file : retrieval_params.context_files) {
+         std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
+         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
+     }
+     printf("Number of chunks: %ld\n", chunks.size());
+
+     llama_backend_init();
+     llama_numa_init(params.numa);
+
+     llama_model * model;
+     llama_context * ctx;
+
+     // load the model
+     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+     if (model == NULL) {
+         fprintf(stderr, "%s: error: unable to load model\n", __func__);
+         return 1;
+     }
+
+     const int n_ctx_train = llama_n_ctx_train(model);
+     const int n_ctx = llama_n_ctx(ctx);
+
+     if (n_ctx > n_ctx_train) {
+         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                 __func__, n_ctx_train, n_ctx);
+     }
+
+     // print system information
+     {
+         fprintf(stderr, "\n");
+         fprintf(stderr, "%s\n", get_system_info(params).c_str());
+     }
+
+     // max batch size
+     const uint64_t n_batch = params.n_batch;
+     GGML_ASSERT(params.n_batch >= params.n_ctx);
+
+     // tokenize the prompts and trim
+     for (auto & chunk : chunks) {
+         auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
+         if (inp.size() > n_batch) {
+             fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+                     __func__, (long long int) inp.size(), (long long int) n_batch);
+             return 1;
+         }
+         // add eos if not present
+         if (inp.empty() || inp.back() != llama_token_eos(model)) {
+             inp.push_back(llama_token_eos(model));
+         }
+         chunk.tokens = inp;
+     }
+
+     // tokenization stats
+     if (params.verbose_prompt) {
+         for (int i = 0; i < (int) chunks.size(); i++) {
+             fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+             fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
+                 fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+             }
+             fprintf(stderr, "\n\n");
+         }
+     }
+
+     // initialize batch
+     const int n_chunks = chunks.size();
+     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
+     // allocate output
+     const int n_embd = llama_n_embd(model);
+     std::vector<float> embeddings(n_chunks * n_embd, 0);
+     float * emb = embeddings.data();
+
+     // break into batches
+     int p = 0; // number of prompts processed already
+     int s = 0; // number of prompts in current batch
+     for (int k = 0; k < n_chunks; k++) {
+         // clamp to n_batch tokens
+         auto & inp = chunks[k].tokens;
+
+         const uint64_t n_toks = inp.size();
+
+         // encode if at capacity
+         if (batch.n_tokens + n_toks > n_batch) {
+             float * out = emb + p * n_embd;
+             batch_decode(ctx, batch, out, s, n_embd);
+             llama_batch_clear(batch);
+             p += s;
+             s = 0;
+         }
+
+         // add to batch
+         batch_add_seq(batch, inp, s);
+         s += 1;
+     }
+
+     // final batch
+     float * out = emb + p * n_embd;
+     batch_decode(ctx, batch, out, s, n_embd);
+
+     // save embeddings to chunks
+     for (int i = 0; i < n_chunks; i++) {
+         chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
+         // clear tokens as they are no longer needed
+         chunks[i].tokens.clear();
+     }
+
+     // start loop, receive query and return top k similar chunks based on cosine similarity
+     std::string query;
+     while (true) {
+         printf("Enter query: ");
+         std::getline(std::cin, query);
+         std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
+
+         struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+         batch_add_seq(query_batch, query_tokens, 0);
+
+         std::vector<float> query_emb(n_embd, 0);
+         batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
+
+         llama_batch_clear(query_batch);
+
+         // compute cosine similarities
+         {
+             std::vector<std::pair<int, float>> similarities;
+             for (int i = 0; i < n_chunks; i++) {
+                 float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+                 similarities.push_back(std::make_pair(i, sim));
+             }
+
+             // sort similarities
+             std::sort(similarities.begin(), similarities.end(), [](const std::pair<int, float> & a, const std::pair<int, float> & b) {
+                 return a.second > b.second;
+             });
+
+             printf("Top %d similar chunks:\n", params.sparams.top_k);
+             for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
+                 printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                 printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                 printf("similarity: %f\n", similarities[i].second);
+                 printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                 printf("--------------------\n");
+             }
+         }
+     }
+
+     // clean up
+     llama_print_timings(ctx);
+     llama_free(ctx);
+     llama_free_model(model);
+     llama_backend_free();
+ }
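
For context, the ranking step above delegates to llama_embd_similarity_cos from common.h. Below is a minimal standalone sketch of the same cosine-similarity ranking over toy data, with no llama.cpp dependency; the helper name cosine_sim and the toy embeddings are ours, not part of the package.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // cosine similarity of two vectors; this reduces to a plain dot product
    // when both inputs are unit-normalized, as llama_embd_normalize produces
    static float cosine_sim(const std::vector<float> & a, const std::vector<float> & b) {
        float dot = 0.0f, na = 0.0f, nb = 0.0f;
        for (size_t i = 0; i < a.size(); i++) {
            dot += a[i] * b[i];
            na  += a[i] * a[i];
            nb  += b[i] * b[i];
        }
        return dot / (std::sqrt(na) * std::sqrt(nb));
    }

    int main() {
        // toy chunk embeddings and a query embedding (3 dimensions for brevity)
        std::vector<std::vector<float>> chunk_embs = {{1, 0, 0}, {0.7f, 0.7f, 0}, {0, 1, 0}};
        std::vector<float> query_emb = {1, 0.1f, 0};

        std::vector<std::pair<int, float>> sims;
        for (int i = 0; i < (int) chunk_embs.size(); i++) {
            sims.push_back({i, cosine_sim(chunk_embs[i], query_emb)});
        }
        // rank by descending similarity, as the example's query loop does
        std::sort(sims.begin(), sims.end(),
                  [](const std::pair<int, float> & a, const std::pair<int, float> & b) {
                      return a.second > b.second;
                  });
        for (const auto & s : sims) {
            printf("chunk %d: similarity %f\n", s.first, s.second);
        }
        return 0;
    }
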
package/src/llama.cpp/examples/save-load-state/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET save-load-state)
+ add_executable(${TARGET} save-load-state.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
package/src/llama.cpp/examples/save-load-state/save-load-state.cpp
@@ -0,0 +1,246 @@
+ #include "common.h"
+ #include "llama.h"
+
+ #include <vector>
+ #include <cstdio>
+ #include <chrono>
+
+ int main(int argc, char ** argv) {
+     gpt_params params;
+
+     params.prompt = "The quick brown fox";
+
+     if (!gpt_params_parse(argc, argv, params)) {
+         return 1;
+     }
+
+     print_build_info();
+
+     if (params.n_predict < 0) {
+         params.n_predict = 16;
+     }
+
+     auto n_past = 0;
+
+     std::string result0;
+     std::string result1;
+     std::string result2;
+
+     // init
+     llama_model * model;
+     llama_context * ctx;
+
+     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+     if (model == nullptr || ctx == nullptr) {
+         fprintf(stderr, "%s : failed to init\n", __func__);
+         return 1;
+     }
+
+     // tokenize prompt
+     auto tokens = llama_tokenize(ctx, params.prompt, true);
+
+     // evaluate prompt
+     llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
+     n_past += tokens.size();
+
+     // save state (rng, logits, embedding and kv_cache) to file
+     {
+         std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
+         const size_t written = llama_state_get_data(ctx, state_mem.data());
+
+         FILE * fp_write = fopen("dump_state.bin", "wb");
+         fwrite(state_mem.data(), 1, written, fp_write);
+         fclose(fp_write);
+
+         fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size());
+     }
+
+     // save state (last tokens)
+     const auto n_past_saved = n_past;
+
+     // first run
+     printf("\nfirst run: %s", params.prompt.c_str());
+
+     for (auto i = 0; i < params.n_predict; i++) {
+         auto * logits = llama_get_logits(ctx);
+         auto n_vocab = llama_n_vocab(model);
+
+         std::vector<llama_token_data> candidates;
+         candidates.reserve(n_vocab);
+         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+             candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+         }
+         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+         auto next_token = llama_sample_token(ctx, &candidates_p);
+         auto next_token_str = llama_token_to_piece(ctx, next_token);
+
+         printf("%s", next_token_str.c_str());
+         result0 += next_token_str;
+
+         if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+             llama_free(ctx);
+             llama_free_model(model);
+             return 1;
+         }
+         n_past += 1;
+     }
+
+     printf("\n\n");
+
+     // free old context
+     llama_free(ctx);
+
+     // make new context
+     auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+     printf("\nsecond run: %s", params.prompt.c_str());
+
+     // load state (rng, logits, embedding and kv_cache) from file
+     {
+         std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
+
+         FILE * fp_read = fopen("dump_state.bin", "rb");
+         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+         fclose(fp_read);
+
+         if (read != llama_state_set_data(ctx2, state_mem.data())) {
+             fprintf(stderr, "\n%s : failed to read state\n", __func__);
+             llama_free(ctx2);
+             llama_free_model(model);
+             return 1;
+         }
+
+         fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+     }
+
+     // restore state (last tokens)
+     n_past = n_past_saved;
+
+     // second run
+     for (auto i = 0; i < params.n_predict; i++) {
+         auto * logits = llama_get_logits(ctx2);
+         auto n_vocab = llama_n_vocab(model);
+         std::vector<llama_token_data> candidates;
+         candidates.reserve(n_vocab);
+         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+             candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+         }
+         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+         auto next_token = llama_sample_token(ctx2, &candidates_p);
+         auto next_token_str = llama_token_to_piece(ctx2, next_token);
+
+         printf("%s", next_token_str.c_str());
+         result1 += next_token_str;
+
+         if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+             llama_free(ctx2);
+             llama_free_model(model);
+             return 1;
+         }
+         n_past += 1;
+     }
+
+     printf("\n\n");
+
+     llama_free(ctx2);
+
+     if (result0 != result1) {
+         fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
+         return 1;
+     }
+
+     // make new context
+     auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+     printf("\nsingle seq run: %s", params.prompt.c_str());
+
+     // load state (rng, logits, embedding and kv_cache) from file
+     {
+         std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+
+         FILE * fp_read = fopen("dump_state.bin", "rb");
+         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+         fclose(fp_read);
+
+         if (read != llama_state_set_data(ctx3, state_mem.data())) {
+             fprintf(stderr, "\n%s : failed to read state\n", __func__);
+             llama_free(ctx3);
+             llama_free_model(model);
+             return 1;
+         }
+
+         fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+     }
+
+     // restore state (last tokens)
+     n_past = n_past_saved;
+
+     // save seq 0 and load into seq 1
+     {
+         // save kv of seq 0
+         std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
+         const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+         if (ncopy != seq_store.size()) {
+             fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
+             llama_free(ctx3);
+             llama_free_model(model);
+             return 1;
+         }
+         fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
+
+         // erase whole kv
+         llama_kv_cache_clear(ctx3);
+         fprintf(stderr, "%s : kv cache cleared\n", __func__);
+
+         // restore kv into seq 1
+         const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+         if (nset != seq_store.size()) {
+             fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
+             llama_free(ctx3);
+             llama_free_model(model);
+             return 1;
+         }
+         fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+     }
+
+     // third run with seq 1 instead of 0
+     for (auto i = 0; i < params.n_predict; i++) {
+         auto * logits = llama_get_logits(ctx3);
+         auto n_vocab = llama_n_vocab(model);
+         std::vector<llama_token_data> candidates;
+         candidates.reserve(n_vocab);
+         for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+             candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+         }
+         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+         auto next_token = llama_sample_token(ctx3, &candidates_p);
+         auto next_token_str = llama_token_to_piece(ctx3, next_token);
+
+         printf("%s", next_token_str.c_str());
+         result2 += next_token_str;
+
+         if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+             llama_free(ctx3);
+             llama_free_model(model);
+             return 1;
+         }
+         n_past += 1;
+     }
+
+     printf("\n");
+
+     llama_free(ctx3);
+     llama_free_model(model);
+
+     if (result0 != result2) {
+         fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
+         return 1;
+     }
+
+     fprintf(stderr, "\n%s : success\n", __func__);
+
+     return 0;
+ }
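
The save/load pattern this test exercises is always the same three-step dance: query the state size, serialize into a caller-owned buffer, deserialize into another context. A hedged sketch condensing it into two reusable helpers, using only the llama.h state API called in the file above; the helper names save_state_to_file and load_state_from_file are ours, not the package's:

    #include <cstdio>
    #include <vector>
    #include "llama.h"

    // serialize the full context state (rng, logits, embeddings, kv_cache) to a file
    static bool save_state_to_file(llama_context * ctx, const char * path) {
        std::vector<uint8_t> mem(llama_state_get_size(ctx));
        const size_t written = llama_state_get_data(ctx, mem.data());
        FILE * fp = fopen(path, "wb");
        if (!fp) { return false; }
        const bool ok = fwrite(mem.data(), 1, written, fp) == written;
        fclose(fp);
        return ok;
    }

    // restore a previously saved state into a (possibly fresh) context
    static bool load_state_from_file(llama_context * ctx, const char * path) {
        std::vector<uint8_t> mem(llama_state_get_size(ctx));
        FILE * fp = fopen(path, "rb");
        if (!fp) { return false; }
        const size_t read = fread(mem.data(), 1, mem.size(), fp);
        fclose(fp);
        // llama_state_set_data returns the number of bytes consumed,
        // which the test above checks against the bytes read
        return llama_state_set_data(ctx, mem.data()) == read;
    }
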
package/src/llama.cpp/examples/server/CMakeLists.txt
@@ -0,0 +1,40 @@
+ set(TARGET server)
+ option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+ set(TARGET_SRCS
+     server.cpp
+     utils.hpp
+     httplib.h
+ )
+ set(PUBLIC_ASSETS
+     index.html
+     index.js
+     completion.js
+     json-schema-to-grammar.mjs
+ )
+ foreach(asset ${PUBLIC_ASSETS})
+     set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
+     set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
+     list(APPEND TARGET_SRCS ${output})
+     add_custom_command(
+         DEPENDS "${input}"
+         OUTPUT "${output}"
+         COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
+     )
+ endforeach()
+ add_executable(${TARGET} ${TARGET_SRCS})
+ install(TARGETS ${TARGET} RUNTIME)
+ target_compile_definitions(${TARGET} PRIVATE
+     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+ )
+ target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+ if (LLAMA_SERVER_SSL)
+     find_package(OpenSSL REQUIRED)
+     target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+     target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
+ endif()
+ if (WIN32)
+     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+ endif()
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)
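
The foreach loop above runs each file in public/ through scripts/xxd.cmake, generating a header per asset so server.cpp can serve its web UI from memory with no files on disk. A toy illustration of what such an xxd -i style header plus a consumer looks like; the symbol names index_html and index_html_len are our assumption, not taken from the script:

    #include <cstdio>

    // bytes of a trivial public/index.html, as the generated header would carry
    unsigned char index_html[] = { 0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e }; // "<html>"
    unsigned int index_html_len = sizeof(index_html);

    int main() {
        // the server can answer requests from the embedded asset directly
        fwrite(index_html, 1, index_html_len, stdout);
        return 0;
    }
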
package/src/llama.cpp/examples/server/bench/requirements.txt
@@ -0,0 +1,2 @@
+ matplotlib
+ requests