@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
@@ -0,0 +1,6 @@
# Python dependencies for the llama.cpp server behave-based test suite.
# Versions are pinned with compatible-release (~=) specifiers.
aiohttp~=3.9.3
behave~=1.2.6
huggingface_hub~=0.20.3
numpy~=1.24.4
openai~=0.25.0
prometheus-client~=0.20.0
@@ -0,0 +1,653 @@
#pragma once

#include "llama.h"
#include "common.h"

#include "json.hpp"

#include <cctype>
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <string>
#include <vector>
#include <sstream>
#include <random>
#include <thread>
12
+
13
+ #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
14
+
15
+ using json = nlohmann::ordered_json;
16
+
// Error categories used by the server's OAI-compatible error payloads.
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
    ERROR_TYPE_INVALID_REQUEST,   // maps to HTTP 400
    ERROR_TYPE_AUTHENTICATION,    // maps to HTTP 401
    ERROR_TYPE_SERVER,            // maps to HTTP 500
    ERROR_TYPE_NOT_FOUND,         // maps to HTTP 404
    ERROR_TYPE_PERMISSION,        // maps to HTTP 403
    ERROR_TYPE_UNAVAILABLE, // custom error, maps to HTTP 503
    ERROR_TYPE_NOT_SUPPORTED, // custom error, maps to HTTP 501
};
27
+
// Runtime logging switches; defined in the server translation unit.
extern bool server_verbose;   // when true, LOG_VERBOSE messages are emitted
extern bool server_log_json;  // when true, server_log prints JSON lines

// Verbose logging can be compiled out entirely by defining SERVER_VERBOSE=0.
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif

#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
// Verbose log line; only emitted when the runtime flag server_verbose is set.
#define LOG_VERBOSE(MSG, ...)                                            \
    do                                                                   \
    {                                                                    \
        if (server_verbose)                                              \
        {                                                                \
            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
        }                                                                \
    } while (0)
#endif

// Level-tagged logging helpers; the variadic argument is a json object of
// extra key/value fields (see server_log below).
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

// Forward declaration so json_value (below) can log type mismatches.
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
53
+
54
+ template <typename T>
55
+ static T json_value(const json &body, const std::string &key, const T &default_value) {
56
+ // Fallback null to default value
57
+ if (body.contains(key) && !body.at(key).is_null()){
58
+ try {
59
+ return body.value(key, default_value);
60
+ }
61
+ catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
62
+ std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
63
+ server_log("WARN", __func__, __LINE__, message.c_str(), body);
64
+ return default_value;
65
+ }
66
+ } else {
67
+ return default_value;
68
+ }
69
+ }
70
+
// Emit one log record to stdout and flush. With server_log_json the record is
// a single JSON line; otherwise a fixed-width text header is printed followed
// by the remaining fields as " key=value" pairs. `extra` supplies additional
// key/value pairs merged into the record.
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
    // Thread id is only obtainable as a stream insertion, hence the stringstream.
    std::stringstream ss_tid;
    ss_tid << std::this_thread::get_id();
    // Fields common to both output formats.
    json log = nlohmann::ordered_json{
        {"tid", ss_tid.str()},
        {"timestamp", time(nullptr)},
    };

    if (server_log_json) {
        log.merge_patch( {
            {"level", level},
            {"function", function},
            {"line", line},
            {"msg", message},
        });

        if (!extra.empty()) {
            log.merge_patch(extra);
        }

        // dump(-1, ...) keeps it on one line; invalid UTF-8 is replaced.
        printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
    } else {
        char buf[1024];
        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);

        if (!extra.empty()) {
            log.merge_patch(extra);
        }
        std::stringstream ss;
        ss << buf << " |";
        // Append every record field (tid, timestamp, extras) after the header.
        for (const auto& el : log.items())
        {
            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
            ss << " " << el.key() << "=" << value;
        }

        const std::string str = ss.str();
        printf("%.*s\n", (int)str.size(), str.data());
    }
    // Flush so log lines appear promptly when stdout is piped.
    fflush(stdout);
}
112
+
113
+ //
114
+ // chat template utils
115
+ //
116
+
117
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
118
+ inline bool verify_custom_template(const std::string & tmpl) {
119
+ llama_chat_message chat[] = {{"user", "test"}};
120
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
121
+ return res >= 0;
122
+ }
123
+
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
    size_t alloc_size = 0;
    // vector holding all allocated string to be passed to llama_chat_apply_template
    // (llama_chat_message stores raw const char* pointers, so the std::strings
    // must outlive the API call)
    std::vector<std::string> str(messages.size() * 2);
    std::vector<llama_chat_message> chat(messages.size());

    for (size_t i = 0; i < messages.size(); ++i) {
        const auto & curr_msg = messages[i];
        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
        // NOTE(review): only content lengths are summed; the *2 factor below is
        // what leaves headroom for roles and template markup
        alloc_size += str[i*2 + 1].length();
        chat[i].role = str[i*2 + 0].c_str();
        chat[i].content = str[i*2 + 1].c_str();
    }

    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
    std::vector<char> buf(alloc_size * 2);

    // run the first time to get the total output length
    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());

    // if it turns out that our buffer is too small, we resize it
    // NOTE(review): a negative res (unsupported template) would wrap to a huge
    // size_t here and make resize() throw — presumably the template is
    // validated beforehand via verify_custom_template; confirm at call sites.
    if ((size_t) res > buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    }

    const std::string formatted_chat(buf.data(), res);

    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});

    return formatted_chat;
}
158
+
159
+ //
160
+ // base64 utils (TODO: move to common in the future)
161
+ //
162
+
// Alphabet used to map base64 characters back to their 6-bit values.
static const std::string base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

// True when c belongs to the base64 alphabet (padding '=' excluded).
static inline bool is_base64(uint8_t c) {
    return (isalnum(c) || (c == '+') || (c == '/'));
}

// Decode a base64 string into raw bytes. Decoding stops at the first padding
// character ('=') or any character outside the base64 alphabet; a trailing
// partial group (input length not a multiple of 4) is flushed at the end.
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
    uint8_t quad[4];   // buffered 6-bit input values
    uint8_t triple[3]; // reconstructed output bytes

    std::vector<uint8_t> decoded;

    int filled = 0;  // how many values are currently buffered in quad
    int pos = 0;     // read cursor into encoded_string
    int remaining = encoded_string.size();

    while (remaining-- && (encoded_string[pos] != '=') && is_base64(encoded_string[pos])) {
        quad[filled++] = encoded_string[pos];
        pos++;
        if (filled == 4) {
            // Translate characters to their alphabet indices.
            for (int k = 0; k < 4; k++) {
                quad[k] = base64_chars.find(quad[k]);
            }

            // Repack four 6-bit values into three bytes.
            triple[0] = ( quad[0]        << 2) + ((quad[1] & 0x30) >> 4);
            triple[1] = ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2);
            triple[2] = ((quad[2] & 0x3) << 6) +   quad[3];

            for (int k = 0; k < 3; k++) {
                decoded.push_back(triple[k]);
            }

            filled = 0;
        }
    }

    // Flush the trailing partial group, if any: `filled` buffered values
    // decode to `filled - 1` bytes.
    if (filled) {
        for (int k = filled; k < 4; k++) {
            quad[k] = 0;
        }

        for (int k = 0; k < 4; k++) {
            quad[k] = base64_chars.find(quad[k]);
        }

        triple[0] = ( quad[0]        << 2) + ((quad[1] & 0x30) >> 4);
        triple[1] = ((quad[1] & 0xf) << 4) + ((quad[2] & 0x3c) >> 2);
        triple[2] = ((quad[2] & 0x3) << 6) +   quad[3];

        for (int k = 0; k < filled - 1; k++) {
            decoded.push_back(triple[k]);
        }
    }

    return decoded;
}
223
+
224
+ //
225
+ // random string / id
226
+ //
227
+
// Produce a 32-character alphanumeric identifier. A fresh std::random_device
// seeds a Mersenne Twister on every call, so results are non-deterministic.
static std::string random_string() {
    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

    std::random_device seed_source;
    std::mt19937 engine(seed_source());

    std::string result(32, ' ');
    for (char & c : result) {
        c = alphabet[engine() % alphabet.size()];
    }

    return result;
}
242
+
243
+ static std::string gen_chatcmplid() {
244
+ std::stringstream chatcmplid;
245
+ chatcmplid << "chatcmpl-" << random_string();
246
+
247
+ return chatcmplid.str();
248
+ }
249
+
250
+ //
251
+ // other common utils
252
+ //
253
+
254
+ static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
255
+ size_t i;
256
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
257
+
258
+ return i;
259
+ }
260
+
// True when `str` ends with `suffix`; an empty suffix always matches.
static bool ends_with(const std::string & str, const std::string & suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}

// If `text` ends with a (possibly partial) prefix of `stop`, return the offset
// in `text` where that overlap begins; otherwise return std::string::npos.
// The streaming path uses this to hold back output that might still turn out
// to be a stop sequence.
static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
    if (text.empty() || stop.empty()) {
        return std::string::npos;
    }

    const char last = text.back();
    // Try the longest candidate prefix of `stop` first.
    for (int64_t idx = stop.size() - 1; idx >= 0; idx--) {
        if (stop[idx] != last) {
            continue;
        }
        const std::string candidate = stop.substr(0, idx + 1);
        if (ends_with(text, candidate)) {
            return text.size() - idx - 1;
        }
    }

    return std::string::npos;
}
280
+
281
+ // TODO: reuse llama_detokenize
282
+ template <class Iter>
283
+ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
284
+ std::string ret;
285
+ for (; begin != end; ++begin) {
286
+ ret += llama_token_to_piece(ctx, *begin);
287
+ }
288
+
289
+ return ret;
290
+ }
291
+
292
+ // format incomplete utf-8 multibyte character for output
293
+ static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
294
+ std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
295
+
296
+ // if the size is 1 and first bit is 1, meaning it's a partial character
297
+ // (size > 1 meaning it's already a known token)
298
+ if (out.size() == 1 && (out[0] & 0x80) == 0x80) {
299
+ std::stringstream ss;
300
+ ss << std::hex << (out[0] & 0xff);
301
+ std::string res(ss.str());
302
+ out = "byte: \\x" + res;
303
+ }
304
+
305
+ return out;
306
+ }
307
+
// One generated token plus the data needed to report it to the client.
struct completion_token_output {
    llama_token tok;           // the sampled token id
    std::string text_to_send;  // detokenized text for this token

    // A candidate token and its sampling probability.
    struct token_prob {
        llama_token tok;
        float prob;
    };

    // Top candidate probabilities recorded at the sampling step (may be empty).
    std::vector<token_prob> probs;
};
319
+
320
+ // convert a vector of completion_token_output to json
321
+ static json probs_vector_to_json(const llama_context * ctx, const std::vector<completion_token_output> & probs) {
322
+ json out = json::array();
323
+
324
+ for (const auto & prob : probs) {
325
+ json probs_for_token = json::array();
326
+
327
+ for (const auto & p : prob.probs) {
328
+ const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
329
+ probs_for_token.push_back(json {
330
+ {"tok_str", tok_str},
331
+ {"prob", p.prob},
332
+ });
333
+ }
334
+
335
+ const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
336
+ out.push_back(json {
337
+ {"content", tok_str},
338
+ {"probs", probs_for_token},
339
+ });
340
+ }
341
+
342
+ return out;
343
+ }
344
+
345
+ //
346
+ // OAI utils
347
+ //
348
+
349
+ static json oaicompat_completion_params_parse(
350
+ const struct llama_model * model,
351
+ const json & body, /* openai api json semantics */
352
+ const std::string & chat_template) {
353
+ json llama_params;
354
+
355
+ llama_params["__oaicompat"] = true;
356
+
357
+ // Map OpenAI parameters to llama.cpp parameters
358
+ //
359
+ // For parameters that are defined by the OpenAI documentation (e.g.
360
+ // temperature), we explicitly specify OpenAI's intended default; we
361
+ // need to do that because sometimes OpenAI disagrees with llama.cpp
362
+ //
363
+ // https://platform.openai.com/docs/api-reference/chat/create
364
+ llama_sampling_params default_sparams;
365
+ llama_params["model"] = json_value(body, "model", std::string("unknown"));
366
+ llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
367
+ llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
368
+ llama_params["n_predict"] = json_value(body, "max_tokens", -1);
369
+ llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
370
+ llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
371
+ llama_params["stream"] = json_value(body, "stream", false);
372
+ llama_params["temperature"] = json_value(body, "temperature", 0.0);
373
+ llama_params["top_p"] = json_value(body, "top_p", 1.0);
374
+
375
+ // Apply chat template to the list of messages
376
+ llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
377
+
378
+ // Handle "stop" field
379
+ if (body.contains("stop") && body["stop"].is_string()) {
380
+ llama_params["stop"] = json::array({body["stop"].get<std::string>()});
381
+ } else {
382
+ llama_params["stop"] = json_value(body, "stop", json::array());
383
+ }
384
+
385
+ // Handle "response_format" field
386
+ if (body.contains("response_format")) {
387
+ json response_format = json_value(body, "response_format", json::object());
388
+ std::string response_type = json_value(response_format, "type", std::string());
389
+ if (response_type == "json_object") {
390
+ llama_params["json_schema"] = json_value(response_format, "schema", json::object());
391
+ } else if (!response_type.empty() && response_type != "text") {
392
+ throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
393
+ }
394
+ }
395
+
396
+ // Handle "n" field
397
+ int n_choices = json_value(body, "n", 1);
398
+ if (n_choices != 1) {
399
+ throw std::runtime_error("Only one completion choice is allowed");
400
+ }
401
+
402
+ // Handle "logprobs" field
403
+ // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
404
+ if (body.contains("logprobs")) {
405
+ llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
406
+ } else if (body.contains("top_logprobs")) {
407
+ throw std::runtime_error("top_logprobs requires logprobs to be set to true");
408
+ }
409
+
410
+ // Params supported by OAI but unsupported by llama.cpp
411
+ static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
412
+ for (auto & param : unsupported_params) {
413
+ if (body.contains(param)) {
414
+ throw std::runtime_error("Unsupported param: " + param);
415
+ }
416
+ }
417
+
418
+ // Copy remaining properties to llama_params
419
+ // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
420
+ // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
421
+ for (const auto & item : body.items()) {
422
+ // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
423
+ if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
424
+ llama_params[item.key()] = item.value();
425
+ }
426
+ }
427
+
428
+ return llama_params;
429
+ }
430
+
// Build the final OAI-compatible chat completion response from an internal
// completion `result`. When `streaming` is true this produces the terminating
// chunk (empty delta) instead of a full message object.
static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
    bool stopped_word = result.count("stopped_word") != 0;
    bool stopped_eos = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
    int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
    std::string content = json_value(result, "content", std::string(""));

    // "length" (token limit reached) unless generation ended on a stop word
    // or an EOS token.
    std::string finish_reason = "length";
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }

    // Streaming termination uses an empty delta; non-streaming carries the
    // full assistant message.
    json choices =
        streaming ? json::array({json{{"finish_reason", finish_reason},
                                        {"index", 0},
                                        {"delta", json::object()}}})
                  : json::array({json{{"finish_reason", finish_reason},
                                        {"index", 0},
                                        {"message", json{{"content", content},
                                                         {"role", "assistant"}}}}});

    std::time_t t = std::time(0);

    json res = json {
        {"choices", choices},
        {"created", t},
        {"model",
            json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
        {"usage", json {
            {"completion_tokens", num_tokens_predicted},
            {"prompt_tokens", num_prompt_tokens},
            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
        }},
        {"id", completion_id}
    };

    // In verbose mode echo the raw internal result for debugging.
    if (server_verbose) {
        res["__verbose"] = result;
    }

    if (result.contains("completion_probabilities")) {
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

    return res;
}
478
+
479
+ // return value is vector as there is one case where we might need to generate two responses
// Build streaming OAI-compatible chunk(s) from an internal partial result.
// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
    // Results without OAI bookkeeping fields are passed through untouched.
    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({result});
    }

    // First chunk of the stream carries the assistant role delta.
    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));

    bool stopped_word = json_value(result, "stopped_word", false);
    bool stopped_eos = json_value(result, "stopped_eos", false);
    bool stopped_limit = json_value(result, "stopped_limit", false);
    std::string content = json_value(result, "content", std::string(""));

    // Empty finish_reason (serialized as null) while generation continues.
    std::string finish_reason;
    if (stopped_word || stopped_eos) {
        finish_reason = "stop";
    }
    if (stopped_limit) {
        finish_reason = "length";
    }

    std::time_t t = std::time(0);

    json choices;

    if (!finish_reason.empty()) {
        // Terminating chunk: empty delta plus the finish reason.
        choices = json::array({json{{"finish_reason", finish_reason},
                                    {"index", 0},
                                    {"delta", json::object()}}});
    } else {
        if (first) {
            if (content.empty()) {
                choices = json::array({json{{"finish_reason", nullptr},
                                            {"index", 0},
                                            {"delta", json{{"role", "assistant"}}}}});
            } else {
                // We have to send this as two updates to conform to openai behavior
                // (role-only delta first, then the content delta).
                json initial_ret = json{{"choices", json::array({json{
                                        {"finish_reason", nullptr},
                                        {"index", 0},
                                        {"delta", json{
                                            {"role", "assistant"}
                                        }}}})},
                            {"created", t},
                            {"id", completion_id},
                            {"model", modelname},
                            {"object", "chat.completion.chunk"}};

                json second_ret = json{
                            {"choices", json::array({json{{"finish_reason", nullptr},
                                                            {"index", 0},
                                                            {"delta", json{
                                                            {"content", content}}}
                                                            }})},
                            {"created", t},
                            {"id", completion_id},
                            {"model", modelname},
                            {"object", "chat.completion.chunk"}};

                return std::vector<json>({initial_ret, second_ret});
            }
        } else {
            // Some idiosyncrasy in task processing logic makes several trailing calls
            // with empty content, we ignore these at the callee site.
            if (content.empty()) {
                return std::vector<json>({json::object()});
            }

            choices = json::array({json{
                {"finish_reason", nullptr},
                {"index", 0},
                {"delta",
                json{
                    {"content", content},
                }},
            }});
        }
    }

    json ret = json {
        {"choices", choices},
        {"created", t},
        {"id", completion_id},
        {"model", modelname},
        {"object", "chat.completion.chunk"}
    };
    // The final chunk also reports token usage.
    if (!finish_reason.empty()) {
        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
        ret.push_back({"usage", json {
            {"completion_tokens", num_tokens_predicted},
            {"prompt_tokens", num_prompt_tokens},
            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
        }});
    }

    return std::vector<json>({ret});
}
578
+
579
+ static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
580
+ json data = json::array();
581
+ int i = 0;
582
+ for (auto & elem : embeddings) {
583
+ data.push_back(json{
584
+ {"embedding", json_value(elem, "embedding", json::array())},
585
+ {"index", i++},
586
+ {"object", "embedding"}
587
+ });
588
+ }
589
+
590
+ json res = json {
591
+ {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
592
+ {"object", "list"},
593
+ {"usage", json {
594
+ {"prompt_tokens", 0},
595
+ {"total_tokens", 0}
596
+ }},
597
+ {"data", data}
598
+ };
599
+
600
+ return res;
601
+ }
602
+
603
+ static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
604
+ return json {
605
+ {"tokens", tokens}
606
+ };
607
+ }
608
+
609
+ static json format_detokenized_response(const std::string & content) {
610
+ return json {
611
+ {"content", content}
612
+ };
613
+ }
614
+
615
+ static json format_error_response(const std::string & message, const enum error_type type) {
616
+ std::string type_str;
617
+ int code = 500;
618
+ switch (type) {
619
+ case ERROR_TYPE_INVALID_REQUEST:
620
+ type_str = "invalid_request_error";
621
+ code = 400;
622
+ break;
623
+ case ERROR_TYPE_AUTHENTICATION:
624
+ type_str = "authentication_error";
625
+ code = 401;
626
+ break;
627
+ case ERROR_TYPE_NOT_FOUND:
628
+ type_str = "not_found_error";
629
+ code = 404;
630
+ break;
631
+ case ERROR_TYPE_SERVER:
632
+ type_str = "server_error";
633
+ code = 500;
634
+ break;
635
+ case ERROR_TYPE_PERMISSION:
636
+ type_str = "permission_error";
637
+ code = 403;
638
+ break;
639
+ case ERROR_TYPE_NOT_SUPPORTED:
640
+ type_str = "not_supported_error";
641
+ code = 501;
642
+ break;
643
+ case ERROR_TYPE_UNAVAILABLE:
644
+ type_str = "unavailable_error";
645
+ code = 503;
646
+ break;
647
+ }
648
+ return json {
649
+ {"code", code},
650
+ {"message", message},
651
+ {"type", type_str},
652
+ };
653
+ }
@@ -0,0 +1,5 @@
# Build the "simple" example: a minimal llama.cpp inference driver.
set(TARGET simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
# Link the shared example helpers, the core llama library and platform threads.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)