@fugood/llama.node 0.3.17 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/{examples → tools}/server/utils.hpp:

@@ -3,7 +3,9 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "arg.h" // common_remote_get_content
 #include "base64.hpp"
+#include "mtmd.h"

 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -21,6 +23,7 @@
 #include <string>
 #include <vector>
 #include <memory>
+#include <cinttypes>

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"

@@ -41,6 +44,8 @@ using json = nlohmann::ordered_json;
 #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)

+using raw_buffer = std::vector<uint8_t>;
+
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -386,7 +391,7 @@ static inline bool is_base64(uint8_t c) {
     return (isalnum(c) || (c == '+') || (c == '/'));
 }

-static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
+static inline raw_buffer base64_decode(const std::string & encoded_string) {
     int i = 0;
     int j = 0;
     int in_ = 0;
@@ -396,7 +401,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
     uint8_t char_array_4[4];
     uint8_t char_array_3[3];

-    std::vector<uint8_t> ret;
+    raw_buffer ret;

     while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
         char_array_4[i++] = encoded_string[in_]; in_++;
@@ -578,8 +583,11 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    bool prefill_assistant,
     common_reasoning_format reasoning_format,
-    const struct common_chat_templates * tmpls)
+    const struct common_chat_templates * tmpls,
+    bool allow_non_text,
+    std::vector<raw_buffer> & out_files)
 {
     json llama_params;

@@ -627,8 +635,89 @@ static json oaicompat_completion_params_parse(
         }
     }

+    // get input files
+    if (!body.contains("messages")) {
+        throw std::runtime_error("'messages' is required");
+    }
+    json messages = body.at("messages");
+    if (!messages.is_array()) {
+        throw std::runtime_error("Expected 'messages' to be an array");
+    }
+    for (auto & msg : messages) {
+        std::string role = json_value(msg, "role", std::string());
+        if (role != "assistant" && !msg.contains("content")) {
+            throw std::runtime_error("All non-assistant messages must contain 'content'");
+        }
+        if (role == "assistant") {
+            if (!msg.contains("content") && !msg.contains("tool_calls")) {
+                throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
+            }
+            if (!msg.contains("content")) {
+                continue; // avoid errors with no content
+            }
+        }
+        json & content = msg.at("content");
+        if (content.is_string() || content.is_null()) {
+            continue;
+        }
+
+        if (!content.is_array()) {
+            throw std::runtime_error("Expected 'content' to be a string or an array");
+        }
+
+        for (auto & p : content) {
+            std::string type = json_value(p, "type", std::string());
+            json image_url = json_value(p, "image_url", json::object());
+            if (type == "image_url") {
+                if (!allow_non_text) {
+                    throw std::runtime_error("image input is not supported by this server");
+                }
+
+                std::string url = json_value(image_url, "url", std::string());
+                if (string_starts_with(url, "http")) {
+                    // download remote image
+                    // TODO @ngxson : maybe make these params configurable
+                    common_remote_params params;
+                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+                    params.max_size = 1024 * 1024 * 10; // 10MB
+                    params.timeout = 10; // seconds
+                    SRV_INF("downloading image from '%s'\n", url.c_str());
+                    auto res = common_remote_get_content(url, params);
+                    if (200 <= res.first && res.first < 300) {
+                        SRV_INF("downloaded %ld bytes\n", res.second.size());
+                        raw_buffer data;
+                        data.insert(data.end(), res.second.begin(), res.second.end());
+                        out_files.push_back(data);
+                    } else {
+                        throw std::runtime_error("Failed to download image");
+                    }
+
+                } else {
+                    // try to decode base64 image
+                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+                    if (parts.size() != 2) {
+                        throw std::runtime_error("Invalid image_url.url value");
+                    } else if (!string_starts_with(parts[0], "data:image/")) {
+                        throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
+                    } else if (!string_ends_with(parts[0], "base64")) {
+                        throw std::runtime_error("image_url.url must be base64 encoded");
+                    } else {
+                        auto base64_data = parts[1];
+                        auto decoded_data = base64_decode(base64_data);
+                        out_files.push_back(decoded_data);
+                    }
+                }
+
+                // replace this chunk with a marker
+                p["type"] = "text";
+                p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
+                p.erase("image_url");
+            }
+        }
+    }
+
     common_chat_templates_inputs inputs;
-    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.messages = common_chat_msgs_parse_oaicompat(messages);
     inputs.tools = common_chat_tools_parse_oaicompat(tools);
     inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
     inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
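
Aside: the hunk above is where multimodal input enters the OpenAI-compatible parsing. Each `image_url` content part is either fetched over HTTP or base64-decoded from a `data:image/...;base64,...` URI, appended to `out_files`, and replaced in-place with the `MTMD_DEFAULT_IMAGE_MARKER` text marker. Below is a minimal sketch of a request body of the shape this code accepts; it is illustrative only (the URLs and the truncated base64 payload are placeholders, not values from the diff), built with nlohmann/json as the server itself uses:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    json text_part;
    text_part["type"] = "text";
    text_part["text"] = "What is in this picture?";

    // base64 branch: the url must look like "data:image/<fmt>;base64,<payload>"
    json b64_part;
    b64_part["type"] = "image_url";
    b64_part["image_url"]["url"] = "data:image/png;base64,iVBORw0KGgo..."; // truncated placeholder

    // remote branch: anything starting with "http" is downloaded by the server
    json http_part;
    http_part["type"] = "image_url";
    http_part["image_url"]["url"] = "https://example.com/cat.png"; // placeholder URL

    json msg;
    msg["role"] = "user";
    msg["content"] = json::array({ text_part, b64_part, http_part });

    json body;
    body["messages"] = json::array({ msg });

    std::cout << body.dump(2) << std::endl;
}
```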
@@ -644,7 +733,7 @@ static json oaicompat_completion_params_parse(

     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
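
Aside: the new `prefill_assistant` flag gates the existing assistant-prefill behavior. When it is enabled and the final message has role `assistant`, the template is rendered without an end-of-turn token so generation continues that partial message. A minimal sketch of such a message list (placeholder strings, not taken from the diff):

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    // Ending the list with an assistant message makes the server treat it as a
    // partial reply to be continued, rather than a finished turn.
    json messages = json::array({
        json{{"role", "user"},      {"content", "Solve 12 * 17 step by step."}},
        json{{"role", "assistant"}, {"content", "Let me reason this through: 12 * 17 = 12 * 10 + 12 * 7 ="}}
    });
    std::cout << messages.dump(2) << std::endl;
}
```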
@@ -935,3 +1024,286 @@ static std::vector<common_adapter_lora_info> parse_lora_request(

     return lora;
 }
+
+//
+// utils for interacting with libmtmd
+// (may need to refactor in near future)
+//
+
+/**
+ * server_tokens is a helper to manage the input tokens and image for the server.
+ * it is made this way to simplify the logic of KV cache management.
+ */
+struct server_tokens {
+    bool has_mtmd = false;
+
+private: // disallow accessing these members directly, risking out-of-sync
+
+    // map a **start** position in tokens to the image chunk
+    std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
+
+    // list of tokens
+    // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
+    // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
+    // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
+    llama_tokens tokens;
+
+    // for ex. with input of 5 text tokens and 2 images:
+    //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+    // pos  0   1   2   3   4   5      6      7      8      9
+    // map_pos_to_image will contain: {5, img0}, {8, img1}
+
+public:
+    server_tokens() = default;
+    ~server_tokens() = default;
+
+    // Prevent copying
+    server_tokens(const server_tokens&) = delete;
+    server_tokens& operator=(const server_tokens&) = delete;
+
+    // Allow moving (usually implicitly generated if members are movable)
+    server_tokens(server_tokens&&) = default;
+    server_tokens& operator=(server_tokens&&) = default;
+
+    // Allow accessing elements using [] operator
+    llama_token operator[](size_t index) { return tokens[index]; }
+    const llama_token& operator[](size_t index) const { return tokens[index]; }
+
+    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
+        for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
+            push_back(mtmd_chunks[i]);
+        }
+    }
+
+    server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
+
+    // for debugging
+    std::string str() const {
+        std::ostringstream oss;
+        oss << "tokens: ";
+        for (const auto & t : tokens) {
+            if (t == LLAMA_TOKEN_NULL) {
+                oss << "<embd> ";
+            } else {
+                oss << t << " ";
+            }
+        }
+        oss << "\n";
+        oss << "image pos: ";
+        for (const auto & it : map_pos_to_image) {
+            oss << it.first << ", ";
+        }
+        return oss.str();
+    }
+
+    const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
+        auto it = map_pos_to_image.find(pos);
+        if (it != map_pos_to_image.end()) {
+            return it->second;
+        } else {
+            throw std::runtime_error("Chunk not found");
+        }
+    }
+
+    void push_back(llama_token tok) {
+        if (tok == LLAMA_TOKEN_NULL) {
+            throw std::runtime_error("Invalid token");
+        }
+        tokens.emplace_back(tok);
+    }
+
+    // will create a copy of the chunk if it contains non-text data
+    void push_back(const mtmd_input_chunk * chunk) {
+        auto type = mtmd_input_chunk_get_type(chunk);
+        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            GGML_ASSERT(has_mtmd);
+            auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+            llama_pos start_pos = tokens.size();
+            for (int i = 0; i < n_pos; ++i) {
+                tokens.emplace_back(LLAMA_TOKEN_NULL);
+            }
+            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
+            map_pos_to_image[start_pos] = std::move(new_chunk);
+        } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t n_tokens;
+            auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+            for (size_t i = 0; i < n_tokens; ++i) {
+                push_back(text_tokens[i]);
+            }
+        } else {
+            GGML_ABORT("Invalid chunk type");
+        }
+    }
+
+    // for compatibility with context shift and prompt truncation
+    void insert(const llama_tokens & inp_tokens) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
+    }
+
+    // for compatibility with speculative decoding, ctx shift, slot save/load
+    const llama_tokens & get_text_tokens() const {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        return tokens;
+    }
+
+    // for compatibility with speculative decoding
+    void set_token(llama_pos pos, llama_token id) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens[pos] = id;
+    }
+
+    size_t size() const {
+        return tokens.size();
+    }
+
+    bool empty() const {
+        return tokens.empty();
+    }
+
+    void clear() {
+        tokens.clear();
+    }
+
+    void keep_first(size_t n) {
+        GGML_ASSERT(n <= tokens.size());
+        if (has_mtmd) {
+            // we throw an error if we try to remove a token in the middle of an image
+            // for ex. with input of 5 text tokens and 2 images:
+            //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+            // n  1   2   3   4   5   6      7      8      9      10
+            // allowed to resize           ^                ^
+            // disallowed to resize               ^      ^      ^
+            if (n > 0) {
+                llama_token last_token = tokens[n - 1];
+                // make sure we never remove tokens in the middle of an image
+                if (last_token == LLAMA_TOKEN_NULL) {
+                    find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
+                }
+            }
+            // remove all image chunks that are not used anymore
+            for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
+                llama_pos pos = it->first;
+                if (pos >= (llama_pos)n) {
+                    it = map_pos_to_image.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+        }
+        tokens.resize(n);
+    }
+
+    std::string detokenize(const llama_context * ctx, bool special) const {
+        llama_tokens text_tokens;
+        text_tokens.reserve(tokens.size());
+        for (const auto & t : tokens) {
+            if (t != LLAMA_TOKEN_NULL) {
+                text_tokens.push_back(t);
+            }
+        }
+        return common_detokenize(ctx, text_tokens, special);
+    }
+
+    size_t get_common_prefix(const server_tokens & b) const {
+        size_t max_idx = std::min(tokens.size(), b.tokens.size());
+        for (size_t i = 0; i < max_idx; ++i) {
+            auto & ai = tokens[i];
+            auto & bi = b.tokens[i];
+
+            if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
+                GGML_ASSERT(has_mtmd);
+                const auto & a_chunk = find_chunk(i);
+                const auto & b_chunk = b.find_chunk(i);
+                GGML_ASSERT(a_chunk && b_chunk);
+                const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
+                const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
+                std::string ai_id = mtmd_image_tokens_get_id(a_img);
+                std::string bi_id = mtmd_image_tokens_get_id(b_img);
+                size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
+                size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
+                if (ai_id == bi_id && a_pos == b_pos) {
+                    GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
+                    i += a_pos - 1; // will be +1 by the for loop
+                    continue;
+                } else {
+                    return i;
+                }
+            } else if (ai == bi) {
+                continue;
+            } else {
+                return i;
+            }
+        }
+        return max_idx; // all tokens are equal
+    }
+
+    // make sure all text tokens are within the vocab range
+    bool validate(const struct llama_context * ctx) const {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+
+        for (size_t i = 0; i < tokens.size(); ++i) {
+            auto & t = tokens[i];
+            if (t == LLAMA_TOKEN_NULL) {
+                try {
+                    const auto & chunk = find_chunk(i);
+                    const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
+                    size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+                    i += n_pos - 1; // will be +1 by the for loop
+                } catch (const std::exception & e) {
+                    return false;
+                }
+            } else if (t < 0 || t >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // encode and decode the image chunk
+    int32_t process_chunk(
+                llama_context * ctx,
+                mtmd_context * mctx,
+                llama_pos n_past,
+                int32_t seq_id,
+                llama_pos & n_pos_out) {
+        auto it = map_pos_to_image.find(n_past);
+        if (it == map_pos_to_image.end()) {
+            throw std::runtime_error("Chunk not found");
+        }
+        SRV_INF("%s\n", "processing image...");
+        int32_t n_batch = llama_n_batch(ctx);
+        int64_t t0 = ggml_time_ms();
+        llama_pos new_n_past = n_past;
+        int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
+            it->second.get(), // chunk
+            n_past,
+            seq_id,
+            n_batch,
+            true, // logits last
+            &new_n_past);
+        SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
+        if (result != 0) {
+            LOG_ERR("mtmd_helper_eval failed with status %d", result);
+            n_pos_out = n_past;
+            return result;
+        }
+        n_pos_out = new_n_past;
+        return 0;
+    }
+};
+
+// Computes FNV-1a hash of the data
+static std::string fnv_hash(const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    uint64_t hash = 0xcbf29ce484222325ULL;
+
+    for (size_t i = 0; i < len; ++i) {
+        hash ^= data[i];
+        hash *= fnv_prime;
+    }
+    return std::to_string(hash);
+}