@fugood/llama.node 0.3.17 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +39 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +366 -19
- package/src/LlamaCompletionWorker.h +30 -10
- package/src/LlamaContext.cpp +213 -5
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
- package/src/llama.cpp/.github/workflows/build.yml +41 -762
- package/src/llama.cpp/.github/workflows/docker.yml +5 -2
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +12 -12
- package/src/llama.cpp/CMakeLists.txt +5 -17
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +31 -3
- package/src/llama.cpp/common/arg.cpp +48 -29
- package/src/llama.cpp/common/chat.cpp +128 -106
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +37 -1
- package/src/llama.cpp/common/common.h +18 -9
- package/src/llama.cpp/common/llguidance.cpp +1 -0
- package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
- package/src/llama.cpp/common/minja/minja.hpp +69 -36
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +57 -50
- package/src/llama.cpp/examples/CMakeLists.txt +2 -23
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml.h +10 -7
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
- package/src/llama.cpp/ggml/src/ggml.c +29 -20
- package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
- package/src/llama.cpp/include/llama.h +52 -11
- package/src/llama.cpp/requirements/requirements-all.txt +3 -3
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +3 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +17 -7
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +389 -501
- package/src/llama.cpp/src/llama-context.h +44 -32
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +20 -38
- package/src/llama.cpp/src/llama-graph.h +12 -8
- package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
- package/src/llama.cpp/src/llama-kv-cache.h +271 -85
- package/src/llama.cpp/src/llama-memory.h +11 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +316 -69
- package/src/llama.cpp/src/llama-model.h +8 -1
- package/src/llama.cpp/src/llama-quant.cpp +15 -13
- package/src/llama.cpp/src/llama-sampling.cpp +18 -6
- package/src/llama.cpp/src/llama-vocab.cpp +42 -4
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +14 -0
- package/src/llama.cpp/tests/CMakeLists.txt +10 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
- package/src/llama.cpp/tests/test-chat.cpp +3 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
- package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
- package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
- package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
- package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
- package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.h +0 -135
- package/src/llama.cpp/examples/llava/llava.cpp +0 -586
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/mtmd.h +0 -168
- package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/{examples → tools}/server/utils.hpp:

@@ -3,7 +3,9 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "arg.h" // common_remote_get_content
 #include "base64.hpp"
+#include "mtmd.h"

 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -21,6 +23,7 @@
 #include <string>
 #include <vector>
 #include <memory>
+#include <cinttypes>

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"

@@ -41,6 +44,8 @@ using json = nlohmann::ordered_json;
 #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)

+using raw_buffer = std::vector<uint8_t>;
+
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -386,7 +391,7 @@ static inline bool is_base64(uint8_t c) {
     return (isalnum(c) || (c == '+') || (c == '/'));
 }

-static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
+static inline raw_buffer base64_decode(const std::string & encoded_string) {
     int i = 0;
     int j = 0;
     int in_ = 0;
@@ -396,7 +401,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
     uint8_t char_array_4[4];
     uint8_t char_array_3[3];

-    std::vector<uint8_t> ret;
+    raw_buffer ret;

     while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
         char_array_4[i++] = encoded_string[in_]; in_++;
@@ -578,8 +583,11 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    bool prefill_assistant,
     common_reasoning_format reasoning_format,
-    const struct common_chat_templates * tmpls)
+    const struct common_chat_templates * tmpls,
+    bool allow_non_text,
+    std::vector<raw_buffer> & out_files)
 {
     json llama_params;

@@ -627,8 +635,89 @@ static json oaicompat_completion_params_parse(
         }
     }

+    // get input files
+    if (!body.contains("messages")) {
+        throw std::runtime_error("'messages' is required");
+    }
+    json messages = body.at("messages");
+    if (!messages.is_array()) {
+        throw std::runtime_error("Expected 'messages' to be an array");
+    }
+    for (auto & msg : messages) {
+        std::string role = json_value(msg, "role", std::string());
+        if (role != "assistant" && !msg.contains("content")) {
+            throw std::runtime_error("All non-assistant messages must contain 'content'");
+        }
+        if (role == "assistant") {
+            if (!msg.contains("content") && !msg.contains("tool_calls")) {
+                throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
+            }
+            if (!msg.contains("content")) {
+                continue; // avoid errors with no content
+            }
+        }
+        json & content = msg.at("content");
+        if (content.is_string() || content.is_null()) {
+            continue;
+        }
+
+        if (!content.is_array()) {
+            throw std::runtime_error("Expected 'content' to be a string or an array");
+        }
+
+        for (auto & p : content) {
+            std::string type = json_value(p, "type", std::string());
+            json image_url = json_value(p, "image_url", json::object());
+            if (type == "image_url") {
+                if (!allow_non_text) {
+                    throw std::runtime_error("image input is not supported by this server");
+                }
+
+                std::string url = json_value(image_url, "url", std::string());
+                if (string_starts_with(url, "http")) {
+                    // download remote image
+                    // TODO @ngxson : maybe make these params configurable
+                    common_remote_params params;
+                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+                    params.max_size = 1024 * 1024 * 10; // 10MB
+                    params.timeout = 10; // seconds
+                    SRV_INF("downloading image from '%s'\n", url.c_str());
+                    auto res = common_remote_get_content(url, params);
+                    if (200 <= res.first && res.first < 300) {
+                        SRV_INF("downloaded %ld bytes\n", res.second.size());
+                        raw_buffer data;
+                        data.insert(data.end(), res.second.begin(), res.second.end());
+                        out_files.push_back(data);
+                    } else {
+                        throw std::runtime_error("Failed to download image");
+                    }
+
+                } else {
+                    // try to decode base64 image
+                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+                    if (parts.size() != 2) {
+                        throw std::runtime_error("Invalid image_url.url value");
+                    } else if (!string_starts_with(parts[0], "data:image/")) {
+                        throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
+                    } else if (!string_ends_with(parts[0], "base64")) {
+                        throw std::runtime_error("image_url.url must be base64 encoded");
+                    } else {
+                        auto base64_data = parts[1];
+                        auto decoded_data = base64_decode(base64_data);
+                        out_files.push_back(decoded_data);
+                    }
+                }
+
+                // replace this chunk with a marker
+                p["type"] = "text";
+                p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
+                p.erase("image_url");
+            }
+        }
+    }
+
     common_chat_templates_inputs inputs;
-    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.messages = common_chat_msgs_parse_oaicompat(messages);
     inputs.tools = common_chat_tools_parse_oaicompat(tools);
     inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
     inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
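
Aside: the hunk above is where multimodal input enters the OpenAI-compatible parsing. Each `image_url` content part is either fetched over HTTP or base64-decoded from a `data:image/...;base64,...` URI, appended to `out_files`, and replaced in-place with the `MTMD_DEFAULT_IMAGE_MARKER` text marker. Below is a minimal sketch of a request body of the shape this code accepts; it is illustrative only (the URLs and the truncated base64 payload are placeholders, not values from the diff), built with nlohmann/json as the server itself uses:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    json text_part;
    text_part["type"] = "text";
    text_part["text"] = "What is in this picture?";

    // base64 branch: the url must look like "data:image/<fmt>;base64,<payload>"
    json b64_part;
    b64_part["type"] = "image_url";
    b64_part["image_url"]["url"] = "data:image/png;base64,iVBORw0KGgo..."; // truncated placeholder

    // remote branch: anything starting with "http" is downloaded by the server
    json http_part;
    http_part["type"] = "image_url";
    http_part["image_url"]["url"] = "https://example.com/cat.png"; // placeholder URL

    json msg;
    msg["role"] = "user";
    msg["content"] = json::array({ text_part, b64_part, http_part });

    json body;
    body["messages"] = json::array({ msg });

    std::cout << body.dump(2) << std::endl;
}
```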
@@ -644,7 +733,7 @@ static json oaicompat_completion_params_parse(

     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
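
Aside: the new `prefill_assistant` flag gates the existing assistant-prefill behavior. When it is enabled and the final message has role `assistant`, the template is rendered without an end-of-turn token so generation continues that partial message. A minimal sketch of such a message list (placeholder strings, not taken from the diff):

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    // Ending the list with an assistant message makes the server treat it as a
    // partial reply to be continued, rather than a finished turn.
    json messages = json::array({
        json{{"role", "user"},      {"content", "Solve 12 * 17 step by step."}},
        json{{"role", "assistant"}, {"content", "Let me reason this through: 12 * 17 = 12 * 10 + 12 * 7 ="}}
    });
    std::cout << messages.dump(2) << std::endl;
}
```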
@@ -935,3 +1024,286 @@ static std::vector<common_adapter_lora_info> parse_lora_request(

     return lora;
 }
+
+//
+// utils for interacting with libmtmd
+// (may need to refactor in near future)
+//
+
+/**
+ * server_tokens is a helper to manage the input tokens and image for the server.
+ * it is made this way to simplify the logic of KV cache management.
+ */
+struct server_tokens {
+    bool has_mtmd = false;
+
+private: // disallow accessing these members directly, risking out-of-sync
+
+    // map a **start** position in tokens to the image chunk
+    std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
+
+    // list of tokens
+    // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
+    // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
+    // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
+    llama_tokens tokens;
+
+    // for ex. with input of 5 text tokens and 2 images:
+    //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+    // pos  0   1   2   3   4   5      6      7      8      9
+    // map_pos_to_image will contain: {5, img0}, {8, img1}
+
+public:
+    server_tokens() = default;
+    ~server_tokens() = default;
+
+    // Prevent copying
+    server_tokens(const server_tokens&) = delete;
+    server_tokens& operator=(const server_tokens&) = delete;
+
+    // Allow moving (usually implicitly generated if members are movable)
+    server_tokens(server_tokens&&) = default;
+    server_tokens& operator=(server_tokens&&) = default;
+
+    // Allow accessing elements using [] operator
+    llama_token operator[](size_t index) { return tokens[index]; }
+    const llama_token& operator[](size_t index) const { return tokens[index]; }
+
+    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
+        for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
+            push_back(mtmd_chunks[i]);
+        }
+    }
+
+    server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
+
+    // for debugging
+    std::string str() const {
+        std::ostringstream oss;
+        oss << "tokens: ";
+        for (const auto & t : tokens) {
+            if (t == LLAMA_TOKEN_NULL) {
+                oss << "<embd> ";
+            } else {
+                oss << t << " ";
+            }
+        }
+        oss << "\n";
+        oss << "image pos: ";
+        for (const auto & it : map_pos_to_image) {
+            oss << it.first << ", ";
+        }
+        return oss.str();
+    }
+
+    const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
+        auto it = map_pos_to_image.find(pos);
+        if (it != map_pos_to_image.end()) {
+            return it->second;
+        } else {
+            throw std::runtime_error("Chunk not found");
+        }
+    }
+
+    void push_back(llama_token tok) {
+        if (tok == LLAMA_TOKEN_NULL) {
+            throw std::runtime_error("Invalid token");
+        }
+        tokens.emplace_back(tok);
+    }
+
+    // will create a copy of the chunk if it contains non-text data
+    void push_back(const mtmd_input_chunk * chunk) {
+        auto type = mtmd_input_chunk_get_type(chunk);
+        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            GGML_ASSERT(has_mtmd);
+            auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+            llama_pos start_pos = tokens.size();
+            for (int i = 0; i < n_pos; ++i) {
+                tokens.emplace_back(LLAMA_TOKEN_NULL);
+            }
+            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
+            map_pos_to_image[start_pos] = std::move(new_chunk);
+        } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t n_tokens;
+            auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+            for (size_t i = 0; i < n_tokens; ++i) {
+                push_back(text_tokens[i]);
+            }
+        } else {
+            GGML_ABORT("Invalid chunk type");
+        }
+    }
+
+    // for compatibility with context shift and prompt truncation
+    void insert(const llama_tokens & inp_tokens) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
+    }
+
+    // for compatibility with speculative decoding, ctx shift, slot save/load
+    const llama_tokens & get_text_tokens() const {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        return tokens;
+    }
+
+    // for compatibility with speculative decoding
+    void set_token(llama_pos pos, llama_token id) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens[pos] = id;
+    }
+
+    size_t size() const {
+        return tokens.size();
+    }
+
+    bool empty() const {
+        return tokens.empty();
+    }
+
+    void clear() {
+        tokens.clear();
+    }
+
+    void keep_first(size_t n) {
+        GGML_ASSERT(n <= tokens.size());
+        if (has_mtmd) {
+            // we throw an error if we try to remove a token in the middle of an image
+            // for ex. with input of 5 text tokens and 2 images:
+            //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+            // n  1   2   3   4   5   6      7      8      9      10
+            // allowed to resize           ^                ^
+            // disallowed to resize               ^      ^      ^
+            if (n > 0) {
+                llama_token last_token = tokens[n - 1];
+                // make sure we never remove tokens in the middle of an image
+                if (last_token == LLAMA_TOKEN_NULL) {
+                    find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
+                }
+            }
+            // remove all image chunks that are not used anymore
+            for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
+                llama_pos pos = it->first;
+                if (pos >= (llama_pos)n) {
+                    it = map_pos_to_image.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+        }
+        tokens.resize(n);
+    }
+
+    std::string detokenize(const llama_context * ctx, bool special) const {
+        llama_tokens text_tokens;
+        text_tokens.reserve(tokens.size());
+        for (const auto & t : tokens) {
+            if (t != LLAMA_TOKEN_NULL) {
+                text_tokens.push_back(t);
+            }
+        }
+        return common_detokenize(ctx, text_tokens, special);
+    }
+
+    size_t get_common_prefix(const server_tokens & b) const {
+        size_t max_idx = std::min(tokens.size(), b.tokens.size());
+        for (size_t i = 0; i < max_idx; ++i) {
+            auto & ai = tokens[i];
+            auto & bi = b.tokens[i];
+
+            if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
+                GGML_ASSERT(has_mtmd);
+                const auto & a_chunk = find_chunk(i);
+                const auto & b_chunk = b.find_chunk(i);
+                GGML_ASSERT(a_chunk && b_chunk);
+                const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
+                const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
+                std::string ai_id = mtmd_image_tokens_get_id(a_img);
+                std::string bi_id = mtmd_image_tokens_get_id(b_img);
+                size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
+                size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
+                if (ai_id == bi_id && a_pos == b_pos) {
+                    GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
+                    i += a_pos - 1; // will be +1 by the for loop
+                    continue;
+                } else {
+                    return i;
+                }
+            } else if (ai == bi) {
+                continue;
+            } else {
+                return i;
+            }
+        }
+        return max_idx; // all tokens are equal
+    }
+
+    // make sure all text tokens are within the vocab range
+    bool validate(const struct llama_context * ctx) const {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+
+        for (size_t i = 0; i < tokens.size(); ++i) {
+            auto & t = tokens[i];
+            if (t == LLAMA_TOKEN_NULL) {
+                try {
+                    const auto & chunk = find_chunk(i);
+                    const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
+                    size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+                    i += n_pos - 1; // will be +1 by the for loop
+                } catch (const std::exception & e) {
+                    return false;
+                }
+            } else if (t < 0 || t >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // encode and decode the image chunk
+    int32_t process_chunk(
+                llama_context * ctx,
+                mtmd_context * mctx,
+                llama_pos n_past,
+                int32_t seq_id,
+                llama_pos & n_pos_out) {
+        auto it = map_pos_to_image.find(n_past);
+        if (it == map_pos_to_image.end()) {
+            throw std::runtime_error("Chunk not found");
+        }
+        SRV_INF("%s\n", "processing image...");
+        int32_t n_batch = llama_n_batch(ctx);
+        int64_t t0 = ggml_time_ms();
+        llama_pos new_n_past = n_past;
+        int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
+            it->second.get(), // chunk
+            n_past,
+            seq_id,
+            n_batch,
+            true, // logits last
+            &new_n_past);
+        SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
+        if (result != 0) {
+            LOG_ERR("mtmd_helper_eval failed with status %d", result);
+            n_pos_out = n_past;
+            return result;
+        }
+        n_pos_out = new_n_past;
+        return 0;
+    }
+};
+
+// Computes FNV-1a hash of the data
+static std::string fnv_hash(const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    uint64_t hash = 0xcbf29ce484222325ULL;
+
+    for (size_t i = 0; i < len; ++i) {
+        hash ^= data[i];
+        hash *= fnv_prime;
+    }
+    return std::to_string(hash);
+}