@fugood/llama.node 0.3.17 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/tools/server/utils.hpp
@@ -3,7 +3,9 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "arg.h" // common_remote_get_content
 #include "base64.hpp"
+#include "mtmd.h"
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -21,6 +23,7 @@
 #include <string>
 #include <vector>
 #include <memory>
+#include <cinttypes>
 
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
 
@@ -41,6 +44,8 @@ using json = nlohmann::ordered_json;
 #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 
+using raw_buffer = std::vector<uint8_t>;
+
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
     // Fallback null to default value
@@ -386,7 +391,7 @@ static inline bool is_base64(uint8_t c) {
     return (isalnum(c) || (c == '+') || (c == '/'));
 }
 
-static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string) {
+static inline raw_buffer base64_decode(const std::string & encoded_string) {
     int i = 0;
     int j = 0;
     int in_ = 0;
@@ -396,7 +401,7 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
     uint8_t char_array_4[4];
     uint8_t char_array_3[3];
 
-    std::vector<uint8_t> ret;
+    raw_buffer ret;
 
     while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
         char_array_4[i++] = encoded_string[in_]; in_++;
@@ -578,8 +583,11 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    bool prefill_assistant,
     common_reasoning_format reasoning_format,
-    const struct common_chat_templates * tmpls)
+    const struct common_chat_templates * tmpls,
+    bool allow_non_text,
+    std::vector<raw_buffer> & out_files)
 {
     json llama_params;
 
@@ -627,8 +635,89 @@
         }
     }
 
+    // get input files
+    if (!body.contains("messages")) {
+        throw std::runtime_error("'messages' is required");
+    }
+    json messages = body.at("messages");
+    if (!messages.is_array()) {
+        throw std::runtime_error("Expected 'messages' to be an array");
+    }
+    for (auto & msg : messages) {
+        std::string role = json_value(msg, "role", std::string());
+        if (role != "assistant" && !msg.contains("content")) {
+            throw std::runtime_error("All non-assistant messages must contain 'content'");
+        }
+        if (role == "assistant") {
+            if (!msg.contains("content") && !msg.contains("tool_calls")) {
+                throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
+            }
+            if (!msg.contains("content")) {
+                continue; // avoid errors with no content
+            }
+        }
+        json & content = msg.at("content");
+        if (content.is_string() || content.is_null()) {
+            continue;
+        }
+
+        if (!content.is_array()) {
+            throw std::runtime_error("Expected 'content' to be a string or an array");
+        }
+
+        for (auto & p : content) {
+            std::string type = json_value(p, "type", std::string());
+            json image_url = json_value(p, "image_url", json::object());
+            if (type == "image_url") {
+                if (!allow_non_text) {
+                    throw std::runtime_error("image input is not supported by this server");
+                }
+
+                std::string url = json_value(image_url, "url", std::string());
+                if (string_starts_with(url, "http")) {
+                    // download remote image
+                    // TODO @ngxson : maybe make these params configurable
+                    common_remote_params params;
+                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+                    params.max_size = 1024 * 1024 * 10; // 10MB
+                    params.timeout = 10; // seconds
+                    SRV_INF("downloading image from '%s'\n", url.c_str());
+                    auto res = common_remote_get_content(url, params);
+                    if (200 <= res.first && res.first < 300) {
+                        SRV_INF("downloaded %ld bytes\n", res.second.size());
+                        raw_buffer data;
+                        data.insert(data.end(), res.second.begin(), res.second.end());
+                        out_files.push_back(data);
+                    } else {
+                        throw std::runtime_error("Failed to download image");
+                    }
+
+                } else {
+                    // try to decode base64 image
+                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+                    if (parts.size() != 2) {
+                        throw std::runtime_error("Invalid image_url.url value");
+                    } else if (!string_starts_with(parts[0], "data:image/")) {
+                        throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
+                    } else if (!string_ends_with(parts[0], "base64")) {
+                        throw std::runtime_error("image_url.url must be base64 encoded");
+                    } else {
+                        auto base64_data = parts[1];
+                        auto decoded_data = base64_decode(base64_data);
+                        out_files.push_back(decoded_data);
+                    }
+                }
+
+                // replace this chunk with a marker
+                p["type"] = "text";
+                p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
+                p.erase("image_url");
+            }
+        }
+    }
+
     common_chat_templates_inputs inputs;
-    inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages"));
+    inputs.messages = common_chat_msgs_parse_oaicompat(messages);
     inputs.tools = common_chat_tools_parse_oaicompat(tools);
     inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto")));
     inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
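
For reference, the base64 branch above follows the data URL convention: everything before the first comma is the media-type header (for example data:image/png;base64) and everything after it is the payload. The snippet below is a minimal standalone sketch of that split-and-validate step using only the standard library; the helper name split_image_data_url is hypothetical and stands in for the server's string_split plus prefix/suffix checks.

#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>

// Hypothetical helper mirroring the checks in the diff above: split a data URL
// such as "data:image/png;base64,<payload>" at the comma, then validate the header.
static std::pair<std::string, std::string> split_image_data_url(const std::string & url) {
    const size_t comma = url.find(',');
    if (comma == std::string::npos) {
        throw std::runtime_error("Invalid image_url.url value");
    }
    const std::string header  = url.substr(0, comma);   // e.g. "data:image/png;base64"
    const std::string payload = url.substr(comma + 1);  // base64-encoded bytes
    if (header.rfind("data:image/", 0) != 0) {
        throw std::runtime_error("Invalid image_url.url format: " + header);
    }
    if (header.size() < 6 || header.compare(header.size() - 6, 6, "base64") != 0) {
        throw std::runtime_error("image_url.url must be base64 encoded");
    }
    return {header, payload};
}

int main() {
    const auto parsed = split_image_data_url("data:image/png;base64,iVBORw0KGgo=");
    std::cout << parsed.first << " -> " << parsed.second.size() << " base64 chars\n";
    return 0;
}
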
@@ -644,7 +733,7 @@ static json oaicompat_completion_params_parse(
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
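
To illustrate the new prefill_assistant flag: when the final entry of messages is an assistant turn, the server can leave that turn open so generation continues the supplied text instead of starting a new turn (the comment above mentions steering the reasoning process of reasoning models). A minimal sketch of such a request body, assuming the same nlohmann::json library the server uses; the content strings are made up for illustration.

#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // The last message is an assistant turn with partial content. With
    // assistant prefill enabled, the chat template is rendered without an
    // end-of-turn token after it, so generation continues this text.
    json body;
    body["messages"] = json::array({
        { {"role", "user"},      {"content", "Summarize the findings."} },
        { {"role", "assistant"}, {"content", "<think>Let me re-check the numbers"} }
    });
    std::cout << body.dump(2) << std::endl;
    return 0;
}
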
@@ -935,3 +1024,286 @@ static std::vector<common_adapter_lora_info> parse_lora_request(
 
     return lora;
 }
+
+//
+// utils for interacting with libmtmd
+// (may need to refactor in near future)
+//
+
+/**
+ * server_tokens is a helper to manage the input tokens and image for the server.
+ * it is made this way to simplify the logic of KV cache management.
+ */
+struct server_tokens {
+    bool has_mtmd = false;
+
+private: // disallow accessing these members directly, risking out-of-sync
+
+    // map a **start** position in tokens to the image chunk
+    std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
+
+    // list of tokens
+    // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
+    // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
+    // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
+    llama_tokens tokens;
+
+    // for ex. with input of 5 text tokens and 2 images:
+    // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+    // pos 0 1 2 3 4 5 6 7 8 9
+    // map_pos_to_image will contain: {5, img0}, {8, img1}
+
+public:
+    server_tokens() = default;
+    ~server_tokens() = default;
+
+    // Prevent copying
+    server_tokens(const server_tokens&) = delete;
+    server_tokens& operator=(const server_tokens&) = delete;
+
+    // Allow moving (usually implicitly generated if members are movable)
+    server_tokens(server_tokens&&) = default;
+    server_tokens& operator=(server_tokens&&) = default;
+
+    // Allow accessing elements using [] operator
+    llama_token operator[](size_t index) { return tokens[index]; }
+    const llama_token& operator[](size_t index) const { return tokens[index]; }
+
+    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
+        for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
+            push_back(mtmd_chunks[i]);
+        }
+    }
+
+    server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
+
+    // for debugging
+    std::string str() const {
+        std::ostringstream oss;
+        oss << "tokens: ";
+        for (const auto & t : tokens) {
+            if (t == LLAMA_TOKEN_NULL) {
+                oss << "<embd> ";
+            } else {
+                oss << t << " ";
+            }
+        }
+        oss << "\n";
+        oss << "image pos: ";
+        for (const auto & it : map_pos_to_image) {
+            oss << it.first << ", ";
+        }
+        return oss.str();
+    }
+
+    const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
+        auto it = map_pos_to_image.find(pos);
+        if (it != map_pos_to_image.end()) {
+            return it->second;
+        } else {
+            throw std::runtime_error("Chunk not found");
+        }
+    }
+
+    void push_back(llama_token tok) {
+        if (tok == LLAMA_TOKEN_NULL) {
+            throw std::runtime_error("Invalid token");
+        }
+        tokens.emplace_back(tok);
+    }
+
+    // will create a copy of the chunk if it contains non-text data
+    void push_back(const mtmd_input_chunk * chunk) {
+        auto type = mtmd_input_chunk_get_type(chunk);
+        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            GGML_ASSERT(has_mtmd);
+            auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+            llama_pos start_pos = tokens.size();
+            for (int i = 0; i < n_pos; ++i) {
+                tokens.emplace_back(LLAMA_TOKEN_NULL);
+            }
+            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
+            map_pos_to_image[start_pos] = std::move(new_chunk);
+        } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            size_t n_tokens;
+            auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+            for (size_t i = 0; i < n_tokens; ++i) {
+                push_back(text_tokens[i]);
+            }
+        } else {
+            GGML_ABORT("Invalid chunk type");
+        }
+    }
+
+    // for compatibility with context shift and prompt truncation
+    void insert(const llama_tokens & inp_tokens) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
+    }
+
+    // for compatibility with speculative decoding, ctx shift, slot save/load
+    const llama_tokens & get_text_tokens() const {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        return tokens;
+    }
+
+    // for compatibility with speculative decoding
+    void set_token(llama_pos pos, llama_token id) {
+        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
+        tokens[pos] = id;
+    }
+
+    size_t size() const {
+        return tokens.size();
+    }
+
+    bool empty() const {
+        return tokens.empty();
+    }
+
+    void clear() {
+        tokens.clear();
+    }
+
+    void keep_first(size_t n) {
+        GGML_ASSERT(n <= tokens.size());
+        if (has_mtmd) {
+            // we throw an error if we try to remove a token in the middle of an image
+            // for ex. with input of 5 text tokens and 2 images:
+            // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
+            // n 1 2 3 4 5 6 7 8 9 10
+            // allowed to resize ^ ^
+            // disallowed to resize ^ ^ ^
+            if (n > 0) {
+                llama_token last_token = tokens[n - 1];
+                // make sure we never remove tokens in the middle of an image
+                if (last_token == LLAMA_TOKEN_NULL) {
+                    find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
+                }
+            }
+            // remove all image chunks that are not used anymore
+            for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
+                llama_pos pos = it->first;
+                if (pos >= (llama_pos)n) {
+                    it = map_pos_to_image.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+        }
+        tokens.resize(n);
+    }
+
+    std::string detokenize(const llama_context * ctx, bool special) const {
+        llama_tokens text_tokens;
+        text_tokens.reserve(tokens.size());
+        for (const auto & t : tokens) {
+            if (t != LLAMA_TOKEN_NULL) {
+                text_tokens.push_back(t);
+            }
+        }
+        return common_detokenize(ctx, text_tokens, special);
+    }
+
+    size_t get_common_prefix(const server_tokens & b) const {
+        size_t max_idx = std::min(tokens.size(), b.tokens.size());
+        for (size_t i = 0; i < max_idx; ++i) {
+            auto & ai = tokens[i];
+            auto & bi = b.tokens[i];
+
+            if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
+                GGML_ASSERT(has_mtmd);
+                const auto & a_chunk = find_chunk(i);
+                const auto & b_chunk = b.find_chunk(i);
+                GGML_ASSERT(a_chunk && b_chunk);
+                const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
+                const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
+                std::string ai_id = mtmd_image_tokens_get_id(a_img);
+                std::string bi_id = mtmd_image_tokens_get_id(b_img);
+                size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
+                size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
+                if (ai_id == bi_id && a_pos == b_pos) {
+                    GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
+                    i += a_pos - 1; // will be +1 by the for loop
+                    continue;
+                } else {
+                    return i;
+                }
+            } else if (ai == bi) {
+                continue;
+            } else {
+                return i;
+            }
+        }
+        return max_idx; // all tokens are equal
+    }
+
+    // make sure all text tokens are within the vocab range
+    bool validate(const struct llama_context * ctx) const {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+
+        for (size_t i = 0; i < tokens.size(); ++i) {
+            auto & t = tokens[i];
+            if (t == LLAMA_TOKEN_NULL) {
+                try {
+                    const auto & chunk = find_chunk(i);
+                    const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
+                    size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
+                    i += n_pos - 1; // will be +1 by the for loop
+                } catch (const std::exception & e) {
+                    return false;
+                }
+            } else if (t < 0 || t >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // encode and decode the image chunk
+    int32_t process_chunk(
+            llama_context * ctx,
+            mtmd_context * mctx,
+            llama_pos n_past,
+            int32_t seq_id,
+            llama_pos & n_pos_out) {
+        auto it = map_pos_to_image.find(n_past);
+        if (it == map_pos_to_image.end()) {
+            throw std::runtime_error("Chunk not found");
+        }
+        SRV_INF("%s\n", "processing image...");
+        int32_t n_batch = llama_n_batch(ctx);
+        int64_t t0 = ggml_time_ms();
+        llama_pos new_n_past = n_past;
+        int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
+            it->second.get(), // chunk
+            n_past,
+            seq_id,
+            n_batch,
+            true, // logits last
+            &new_n_past);
+        SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
+        if (result != 0) {
+            LOG_ERR("mtmd_helper_eval failed with status %d", result);
+            n_pos_out = n_past;
+            return result;
+        }
+        n_pos_out = new_n_past;
+        return 0;
+    }
+};
+
+// Computes FNV-1a hash of the data
+static std::string fnv_hash(const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    uint64_t hash = 0xcbf29ce484222325ULL;
+
+    for (size_t i = 0; i < len; ++i) {
+        hash ^= data[i];
+        hash *= fnv_prime;
+    }
+    return std::to_string(hash);
+}
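
The comments in server_tokens above describe the position layout with a small ASCII sketch (five text tokens followed by two image runs). The standalone toy model below makes that layout concrete; it is not the server's implementation: the toy_tokens type is hypothetical, it uses plain ints in place of mtmd chunk objects, and its truncation rule is a simplification, but it shows how a start-position map plus sentinel tokens keeps text and image positions in sync.

#include <cstdio>
#include <iterator>
#include <map>
#include <vector>

// Toy model of the layout described above: text tokens are stored directly,
// an image occupies a run of sentinel (-1) positions, and a side map records
// where each image run starts and how many positions it covers.
struct toy_tokens {
    std::vector<int>   tokens;              // -1 marks a position owned by an image
    std::map<int, int> img_start_to_n_pos;  // start position -> number of positions

    void push_text(int tok) { tokens.push_back(tok); }

    void push_image(int n_pos) {
        img_start_to_n_pos[(int) tokens.size()] = n_pos;
        tokens.insert(tokens.end(), n_pos, -1);
    }

    // keep only the first n positions; this toy refuses any cut that lands
    // strictly inside an image run (a simplification of keep_first above)
    bool keep_first(size_t n) {
        for (const auto & kv : img_start_to_n_pos) {
            if ((int) n > kv.first && (int) n < kv.first + kv.second) {
                return false; // the cut would split this image
            }
        }
        for (auto it = img_start_to_n_pos.begin(); it != img_start_to_n_pos.end(); ) {
            it = (it->first >= (int) n) ? img_start_to_n_pos.erase(it) : std::next(it);
        }
        tokens.resize(n);
        return true;
    }
};

int main() {
    toy_tokens t;
    for (int i = 0; i < 5; ++i) t.push_text(100 + i);  // positions 0..4: text
    t.push_image(3);                                    // positions 5..7: img0
    t.push_image(2);                                    // positions 8..9: img1
    std::printf("keep_first(6): %s\n", t.keep_first(6) ? "ok" : "rejected"); // inside img0
    std::printf("keep_first(5): %s\n", t.keep_first(5) ? "ok" : "rejected"); // clean boundary
    return 0;
}
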
package/src/llama.cpp/cmake/arm64-windows-msvc.cmake
@@ -1,6 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-set( CMAKE_C_COMPILER_TARGET ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
package/src/llama.cpp/examples/infill/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET llama-infill)
-add_executable(${TARGET} infill.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)