@fugood/llama.node 0.3.17 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
@@ -7,6 +7,7 @@
  #include "log.h"
  #include "sampling.h"
  #include "speculative.h"
+ #include "mtmd.h"

  // Change JSON_ASSERT from assert() to GGML_ASSERT:
  #define JSON_ASSERT GGML_ASSERT
@@ -146,6 +147,7 @@ struct slot_params {
  {"top_k", sampling.top_k},
  {"top_p", sampling.top_p},
  {"min_p", sampling.min_p},
+ {"top_n_sigma", sampling.top_n_sigma},
  {"xtc_probability", sampling.xtc_probability},
  {"xtc_threshold", sampling.xtc_threshold},
  {"typical_p", sampling.typ_p},
@@ -196,8 +198,8 @@ struct server_task {
  int id_target = -1;

  // used by SERVER_TASK_TYPE_INFERENCE
- slot_params params;
- llama_tokens prompt_tokens;
+ slot_params params;
+ server_tokens prompt_tokens;
  int id_selected_slot = -1;

  // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
@@ -248,6 +250,7 @@ struct server_task {
  params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
  params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
  params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
+ params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma);
  params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
  params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
  params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
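
For context on the top_n_sigma addition above, here is a minimal client-side sketch of a request body that sets the newly parsed field. It is an illustration under the assumption that nlohmann::json is used on the client; only the "top_n_sigma" key itself comes from this diff, the other values and the endpoint are placeholders.

// Illustrative client-side sketch (not part of this package): builds a JSON body
// containing the new "top_n_sigma" sampling parameter that the bundled server now
// reads via json_value() in params_from_json_cmpl(). Assumes nlohmann::json.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::json body = {
        {"prompt",      "Hello"},
        {"n_predict",   32},
        {"top_k",       40},
        {"top_p",       0.95},
        {"min_p",       0.05},
        {"top_n_sigma", 1.5}    // new in the llama.cpp server bundled with 0.4.1
    };
    // POST this body to the completion endpoint, e.g. /completion
    std::cout << body.dump(2) << std::endl;
    return 0;
}
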
@@ -1246,6 +1249,9 @@ struct server_slot {
  llama_context * ctx = nullptr;
  llama_context * ctx_dft = nullptr;

+ // multimodal
+ mtmd_context * mctx = nullptr;
+
  common_speculative * spec = nullptr;

  std::vector<common_adapter_lora_info> lora;
@@ -1273,14 +1279,14 @@ struct server_slot {
  int32_t n_prompt_tokens_processed = 0;

  // input prompt tokens
- llama_tokens prompt_tokens;
+ server_tokens prompt_tokens;

  size_t last_nl_pos = 0;

  std::string generated_text;
  llama_tokens generated_tokens;

- llama_tokens cache_tokens;
+ server_tokens cache_tokens;

  std::vector<completion_token_output> generated_token_probs;

@@ -1423,7 +1429,7 @@ struct server_slot {
  pos = text.find(word, from_pos);
  } else {
  // otherwise, partial stop
- pos = find_partial_stop_string(word, text);
+ pos = string_find_partial_stop(text, word);
  }

  if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
@@ -1474,7 +1480,7 @@ struct server_slot {
  {"is_processing", is_processing()},
  {"non_causal", is_non_causal()},
  {"params", params.to_json()},
- {"prompt", common_detokenize(ctx, prompt_tokens)},
+ {"prompt", prompt_tokens.detokenize(ctx, true)},
  {"next_token",
  {
  {"has_next_token", has_next_token},
@@ -1847,13 +1853,16 @@ struct server_context {
  llama_model * model = nullptr;
  llama_context * ctx = nullptr;

+ // multimodal
+ mtmd_context * mctx = nullptr;
+
  const llama_vocab * vocab = nullptr;

  llama_model * model_dft = nullptr;

  llama_context_params cparams_dft;

- llama_batch batch = {};
+ llama_batch batch {};

  bool clean_kv_cache = true;
  bool add_bos_token = true;
@@ -1876,6 +1885,8 @@ struct server_context {
  common_chat_templates_ptr chat_templates;

  ~server_context() {
+ mtmd_free(mctx);
+
  // Clear any sampling context
  for (server_slot & slot : slots) {
  common_sampler_free(slot.smpl);
@@ -1963,6 +1974,36 @@ struct server_context {
  chat_templates = common_chat_templates_init(model, "chatml");
  }

+ std::string & mmproj_path = params_base.mmproj.path;
+ if (!mmproj_path.empty()) {
+ mtmd_context_params mparams = mtmd_context_params_default();
+ mparams.use_gpu = params_base.mmproj_use_gpu;
+ mparams.print_timings = false;
+ mparams.n_threads = params_base.cpuparams.n_threads;
+ mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+ mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
+ if (mctx == nullptr) {
+ SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
+ return false;
+ }
+ SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str());
+
+ if (params_base.ctx_shift) {
+ params_base.ctx_shift = false;
+ SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
+ }
+
+ if (params_base.n_cache_reuse) {
+ params_base.n_cache_reuse = 0;
+ SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
+ }
+
+ if (!params_base.speculative.model.path.empty()) {
+ SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal");
+ return false;
+ }
+ }
+
  return true;
  }

@@ -1978,6 +2019,8 @@ struct server_context {
  slot.ctx = ctx;
  slot.n_ctx = n_ctx_slot;
  slot.n_predict = params_base.n_predict;
+ slot.mctx = mctx;
+ slot.cache_tokens.has_mtmd = mctx != nullptr;

  if (model_dft) {
  slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
@@ -2014,8 +2057,6 @@ struct server_context {
  // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
  {
  const int32_t n_batch = llama_n_batch(ctx);
-
- // only a single seq_id per token is needed
  batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
  }

@@ -2052,7 +2093,7 @@ struct server_context {
  }

  // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
- int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);
+ int cur_lcs_len = slot.cache_tokens.get_common_prefix(task.prompt_tokens);

  // fraction of the common subsequence length compared to the current slot's prompt length
  float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
@@ -2094,18 +2135,6 @@ struct server_context {
  return ret;
  }

- bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
- const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
- const int32_t n_vocab = llama_vocab_n_tokens(vocab);
- for (const auto & token : tokens) {
- if (token < 0 || token >= n_vocab) {
- return false;
- }
- }
- return true;
- }
-
  bool launch_slot_with_task(server_slot & slot, server_task && task) {
  slot.reset();
  slot.id_task = task.id;
@@ -2120,8 +2149,7 @@ struct server_context {
  slot.lora = slot.params.lora;
  }

- bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
- if (!can_detokenize) {
+ if (!slot.prompt_tokens.validate(ctx)) {
  send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
  return false;
  }
@@ -2223,6 +2251,14 @@ struct server_context {
  slot.has_next_token = true;
  }

+ // if context shifting is disabled, make sure that we don't run out of context
+ if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+ slot.stop = STOP_TYPE_LIMIT;
+ slot.has_next_token = false;
+
+ SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+ }
+
  // check the limits
  if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
  slot.stop = STOP_TYPE_LIMIT;
@@ -2383,6 +2419,15 @@ struct server_context {
  queue_results.send(std::move(res));
  }

+ // if multimodal is enabled, send an error and return false
+ bool ensure_no_mtmd(const int id_task) {
+ if (mctx) {
+ send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
+ return false;
+ }
+ return true;
+ }
+
  void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
  auto res = std::make_unique<server_task_result_cmpl_partial>();

@@ -2422,7 +2467,7 @@ struct server_context {
  res->content = std::move(slot.generated_text);
  res->tokens = std::move(slot.generated_tokens);
  res->timings = slot.get_timings();
- res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
+ res->prompt = slot.prompt_tokens.detokenize(ctx, true);
  res->response_fields = std::move(slot.params.response_fields);

  res->truncated = slot.truncated;
@@ -2732,6 +2777,10 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_SAVE:
  {
+ if (!ensure_no_mtmd(task.id)) {
+ break;
+ }
+
  int id_slot = task.slot_action.slot_id;
  server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
@@ -2751,7 +2800,8 @@ struct server_context {
  std::string filename = task.slot_action.filename;
  std::string filepath = task.slot_action.filepath;

- const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
+ const llama_tokens & tokens = slot->cache_tokens.get_text_tokens();
+ const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);

  const int64_t t_end = ggml_time_us();
  const double t_save_ms = (t_end - t_start) / 1000.0;
@@ -2768,6 +2818,7 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_RESTORE:
  {
+ if (!ensure_no_mtmd(task.id)) break;
  int id_slot = task.slot_action.slot_id;
  server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
@@ -2786,15 +2837,18 @@ struct server_context {
  std::string filename = task.slot_action.filename;
  std::string filepath = task.slot_action.filepath;

- slot->cache_tokens.resize(slot->n_ctx);
+ llama_tokens tokens;
+ tokens.resize(slot->n_ctx);
  size_t token_count = 0;
- size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+ size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count);
  if (nread == 0) {
- slot->cache_tokens.resize(0);
+ slot->cache_tokens.clear(); // KV may already been invalidated?
  send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
  break;
  }
- slot->cache_tokens.resize(token_count);
+ tokens.resize(token_count);
+ slot->cache_tokens.clear();
+ slot->cache_tokens.insert(tokens);

  const int64_t t_end = ggml_time_us();
  const double t_restore_ms = (t_end - t_start) / 1000.0;
@@ -2811,6 +2865,7 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_ERASE:
  {
+ if (!ensure_no_mtmd(task.id)) break;
  int id_slot = task.slot_action.slot_id;
  server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
@@ -2842,6 +2897,7 @@ struct server_context {
  res->id = task.id;
  queue_results.send(std::move(res));
  } break;
+
  }
  }

@@ -2887,6 +2943,12 @@ struct server_context {
  continue;
  }

+ if (mctx) {
+ // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded
+ // we don't support ctx_shift because an image chunk may contains multiple tokens
+ GGML_ABORT("not supported by multimodal");
+ }
+
  // Shift context
  const int n_keep = slot.params.n_keep + add_bos_token;
  const int n_left = slot.n_past - n_keep;
@@ -2897,12 +2959,16 @@ struct server_context {
  llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
  llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

- if (slot.params.cache_prompt) {
- for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
- slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+ // add generated tokens to cache
+ {
+ llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
+ for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
+ new_tokens[i - n_discard] = new_tokens[i];
  }

- slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+ new_tokens.resize(slot.cache_tokens.size() - n_discard);
+ slot.cache_tokens.clear();
+ slot.cache_tokens.insert(new_tokens);
  }

  slot.n_past -= n_discard;
@@ -2939,10 +3005,7 @@ struct server_context {
  common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);

  slot.n_past += 1;
-
- if (slot.params.cache_prompt) {
- slot.cache_tokens.push_back(slot.sampled);
- }
+ slot.cache_tokens.push_back(slot.sampled);

  SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
  slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
@@ -2980,7 +3043,7 @@ struct server_context {
  SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);

  // print prompt tokens (for debugging)
- if (1) {
+ /*if (1) {
  // first 16 tokens (avoid flooding logs)
  for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
  SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@@ -2990,7 +3053,7 @@ struct server_context {
  for (int i = 0; i < (int) prompt_tokens.size(); i++) {
  SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
  }
- }
+ }*/

  // empty prompt passed -> release the slot and send empty response
  if (prompt_tokens.empty()) {
@@ -3032,21 +3095,27 @@ struct server_context {

  // if input prompt is too big, truncate it
  if (slot.n_prompt_tokens >= slot.n_ctx) {
+ if (mctx) {
+ // we should never reach this
+ GGML_ABORT("not supported by multimodal");
+ }
  const int n_left = slot.n_ctx - slot.params.n_keep;

  const int n_block_size = n_left / 2;
  const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;

+ const llama_tokens & curr_tokens = slot.prompt_tokens.get_text_tokens();
  llama_tokens new_tokens(
- prompt_tokens.begin(),
- prompt_tokens.begin() + slot.params.n_keep);
+ curr_tokens.begin(),
+ curr_tokens.begin() + slot.params.n_keep);

  new_tokens.insert(
  new_tokens.end(),
- prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
- prompt_tokens.end());
+ curr_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+ curr_tokens.end());

- prompt_tokens = std::move(new_tokens);
+ prompt_tokens.clear();
+ prompt_tokens.insert(new_tokens);

  slot.truncated = true;
  slot.n_prompt_tokens = prompt_tokens.size();
@@ -3058,13 +3127,18 @@ struct server_context {

  if (slot.params.cache_prompt) {
  // reuse any previously computed tokens that are common with the new prompt
- slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
+ slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens);

  // reuse chunks from the cached prompt by shifting their KV cache in the new position
  if (params_base.n_cache_reuse > 0) {
  size_t head_c = slot.n_past; // cache
  size_t head_p = slot.n_past; // current prompt

+ if (mctx) {
+ // we should never reach this
+ GGML_ABORT("not supported by multimodal");
+ }
+
  SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);

  while (head_c < slot.cache_tokens.size() &&
@@ -3090,7 +3164,7 @@ struct server_context {
  llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

  for (size_t i = 0; i < n_match; i++) {
- slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
+ slot.cache_tokens.set_token(head_p + i, slot.cache_tokens[head_c + i]);
  slot.n_past++;
  }

@@ -3103,6 +3177,11 @@ struct server_context {

  SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
  }
+ } else {
+ // if we don't cache the prompt, we have to remove the entire KV cache
+ llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
+ slot.n_past = 0;
+ slot.cache_tokens.clear();
  }
  }

@@ -3136,23 +3215,53 @@ struct server_context {
  SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);

  // remove the non-common part from the cache
- slot.cache_tokens.resize(slot.n_past);
+ slot.cache_tokens.keep_first(slot.n_past);
+
+ // check if we should process the image
+ if (slot.n_past < slot.n_prompt_tokens
+ && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
+ // process the image
+ int32_t new_n_past;
+ int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
+ int32_t n_pos = new_n_past - slot.n_past;
+
+ if (res != 0) {
+ SLT_ERR(slot, "failed to process image, res = %d\n", res);
+ slot.release();
+ send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+ continue;
+ }
+
+ // add the image chunk to cache
+ {
+ const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
+ slot.cache_tokens.push_back(chunk.get()); // copy
+ }
+
+ slot.n_past += n_pos;
+ slot.n_prompt_tokens_processed += n_pos;
+ }

  // add prompt tokens for processing in the current batch
  while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+ // get next token to process
+ llama_token cur_tok = slot.prompt_tokens[slot.n_past];
+ if (cur_tok == LLAMA_TOKEN_NULL) {
+ break; // end of text chunk
+ }
+
  // without pooling, we want to output the embeddings for all the tokens in the batch
  const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;

- common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
-
- if (slot.params.cache_prompt) {
- slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
- }
+ common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
+ slot.cache_tokens.push_back(cur_tok);

  slot.n_prompt_tokens_processed++;
  slot.n_past++;
  }

+ // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
+
  SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);

  // entire prompt has been processed
@@ -3160,12 +3269,16 @@ struct server_context {
  slot.state = SLOT_STATE_DONE_PROMPT;

  GGML_ASSERT(batch.n_tokens > 0);
+ GGML_ASSERT((size_t) slot.n_prompt_tokens == slot.prompt_tokens.size());

  common_sampler_reset(slot.smpl);

  // Process all prompt tokens through sampler system
  for (int i = 0; i < slot.n_prompt_tokens; ++i) {
- common_sampler_accept(slot.smpl, prompt_tokens[i], false);
+ llama_token id = slot.prompt_tokens[i];
+ if (id != LLAMA_TOKEN_NULL) {
+ common_sampler_accept(slot.smpl, id, false);
+ }
  }

  // extract the logits only for the last token
@@ -3212,7 +3325,14 @@ struct server_context {
  batch.logits + i,
  };

- const int ret = llama_decode(ctx, batch_view);
+ int ret = 0;
+
+ if (params_base.embedding || params_base.reranking) {
+ ret = llama_encode(ctx, batch_view);
+ } else {
+ ret = llama_decode(ctx, batch_view);
+ }
+
  metrics.on_decoded(slots);

  if (ret != 0) {
@@ -3311,6 +3431,11 @@ struct server_context {
  continue;
  }

+ if (mctx) {
+ // we should never reach this, as speculative is automatically disabled if mmproj is loaded
+ GGML_ABORT("not supported by multimodal");
+ }
+
  // determine the max draft that fits the current slot state
  int n_draft_max = slot.params.speculative.n_max;

@@ -3337,7 +3462,8 @@ struct server_context {
  params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
  params_spec.p_min = slot.params.speculative.p_min;

- llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+ const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
+ llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);

  // keep track of total number of tokens generated in the draft
  slot.n_draft_total += draft.size();
@@ -3371,7 +3497,7 @@ struct server_context {
  slot.n_draft_accepted += ids.size() - 1;

  slot.cache_tokens.push_back(id);
- slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+ slot.cache_tokens.insert({ids.begin(), ids.end() - 1});

  llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

@@ -3589,6 +3715,9 @@ int main(int argc, char ** argv) {
  if (req.path == "/" || tmp.back() == "html") {
  res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
  res.status = 503;
+ } else if (req.path == "/models" || req.path == "/v1/models") {
+ // allow the models endpoint to be accessed during loading
+ return true;
  } else {
  res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
  }
@@ -3894,6 +4023,7 @@ int main(int argc, char ** argv) {
  { "default_generation_settings", ctx_server.default_generation_settings_for_props },
  { "total_slots", ctx_server.params_base.n_parallel },
  { "model_path", ctx_server.params_base.model.path },
+ { "modalities", json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
  { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
  { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
  { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -3941,9 +4071,10 @@ int main(int argc, char ** argv) {
  const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
  server_task_type type,
  json & data,
- std::function<bool()> is_connection_closed,
+ const std::vector<raw_buffer> & files,
+ const std::function<bool()> & is_connection_closed,
  httplib::Response & res,
- oaicompat_type oaicompat) {
+ oaicompat_type oaicompat) -> void {
  GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);

  if (ctx_server.params_base.embedding) {
@@ -3960,15 +4091,69 @@ int main(int argc, char ** argv) {
  // TODO: this log can become very long, put it behind a flag or think about a more compact format
  //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

- std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
- tasks.reserve(tokenized_prompts.size());
- for (size_t i = 0; i < tokenized_prompts.size(); i++) {
+ // process files
+ mtmd::bitmaps bitmaps;
+ const bool has_mtmd = ctx_server.mctx != nullptr;
+ {
+ if (!has_mtmd && !files.empty()) {
+ throw std::runtime_error("This server does not support multimodal");
+ }
+ for (auto & file : files) {
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
+ if (!bmp.ptr) {
+ throw std::runtime_error("Failed to load image");
+ }
+ // calculate bitmap hash (for KV caching)
+ std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+ bmp.set_id(hash.c_str());
+ bitmaps.entries.push_back(std::move(bmp));
+ }
+ }
+
+ // process prompt
+ std::vector<server_tokens> inputs;
+ if (oaicompat && !prompt.is_string()) {
+ throw std::runtime_error("prompt must be a string");
+ }
+
+ if (oaicompat && has_mtmd) {
+ // multimodal
+ std::string prompt_str = prompt.get<std::string>();
+ mtmd_input_text inp_txt = {
+ prompt_str.c_str(),
+ /* add_special */ true,
+ /* parse_special */ true,
+ };
+ mtmd::input_chunks chunks(mtmd_input_chunks_init());
+ auto bitmaps_c_ptr = bitmaps.c_ptr();
+ int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
+ chunks.ptr.get(),
+ &inp_txt,
+ bitmaps_c_ptr.data(),
+ bitmaps_c_ptr.size());
+ if (tokenized != 0) {
+ throw std::runtime_error("Failed to tokenize prompt");
+ }
+
+ server_tokens tmp(chunks, true);
+ inputs.push_back(std::move(tmp));
+ } else {
+ // non-multimodal version
+ auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+ for (auto & p : tokenized_prompts) {
+ auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
+ inputs.push_back(std::move(tmp));
+ }
+ }
+
+ tasks.reserve(inputs.size());
+ for (size_t i = 0; i < inputs.size(); i++) {
  server_task task = server_task(type);

  task.id = ctx_server.queue_tasks.get_new_id();
  task.index = i;

- task.prompt_tokens = std::move(tokenized_prompts[i]);
+ task.prompt_tokens = std::move(inputs[i]);
  task.params = server_task::params_from_json_cmpl(
  ctx_server.ctx,
  ctx_server.params_base,
@@ -4050,9 +4235,11 @@ int main(int argc, char ** argv) {

  const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
  json data = json::parse(req.body);
- return handle_completions_impl(
+ std::vector<raw_buffer> files; // dummy
+ handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
  data,
+ files,
  req.is_connection_closed,
  res,
  OAICOMPAT_TYPE_NONE);
@@ -4060,9 +4247,11 @@ int main(int argc, char ** argv) {

  const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
  json data = oaicompat_completion_params_parse(json::parse(req.body));
- return handle_completions_impl(
+ std::vector<raw_buffer> files; // dummy
+ handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
  data,
+ files,
  req.is_connection_closed,
  res,
  OAICOMPAT_TYPE_COMPLETION);
@@ -4137,9 +4326,11 @@ int main(int argc, char ** argv) {
  tokenized_prompts[0]
  );

- return handle_completions_impl(
+ std::vector<raw_buffer> files; // dummy
+ handle_completions_impl(
  SERVER_TASK_TYPE_INFILL,
  data,
+ files,
  req.is_connection_closed,
  res,
  OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
@@ -4153,11 +4344,20 @@ int main(int argc, char ** argv) {
  }

  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
-
- return handle_completions_impl(
+ std::vector<raw_buffer> files;
+ json data = oaicompat_completion_params_parse(
+ body,
+ params.use_jinja,
+ params.prefill_assistant,
+ params.reasoning_format,
+ ctx_server.chat_templates.get(),
+ ctx_server.mctx,
+ files);
+
+ handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
  data,
+ files,
  req.is_connection_closed,
  res,
  OAICOMPAT_TYPE_CHAT);
@@ -4166,11 +4366,25 @@ int main(int argc, char ** argv) {
  // same with handle_chat_completions, but without inference part
  const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
+ std::vector<raw_buffer> files; // dummy, unused
+ json data = oaicompat_completion_params_parse(
+ body,
+ params.use_jinja,
+ params.prefill_assistant,
+ params.reasoning_format,
+ ctx_server.chat_templates.get(),
+ ctx_server.mctx,
+ files);
  res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
  };

- const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+ const auto handle_models = [&params, &ctx_server, &state, &res_ok](const httplib::Request &, httplib::Response & res) {
+ server_state current_state = state.load();
+ json model_meta = nullptr;
+ if (current_state == SERVER_STATE_READY) {
+ model_meta = ctx_server.model_meta();
+ }
+
  json models = {
  {"object", "list"},
  {"data", {
@@ -4179,7 +4393,7 @@ int main(int argc, char ** argv) {
  {"object", "model"},
  {"created", std::time(0)},
  {"owned_by", "llamacpp"},
- {"meta", ctx_server.model_meta()}
+ {"meta", model_meta},
  },
  }}
  };
@@ -4271,7 +4485,7 @@ int main(int argc, char ** argv) {
  }
  }

- std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+ auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
  for (const auto & tokens : tokenized_prompts) {
  // this check is necessary for models that do not add BOS token to the input
  if (tokens.empty()) {
@@ -4291,7 +4505,7 @@ int main(int argc, char ** argv) {

  task.id = ctx_server.queue_tasks.get_new_id();
  task.index = i;
- task.prompt_tokens = std::move(tokenized_prompts[i]);
+ task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);

  // OAI-compat
  task.params.oaicompat = oaicompat;
@@ -4385,13 +4599,14 @@ int main(int argc, char ** argv) {
  std::unordered_set<int> task_ids;
  {
  std::vector<server_task> tasks;
- std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+ auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
  tasks.reserve(tokenized_docs.size());
  for (size_t i = 0; i < tokenized_docs.size(); i++) {
+ auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
  server_task task = server_task(SERVER_TASK_TYPE_RERANK);
  task.id = ctx_server.queue_tasks.get_new_id();
  task.index = i;
- task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
+ task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
  tasks.push_back(std::move(task));
  }

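
Much of the server.cpp diff above revolves around the new server_tokens type (introduced in the reworked tools/server/utils.hpp, entry 153 in the file list), which replaces plain llama_tokens for prompts and the per-slot cache. A heavily simplified sketch of the idea, based only on the behaviour visible in the hunks and not the package's actual implementation: text tokens are stored directly, while positions covered by a multimodal chunk hold LLAMA_TOKEN_NULL, which is why the prompt loop above breaks on LLAMA_TOKEN_NULL and hands those positions to mtmd.

// Simplified, illustrative sketch only; the real server_tokens in utils.hpp also
// tracks the mtmd chunks themselves (find_chunk/process_chunk) and validates ids.
#include <cstddef>
#include <cstdint>
#include <vector>

using llama_token  = int32_t;
using llama_tokens = std::vector<llama_token>;
constexpr llama_token LLAMA_TOKEN_NULL = -1;   // placeholder for positions owned by an image chunk

struct server_tokens_sketch {
    bool         has_mtmd = false;  // set when an mtmd (multimodal) context is loaded
    llama_tokens tokens;            // LLAMA_TOKEN_NULL marks image-chunk positions

    size_t      size() const               { return tokens.size(); }
    llama_token operator[](size_t i) const { return tokens[i]; }

    void push_back(llama_token t)       { tokens.push_back(t); }
    void insert(const llama_tokens & t) { tokens.insert(tokens.end(), t.begin(), t.end()); }
    void keep_first(size_t n)           { tokens.resize(n); }
    void clear()                        { tokens.clear(); }

    // longest common prefix with another prompt (text tokens compared directly;
    // the real implementation compares image chunks by their bitmap hash/id)
    size_t get_common_prefix(const server_tokens_sketch & other) const {
        size_t n = 0;
        while (n < size() && n < other.size() && tokens[n] == other.tokens[n]) {
            n++;
        }
        return n;
    }
};

Keeping one placeholder per image position keeps n_past, prompt caching, and the common-prefix logic position-based, which is what lets the existing slot-reuse code in the hunks above keep working with mixed text and image prompts.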