@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -7,6 +7,7 @@
  #include "log.h"
  #include "sampling.h"
  #include "speculative.h"
+ #include "mtmd.h"

  // Change JSON_ASSERT from assert() to GGML_ASSERT:
  #define JSON_ASSERT GGML_ASSERT
@@ -133,7 +134,8 @@ struct slot_params {

  auto grammar_triggers = json::array();
  for (const auto & trigger : sampling.grammar_triggers) {
- grammar_triggers.push_back(trigger.to_json<json>());
+ server_grammar_trigger ct(std::move(trigger));
+ grammar_triggers.push_back(ct.to_json());
  }

  return json {
@@ -145,6 +147,7 @@ struct slot_params {
  {"top_k", sampling.top_k},
  {"top_p", sampling.top_p},
  {"min_p", sampling.min_p},
+ {"top_n_sigma", sampling.top_n_sigma},
  {"xtc_probability", sampling.xtc_probability},
  {"xtc_threshold", sampling.xtc_threshold},
  {"typical_p", sampling.typ_p},
@@ -195,8 +198,8 @@ struct server_task {
  int id_target = -1;

  // used by SERVER_TASK_TYPE_INFERENCE
- slot_params params;
- llama_tokens prompt_tokens;
+ slot_params params;
+ server_tokens prompt_tokens;
  int id_selected_slot = -1;

  // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
@@ -247,6 +250,7 @@ struct server_task {
  params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
  params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
  params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
+ params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma);
  params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
  params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
  params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
@@ -372,9 +376,9 @@ struct server_task {
  const auto grammar_triggers = data.find("grammar_triggers");
  if (grammar_triggers != data.end()) {
  for (const auto & t : *grammar_triggers) {
- auto ct = common_grammar_trigger::from_json(t);
- if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
- const auto & word = ct.value;
+ server_grammar_trigger ct(t);
+ if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+ const auto & word = ct.value.value;
  auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
  if (ids.size() == 1) {
  auto token = ids[0];
@@ -392,7 +396,7 @@ struct server_task {
  params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
  }
  } else {
- params.sampling.grammar_triggers.push_back(ct);
+ params.sampling.grammar_triggers.push_back(std::move(ct.value));
  }
  }
  }
@@ -489,8 +493,12 @@ struct result_timings {
  double predicted_per_token_ms;
  double predicted_per_second;

+ // Optional speculative metrics - only included when > 0
+ int32_t draft_n = 0;
+ int32_t draft_n_accepted = 0;
+
  json to_json() const {
- return {
+ json base = {
  {"prompt_n", prompt_n},
  {"prompt_ms", prompt_ms},
  {"prompt_per_token_ms", prompt_per_token_ms},
@@ -501,6 +509,13 @@ struct result_timings {
  {"predicted_per_token_ms", predicted_per_token_ms},
  {"predicted_per_second", predicted_per_second},
  };
+
+ if (draft_n > 0) {
+ base["draft_n"] = draft_n;
+ base["draft_n_accepted"] = draft_n_accepted;
+ }
+
+ return base;
  }
  };

@@ -1234,6 +1249,9 @@ struct server_slot {
  llama_context * ctx = nullptr;
  llama_context * ctx_dft = nullptr;

+ // multimodal
+ mtmd_context * mctx = nullptr;
+
  common_speculative * spec = nullptr;

  std::vector<common_adapter_lora_info> lora;
@@ -1261,14 +1279,14 @@ struct server_slot {
  int32_t n_prompt_tokens_processed = 0;

  // input prompt tokens
- llama_tokens prompt_tokens;
+ server_tokens prompt_tokens;

  size_t last_nl_pos = 0;

  std::string generated_text;
  llama_tokens generated_tokens;

- llama_tokens cache_tokens;
+ server_tokens cache_tokens;

  std::vector<completion_token_output> generated_token_probs;

@@ -1299,6 +1317,10 @@ struct server_slot {

  std::function<void(int)> callback_on_release;

+ // Speculative decoding stats
+ int32_t n_draft_total = 0; // Total draft tokens generated
+ int32_t n_draft_accepted = 0; // Draft tokens actually accepted
+
  void reset() {
  SLT_DBG(*this, "%s", "\n");

@@ -1315,6 +1337,10 @@ struct server_slot {

  generated_tokens.clear();
  generated_token_probs.clear();
+
+ // clear speculative decoding stats
+ n_draft_total = 0;
+ n_draft_accepted = 0;
  }

  bool is_non_causal() const {
@@ -1381,6 +1407,12 @@ struct server_slot {
  timings.predicted_per_token_ms = t_token_generation / n_decoded;
  timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;

+ // Add speculative metrics
+ if (n_draft_total > 0) {
+ timings.draft_n = n_draft_total;
+ timings.draft_n_accepted = n_draft_accepted;
+ }
+
  return timings;
  }

@@ -1397,7 +1429,7 @@ struct server_slot {
  pos = text.find(word, from_pos);
  } else {
  // otherwise, partial stop
- pos = find_partial_stop_string(word, text);
+ pos = string_find_partial_stop(text, word);
  }

  if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
@@ -1428,6 +1460,15 @@ struct server_slot {
  t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
  t_token_generation, n_decoded, t_gen, n_gen_second,
  t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
+
+ if (n_draft_total > 0) {
+ const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+ SLT_INF(*this,
+ "\n"
+ "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
+ draft_ratio, n_draft_accepted, n_draft_total
+ );
+ }
  }

  json to_json() const {
@@ -1439,7 +1480,7 @@ struct server_slot {
  {"is_processing", is_processing()},
  {"non_causal", is_non_causal()},
  {"params", params.to_json()},
- {"prompt", common_detokenize(ctx, prompt_tokens)},
+ {"prompt", prompt_tokens.detokenize(ctx, true)},
  {"next_token",
  {
  {"has_next_token", has_next_token},
@@ -1517,29 +1558,30 @@ struct server_queue {
  std::condition_variable condition_tasks;

  // callback functions
- std::function<void(server_task)> callback_new_task;
- std::function<void(void)> callback_update_slots;
+ std::function<void(server_task &&)> callback_new_task;
+ std::function<void(void)> callback_update_slots;

  // Add a new task to the end of the queue
- int post(server_task task, bool front = false) {
+ int post(server_task && task, bool front = false) {
  std::unique_lock<std::mutex> lock(mutex_tasks);
  GGML_ASSERT(task.id != -1);
  // if this is cancel task make sure to clean up pending tasks
  if (task.type == SERVER_TASK_TYPE_CANCEL) {
  cleanup_pending_task(task.id_target);
  }
- QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
+ const int task_id = task.id;
+ QUE_DBG("new task, id = %d, front = %d\n", task_id, front);
  if (front) {
  queue_tasks.push_front(std::move(task));
  } else {
  queue_tasks.push_back(std::move(task));
  }
  condition_tasks.notify_one();
- return task.id;
+ return task_id;
  }

  // multi-task version of post()
- int post(std::vector<server_task> & tasks, bool front = false) {
+ int post(std::vector<server_task> && tasks, bool front = false) {
  std::unique_lock<std::mutex> lock(mutex_tasks);
  for (auto & task : tasks) {
  if (task.id == -1) {
@@ -1561,7 +1603,7 @@ struct server_queue {
  }

  // Add a new task, but defer until one slot is available
- void defer(server_task task) {
+ void defer(server_task && task) {
  std::unique_lock<std::mutex> lock(mutex_tasks);
  QUE_DBG("defer task, id = %d\n", task.id);
  queue_tasks_deferred.push_back(std::move(task));
@@ -1576,7 +1618,7 @@ struct server_queue {
  }

  // Register function to process a new task
- void on_new_task(std::function<void(server_task)> callback) {
+ void on_new_task(std::function<void(server_task &&)> callback) {
  callback_new_task = std::move(callback);
  }

@@ -1625,7 +1667,7 @@ struct server_queue {
  lock.unlock();
  break;
  }
- server_task task = queue_tasks.front();
+ server_task task = std::move(queue_tasks.front());
  queue_tasks.pop_front();
  lock.unlock();

@@ -1670,6 +1712,8 @@ private:
  };

  struct server_response {
+ bool running = true;
+
  // for keeping track of all tasks waiting for the result
  std::unordered_set<int> waiting_task_ids;

@@ -1724,6 +1768,10 @@ struct server_response {
  while (true) {
  std::unique_lock<std::mutex> lock(mutex_results);
  condition_results.wait(lock, [&]{
+ if (!running) {
+ SRV_DBG("%s : queue result stop\n", __func__);
+ std::terminate(); // we cannot return here since the caller is HTTP code
+ }
  return !queue_results.empty();
  });

@@ -1754,6 +1802,10 @@ struct server_response {
  }

  std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+ if (!running) {
+ SRV_DBG("%s : queue result stop\n", __func__);
+ std::terminate(); // we cannot return here since the caller is HTTP code
+ }
  if (cr_res == std::cv_status::timeout) {
  return nullptr;
  }
@@ -1783,6 +1835,12 @@ struct server_response {
  }
  }
  }
+
+ // terminate the waiting loop
+ void terminate() {
+ running = false;
+ condition_results.notify_all();
+ }
  };

  struct server_context {
@@ -1795,13 +1853,16 @@ struct server_context {
  llama_model * model = nullptr;
  llama_context * ctx = nullptr;

+ // multimodal
+ mtmd_context * mctx = nullptr;
+
  const llama_vocab * vocab = nullptr;

  llama_model * model_dft = nullptr;

  llama_context_params cparams_dft;

- llama_batch batch = {};
+ llama_batch batch {};

  bool clean_kv_cache = true;
  bool add_bos_token = true;
@@ -1824,6 +1885,8 @@ struct server_context {
  common_chat_templates_ptr chat_templates;

  ~server_context() {
+ mtmd_free(mctx);
+
  // Clear any sampling context
  for (server_slot & slot : slots) {
  common_sampler_free(slot.smpl);
@@ -1842,7 +1905,7 @@ struct server_context {
  }

  bool load_model(const common_params & params) {
- SRV_INF("loading model '%s'\n", params.model.c_str());
+ SRV_INF("loading model '%s'\n", params.model.path.c_str());

  params_base = params;

@@ -1852,7 +1915,7 @@ struct server_context {
  ctx = llama_init.context.get();

  if (model == nullptr) {
- SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
+ SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
  return false;
  }

@@ -1863,16 +1926,13 @@ struct server_context {
  add_bos_token = llama_vocab_get_add_bos(vocab);
  has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

- if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
- SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+ if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
+ SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());

  auto params_dft = params_base;

  params_dft.devices = params_base.speculative.devices;
- params_dft.hf_file = params_base.speculative.hf_file;
- params_dft.hf_repo = params_base.speculative.hf_repo;
  params_dft.model = params_base.speculative.model;
- params_dft.model_url = params_base.speculative.model_url;
  params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
  params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
  params_dft.n_parallel = 1;
@@ -1886,12 +1946,12 @@ struct server_context {
  model_dft = llama_init_dft.model.get();

  if (model_dft == nullptr) {
- SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+ SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
  return false;
  }

  if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
- SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+ SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());

  return false;
  }
@@ -1914,6 +1974,36 @@ struct server_context {
  chat_templates = common_chat_templates_init(model, "chatml");
  }

+ std::string & mmproj_path = params_base.mmproj.path;
+ if (!mmproj_path.empty()) {
+ mtmd_context_params mparams = mtmd_context_params_default();
+ mparams.use_gpu = params_base.mmproj_use_gpu;
+ mparams.print_timings = false;
+ mparams.n_threads = params_base.cpuparams.n_threads;
+ mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+ mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
+ if (mctx == nullptr) {
+ SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
+ return false;
+ }
+ SRV_INF("loaded multimodal model, '%s'\n", mmproj_path.c_str());
+
+ if (params_base.ctx_shift) {
+ params_base.ctx_shift = false;
+ SRV_WRN("%s\n", "ctx_shift is not supported by multimodal, it will be disabled");
+ }
+
+ if (params_base.n_cache_reuse) {
+ params_base.n_cache_reuse = 0;
+ SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
+ }
+
+ if (!params_base.speculative.model.path.empty()) {
+ SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal");
+ return false;
+ }
+ }
+
  return true;
  }

@@ -1929,6 +2019,8 @@ struct server_context {
  slot.ctx = ctx;
  slot.n_ctx = n_ctx_slot;
  slot.n_predict = params_base.n_predict;
+ slot.mctx = mctx;
+ slot.cache_tokens.has_mtmd = mctx != nullptr;

  if (model_dft) {
  slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
@@ -1956,7 +2048,7 @@ struct server_context {

  slot.reset();

- slots.push_back(slot);
+ slots.push_back(std::move(slot));
  }

  default_generation_settings_for_props = slots[0].to_json();
@@ -1965,8 +2057,6 @@ struct server_context {
  // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
  {
  const int32_t n_batch = llama_n_batch(ctx);
-
- // only a single seq_id per token is needed
  batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
  }

@@ -2003,7 +2093,7 @@ struct server_context {
  }

  // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
- int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);
+ int cur_lcs_len = slot.cache_tokens.get_common_prefix(task.prompt_tokens);

  // fraction of the common subsequence length compared to the current slot's prompt length
  float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
@@ -2045,19 +2135,7 @@ struct server_context {
  return ret;
  }

- bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
- const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
- const int32_t n_vocab = llama_vocab_n_tokens(vocab);
- for (const auto & token : tokens) {
- if (token < 0 || token >= n_vocab) {
- return false;
- }
- }
- return true;
- }
-
- bool launch_slot_with_task(server_slot & slot, const server_task & task) {
+ bool launch_slot_with_task(server_slot & slot, server_task && task) {
  slot.reset();
  slot.id_task = task.id;
  slot.index = task.index;
@@ -2065,14 +2143,13 @@ struct server_context {
  slot.params = std::move(task.params);
  slot.prompt_tokens = std::move(task.prompt_tokens);

- if (!are_lora_equal(task.params.lora, slot.lora)) {
+ if (!are_lora_equal(slot.params.lora, slot.lora)) {
  // if lora is changed, we cannot reuse cached tokens
  slot.cache_tokens.clear();
- slot.lora = task.params.lora;
+ slot.lora = slot.params.lora;
  }

- bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
- if (!can_detokenize) {
+ if (!slot.prompt_tokens.validate(ctx)) {
  send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
  return false;
  }
@@ -2174,6 +2251,14 @@ struct server_context {
  slot.has_next_token = true;
  }

+ // if context shifting is disabled, make sure that we don't run out of context
+ if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+ slot.stop = STOP_TYPE_LIMIT;
+ slot.has_next_token = false;
+
+ SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+ }
+
  // check the limits
  if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
  slot.stop = STOP_TYPE_LIMIT;
@@ -2334,6 +2419,15 @@ struct server_context {
  queue_results.send(std::move(res));
  }

+ // if multimodal is enabled, send an error and return false
+ bool ensure_no_mtmd(const int id_task) {
+ if (mctx) {
+ send_error(id_task, "This feature is not supported by multimodal", ERROR_TYPE_NOT_SUPPORTED);
+ return false;
+ }
+ return true;
+ }
+
  void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
  auto res = std::make_unique<server_task_result_cmpl_partial>();

@@ -2373,7 +2467,7 @@ struct server_context {
  res->content = std::move(slot.generated_text);
  res->tokens = std::move(slot.generated_tokens);
  res->timings = slot.get_timings();
- res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
+ res->prompt = slot.prompt_tokens.detokenize(ctx, true);
  res->response_fields = std::move(slot.params.response_fields);

  res->truncated = slot.truncated;
@@ -2499,10 +2593,10 @@ struct server_context {
  server_task task(SERVER_TASK_TYPE_CANCEL);
  task.id_target = id_task;
  queue_results.remove_waiting_task_id(id_task);
- cancel_tasks.push_back(task);
+ cancel_tasks.push_back(std::move(task));
  }
  // push to beginning of the queue, so it has highest priority
- queue_tasks.post(cancel_tasks, true);
+ queue_tasks.post(std::move(cancel_tasks), true);
  }

  // receive the results from task(s)
@@ -2589,7 +2683,7 @@ struct server_context {
  // Functions to process the task
  //

- void process_single_task(server_task task) {
+ void process_single_task(server_task && task) {
  switch (task.type) {
  case SERVER_TASK_TYPE_COMPLETION:
  case SERVER_TASK_TYPE_INFILL:
@@ -2603,17 +2697,17 @@ struct server_context {
  if (slot == nullptr) {
  // if no slot is available, we defer this task for processing later
  SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer(std::move(task));
  break;
  }
  if (slot->is_processing()) {
  // if requested slot is unavailable, we defer this task for processing later
  SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer(std::move(task));
  break;
  }

- if (!launch_slot_with_task(*slot, task)) {
+ if (!launch_slot_with_task(*slot, std::move(task))) {
  SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
  break;
  }
@@ -2683,6 +2777,10 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_SAVE:
  {
+ if (!ensure_no_mtmd(task.id)) {
+ break;
+ }
+
  int id_slot = task.slot_action.slot_id;
  server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
@@ -2692,7 +2790,7 @@ struct server_context {
  if (slot->is_processing()) {
  // if requested slot is unavailable, we defer this task for processing later
  SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer(std::move(task));
  break;
  }

@@ -2702,7 +2800,8 @@ struct server_context {
  std::string filename = task.slot_action.filename;
  std::string filepath = task.slot_action.filepath;

- const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
+ const llama_tokens & tokens = slot->cache_tokens.get_text_tokens();
+ const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);

  const int64_t t_end = ggml_time_us();
  const double t_save_ms = (t_end - t_start) / 1000.0;
@@ -2719,6 +2818,7 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_RESTORE:
  {
+ if (!ensure_no_mtmd(task.id)) break;
  int id_slot = task.slot_action.slot_id;
  server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
@@ -2728,7 +2828,7 @@ struct server_context {
  if (slot->is_processing()) {
  // if requested slot is unavailable, we defer this task for processing later
  SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer(std::move(task));
  break;
  }

@@ -2737,15 +2837,18 @@ struct server_context {
  std::string filename = task.slot_action.filename;
  std::string filepath = task.slot_action.filepath;

- slot->cache_tokens.resize(slot->n_ctx);
+ llama_tokens tokens;
+ tokens.resize(slot->n_ctx);
  size_t token_count = 0;
- size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+ size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, tokens.data(), tokens.size(), &token_count);
  if (nread == 0) {
- slot->cache_tokens.resize(0);
+ slot->cache_tokens.clear(); // KV may already been invalidated?
  send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
  break;
  }
- slot->cache_tokens.resize(token_count);
+ tokens.resize(token_count);
+ slot->cache_tokens.clear();
+ slot->cache_tokens.insert(tokens);

  const int64_t t_end = ggml_time_us();
  const double t_restore_ms = (t_end - t_start) / 1000.0;
@@ -2762,6 +2865,7 @@ struct server_context {
  } break;
  case SERVER_TASK_TYPE_SLOT_ERASE:
  {
+ if (!ensure_no_mtmd(task.id)) break;
  int id_slot = task.slot_action.slot_id;
  server_slot * slot = get_slot_by_id(id_slot);
  if (slot == nullptr) {
@@ -2771,7 +2875,7 @@ struct server_context {
  if (slot->is_processing()) {
  // if requested slot is unavailable, we defer this task for processing later
  SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
- queue_tasks.defer(task);
+ queue_tasks.defer(std::move(task));
  break;
  }

@@ -2793,6 +2897,7 @@ struct server_context {
  res->id = task.id;
  queue_results.send(std::move(res));
  } break;
+
  }
  }

@@ -2823,7 +2928,7 @@ struct server_context {

  server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE);
  task.id = queue_tasks.get_new_id();
- queue_tasks.post(task);
+ queue_tasks.post(std::move(task));
  }

  // apply context-shift if needed
@@ -2838,6 +2943,12 @@ struct server_context {
  continue;
  }

+ if (mctx) {
+ // we should never reach this because params_base.ctx_shift is automatically disabled if mmproj is loaded
+ // we don't support ctx_shift because an image chunk may contains multiple tokens
+ GGML_ABORT("not supported by multimodal");
+ }
+
  // Shift context
  const int n_keep = slot.params.n_keep + add_bos_token;
  const int n_left = slot.n_past - n_keep;
@@ -2848,12 +2959,16 @@ struct server_context {
  llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
  llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

- if (slot.params.cache_prompt) {
- for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
- slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+ // add generated tokens to cache
+ {
+ llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
+ for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
+ new_tokens[i - n_discard] = new_tokens[i];
  }

- slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+ new_tokens.resize(slot.cache_tokens.size() - n_discard);
+ slot.cache_tokens.clear();
+ slot.cache_tokens.insert(new_tokens);
  }

  slot.n_past -= n_discard;
@@ -2890,10 +3005,7 @@ struct server_context {
  common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);

  slot.n_past += 1;
-
- if (slot.params.cache_prompt) {
- slot.cache_tokens.push_back(slot.sampled);
- }
+ slot.cache_tokens.push_back(slot.sampled);

  SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
  slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
@@ -2931,7 +3043,7 @@ struct server_context {
  SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);

  // print prompt tokens (for debugging)
- if (1) {
+ /*if (1) {
  // first 16 tokens (avoid flooding logs)
  for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
  SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@@ -2941,7 +3053,7 @@ struct server_context {
  for (int i = 0; i < (int) prompt_tokens.size(); i++) {
  SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
  }
- }
+ }*/

  // empty prompt passed -> release the slot and send empty response
  if (prompt_tokens.empty()) {
@@ -2983,21 +3095,27 @@ struct server_context {

  // if input prompt is too big, truncate it
  if (slot.n_prompt_tokens >= slot.n_ctx) {
+ if (mctx) {
+ // we should never reach this
+ GGML_ABORT("not supported by multimodal");
+ }
  const int n_left = slot.n_ctx - slot.params.n_keep;

  const int n_block_size = n_left / 2;
  const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;

+ const llama_tokens & curr_tokens = slot.prompt_tokens.get_text_tokens();
  llama_tokens new_tokens(
- prompt_tokens.begin(),
- prompt_tokens.begin() + slot.params.n_keep);
+ curr_tokens.begin(),
+ curr_tokens.begin() + slot.params.n_keep);

  new_tokens.insert(
  new_tokens.end(),
- prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
- prompt_tokens.end());
+ curr_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+ curr_tokens.end());

- prompt_tokens = std::move(new_tokens);
+ prompt_tokens.clear();
+ prompt_tokens.insert(new_tokens);

  slot.truncated = true;
  slot.n_prompt_tokens = prompt_tokens.size();
@@ -3009,13 +3127,18 @@ struct server_context {

  if (slot.params.cache_prompt) {
  // reuse any previously computed tokens that are common with the new prompt
- slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
+ slot.n_past = slot.cache_tokens.get_common_prefix(prompt_tokens);

  // reuse chunks from the cached prompt by shifting their KV cache in the new position
  if (params_base.n_cache_reuse > 0) {
  size_t head_c = slot.n_past; // cache
  size_t head_p = slot.n_past; // current prompt

+ if (mctx) {
+ // we should never reach this
+ GGML_ABORT("not supported by multimodal");
+ }
+
  SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);

  while (head_c < slot.cache_tokens.size() &&
@@ -3041,7 +3164,7 @@ struct server_context {
  llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

  for (size_t i = 0; i < n_match; i++) {
- slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
+ slot.cache_tokens.set_token(head_p + i, slot.cache_tokens[head_c + i]);
  slot.n_past++;
  }

@@ -3054,6 +3177,11 @@ struct server_context {

  SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
  }
+ } else {
+ // if we don't cache the prompt, we have to remove the entire KV cache
+ llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
+ slot.n_past = 0;
+ slot.cache_tokens.clear();
  }
  }

@@ -3087,23 +3215,53 @@ struct server_context {
  SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);

  // remove the non-common part from the cache
- slot.cache_tokens.resize(slot.n_past);
+ slot.cache_tokens.keep_first(slot.n_past);
+
+ // check if we should process the image
+ if (slot.n_past < slot.n_prompt_tokens
+ && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
+ // process the image
+ int32_t new_n_past;
+ int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
+ int32_t n_pos = new_n_past - slot.n_past;
+
+ if (res != 0) {
+ SLT_ERR(slot, "failed to process image, res = %d\n", res);
+ slot.release();
+ send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
+ continue;
+ }
+
+ // add the image chunk to cache
+ {
+ const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
+ slot.cache_tokens.push_back(chunk.get()); // copy
+ }
+
+ slot.n_past += n_pos;
+ slot.n_prompt_tokens_processed += n_pos;
+ }

  // add prompt tokens for processing in the current batch
  while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+ // get next token to process
+ llama_token cur_tok = slot.prompt_tokens[slot.n_past];
+ if (cur_tok == LLAMA_TOKEN_NULL) {
+ break; // end of text chunk
+ }
+
  // without pooling, we want to output the embeddings for all the tokens in the batch
  const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;

- common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
-
- if (slot.params.cache_prompt) {
- slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
- }
+ common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
+ slot.cache_tokens.push_back(cur_tok);

  slot.n_prompt_tokens_processed++;
  slot.n_past++;
  }

+ // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
+
  SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);

  // entire prompt has been processed
@@ -3111,12 +3269,16 @@ struct server_context {
  slot.state = SLOT_STATE_DONE_PROMPT;

  GGML_ASSERT(batch.n_tokens > 0);
+ GGML_ASSERT((size_t) slot.n_prompt_tokens == slot.prompt_tokens.size());

  common_sampler_reset(slot.smpl);

  // Process all prompt tokens through sampler system
  for (int i = 0; i < slot.n_prompt_tokens; ++i) {
- common_sampler_accept(slot.smpl, prompt_tokens[i], false);
+ llama_token id = slot.prompt_tokens[i];
+ if (id != LLAMA_TOKEN_NULL) {
+ common_sampler_accept(slot.smpl, id, false);
+ }
  }

  // extract the logits only for the last token
@@ -3163,7 +3325,14 @@ struct server_context {
  batch.logits + i,
  };

- const int ret = llama_decode(ctx, batch_view);
+ int ret = 0;
+
+ if (params_base.embedding || params_base.reranking) {
+ ret = llama_encode(ctx, batch_view);
+ } else {
+ ret = llama_decode(ctx, batch_view);
+ }
+
  metrics.on_decoded(slots);

  if (ret != 0) {
@@ -3262,6 +3431,11 @@ struct server_context {
  continue;
  }

+ if (mctx) {
+ // we should never reach this, as speculative is automatically disabled if mmproj is loaded
+ GGML_ABORT("not supported by multimodal");
+ }
+
  // determine the max draft that fits the current slot state
  int n_draft_max = slot.params.speculative.n_max;

@@ -3288,7 +3462,11 @@ struct server_context {
  params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
  params_spec.p_min = slot.params.speculative.p_min;

- llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+ const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
+ llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
+
+ // keep track of total number of tokens generated in the draft
+ slot.n_draft_total += draft.size();

  // ignore small drafts
  if (slot.params.speculative.n_min > (int) draft.size()) {
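The hunk above adds `slot.n_draft_total` (tokens proposed by the draft model); the next hunk adds `slot.n_draft_accepted` (how many of those the target model kept). Together they enable an acceptance-rate statistic. A minimal TypeScript sketch of that arithmetic, with illustrative names that are not part of the server API:

```ts
// Illustrative only: the ratio the new n_draft_total / n_draft_accepted counters feed.
interface DraftCounters {
  nDraftTotal: number;    // tokens proposed by the draft model
  nDraftAccepted: number; // proposed tokens the target model accepted
}

function draftAcceptanceRate(c: DraftCounters): number {
  // guard against division by zero when no drafts were generated
  return c.nDraftTotal > 0 ? c.nDraftAccepted / c.nDraftTotal : 0;
}

// e.g. 96 accepted out of 128 drafted -> 0.75
console.log(draftAcceptanceRate({ nDraftTotal: 128, nDraftAccepted: 96 }));
```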
@@ -3315,8 +3493,11 @@ struct server_context {
  slot.n_past += ids.size();
  slot.n_decoded += ids.size();

+ // update how many tokens out of draft was accepted
+ slot.n_draft_accepted += ids.size() - 1;
+
  slot.cache_tokens.push_back(id);
- slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+ slot.cache_tokens.insert({ids.begin(), ids.end() - 1});

  llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);

@@ -3534,6 +3715,9 @@ int main(int argc, char ** argv) {
  if (req.path == "/" || tmp.back() == "html") {
  res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
  res.status = 503;
+ } else if (req.path == "/models" || req.path == "/v1/models") {
+ // allow the models endpoint to be accessed during loading
+ return true;
  } else {
  res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
  }
@@ -3579,14 +3763,17 @@ int main(int argc, char ** argv) {
  }

  // request slots data using task queue
- server_task task(SERVER_TASK_TYPE_METRICS);
- task.id = ctx_server.queue_tasks.get_new_id();
- ctx_server.queue_results.add_waiting_task_id(task.id);
- ctx_server.queue_tasks.post(task, true); // high-priority task
+ int task_id = ctx_server.queue_tasks.get_new_id();
+ {
+ server_task task(SERVER_TASK_TYPE_METRICS);
+ task.id = task_id;
+ ctx_server.queue_results.add_waiting_task_id(task_id);
+ ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
+ }

  // get the result
- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
- ctx_server.queue_results.remove_waiting_task_id(task.id);
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(task_id);

  if (result->is_error()) {
  res_error(res, result->to_json());
@@ -3615,16 +3802,17 @@ int main(int argc, char ** argv) {
  }

  // request slots data using task queue
- server_task task(SERVER_TASK_TYPE_METRICS);
- task.id = ctx_server.queue_tasks.get_new_id();
- task.metrics_reset_bucket = true;
-
- ctx_server.queue_results.add_waiting_task_id(task.id);
- ctx_server.queue_tasks.post(task, true); // high-priority task
+ int task_id = ctx_server.queue_tasks.get_new_id();
+ {
+ server_task task(SERVER_TASK_TYPE_METRICS);
+ task.id = task_id;
+ ctx_server.queue_results.add_waiting_task_id(task_id);
+ ctx_server.queue_tasks.post(std::move(task), true); // high-priority task
+ }

  // get the result
- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
- ctx_server.queue_results.remove_waiting_task_id(task.id);
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(task_id);

  if (result->is_error()) {
  res_error(res, result->to_json());
@@ -3721,17 +3909,20 @@ int main(int argc, char ** argv) {
  }
  std::string filepath = params.slot_save_path + filename;

- server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
- task.id = ctx_server.queue_tasks.get_new_id();
- task.slot_action.slot_id = id_slot;
- task.slot_action.filename = filename;
- task.slot_action.filepath = filepath;
+ int task_id = ctx_server.queue_tasks.get_new_id();
+ {
+ server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
+ task.id = task_id;
+ task.slot_action.slot_id = id_slot;
+ task.slot_action.filename = filename;
+ task.slot_action.filepath = filepath;

- ctx_server.queue_results.add_waiting_task_id(task.id);
- ctx_server.queue_tasks.post(task);
+ ctx_server.queue_results.add_waiting_task_id(task_id);
+ ctx_server.queue_tasks.post(std::move(task));
+ }

- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
- ctx_server.queue_results.remove_waiting_task_id(task.id);
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(task_id);

  if (result->is_error()) {
  res_error(res, result->to_json());
@@ -3750,17 +3941,20 @@ int main(int argc, char ** argv) {
  }
  std::string filepath = params.slot_save_path + filename;

- server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
- task.id = ctx_server.queue_tasks.get_new_id();
- task.slot_action.slot_id = id_slot;
- task.slot_action.filename = filename;
- task.slot_action.filepath = filepath;
+ int task_id = ctx_server.queue_tasks.get_new_id();
+ {
+ server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
+ task.id = task_id;
+ task.slot_action.slot_id = id_slot;
+ task.slot_action.filename = filename;
+ task.slot_action.filepath = filepath;

- ctx_server.queue_results.add_waiting_task_id(task.id);
- ctx_server.queue_tasks.post(task);
+ ctx_server.queue_results.add_waiting_task_id(task_id);
+ ctx_server.queue_tasks.post(std::move(task));
+ }

- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
- ctx_server.queue_results.remove_waiting_task_id(task.id);
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(task_id);

  if (result->is_error()) {
  res_error(res, result->to_json());
@@ -3772,15 +3966,18 @@ int main(int argc, char ** argv) {
  };

  const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
- server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
- task.id = ctx_server.queue_tasks.get_new_id();
- task.slot_action.slot_id = id_slot;
+ int task_id = ctx_server.queue_tasks.get_new_id();
+ {
+ server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
+ task.id = task_id;
+ task.slot_action.slot_id = id_slot;

- ctx_server.queue_results.add_waiting_task_id(task.id);
- ctx_server.queue_tasks.post(task);
+ ctx_server.queue_results.add_waiting_task_id(task_id);
+ ctx_server.queue_tasks.post(std::move(task));
+ }

- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
- ctx_server.queue_results.remove_waiting_task_id(task.id);
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(task_id);

  if (result->is_error()) {
  res_error(res, result->to_json());
@@ -3825,7 +4022,8 @@ int main(int argc, char ** argv) {
  json data = {
  { "default_generation_settings", ctx_server.default_generation_settings_for_props },
  { "total_slots", ctx_server.params_base.n_parallel },
- { "model_path", ctx_server.params_base.model },
+ { "model_path", ctx_server.params_base.model.path },
+ { "modalities", json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
  { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
  { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
  { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -3853,14 +4051,30 @@ int main(int argc, char ** argv) {
  res_ok(res, {{ "success", true }});
  };

+ const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+ json data = {
+ {
+ "template", common_chat_templates_source(ctx_server.chat_templates.get()),
+ },
+ {
+ "model_info", {
+ { "llama.context_length", ctx_server.slots.back().n_ctx, },
+ }
+ },
+ };
+
+ res_ok(res, data);
+ };
+
  // handle completion-like requests (completion, chat, infill)
  // we can optionally provide a custom format for partial results and final results
  const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
  server_task_type type,
  json & data,
- std::function<bool()> is_connection_closed,
+ const std::vector<raw_buffer> & files,
+ const std::function<bool()> & is_connection_closed,
  httplib::Response & res,
- oaicompat_type oaicompat) {
+ oaicompat_type oaicompat) -> void {
  GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);

  if (ctx_server.params_base.embedding) {
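`handle_api_show` is a new handler (registered later in this diff as `POST /api/show`) that returns the active chat template and a `model_info` object containing `llama.context_length`. A hedged TypeScript usage sketch; the base URL is an assumption:

```ts
// Sketch: query the new /api/show endpoint (assumes a local server).
async function showModel(baseUrl = "http://127.0.0.1:8080") {
  const res = await fetch(`${baseUrl}/api/show`, { method: "POST" });
  if (!res.ok) throw new Error(`POST /api/show failed: ${res.status}`);
  const info = await res.json() as {
    template?: string;
    model_info?: { "llama.context_length"?: number };
  };
  console.log("context length:", info.model_info?.["llama.context_length"]);
  console.log("chat template:", info.template?.slice(0, 80), "...");
}

showModel();
```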
@@ -3869,26 +4083,81 @@ int main(int argc, char ** argv) {
  }

  auto completion_id = gen_chatcmplid();
- std::vector<server_task> tasks;
-
+ std::unordered_set<int> task_ids;
  try {
+ std::vector<server_task> tasks;
+
  const auto & prompt = data.at("prompt");
  // TODO: this log can become very long, put it behind a flag or think about a more compact format
  //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

- std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
- tasks.reserve(tokenized_prompts.size());
- for (size_t i = 0; i < tokenized_prompts.size(); i++) {
+ // process files
+ mtmd::bitmaps bitmaps;
+ const bool has_mtmd = ctx_server.mctx != nullptr;
+ {
+ if (!has_mtmd && !files.empty()) {
+ throw std::runtime_error("This server does not support multimodal");
+ }
+ for (auto & file : files) {
+ mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
+ if (!bmp.ptr) {
+ throw std::runtime_error("Failed to load image");
+ }
+ // calculate bitmap hash (for KV caching)
+ std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+ bmp.set_id(hash.c_str());
+ bitmaps.entries.push_back(std::move(bmp));
+ }
+ }
+
+ // process prompt
+ std::vector<server_tokens> inputs;
+ if (oaicompat && !prompt.is_string()) {
+ throw std::runtime_error("prompt must be a string");
+ }
+
+ if (oaicompat && has_mtmd) {
+ // multimodal
+ std::string prompt_str = prompt.get<std::string>();
+ mtmd_input_text inp_txt = {
+ prompt_str.c_str(),
+ /* add_special */ true,
+ /* parse_special */ true,
+ };
+ mtmd::input_chunks chunks(mtmd_input_chunks_init());
+ auto bitmaps_c_ptr = bitmaps.c_ptr();
+ int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
+ chunks.ptr.get(),
+ &inp_txt,
+ bitmaps_c_ptr.data(),
+ bitmaps_c_ptr.size());
+ if (tokenized != 0) {
+ throw std::runtime_error("Failed to tokenize prompt");
+ }
+
+ server_tokens tmp(chunks, true);
+ inputs.push_back(std::move(tmp));
+ } else {
+ // non-multimodal version
+ auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+ for (auto & p : tokenized_prompts) {
+ auto tmp = server_tokens(p, ctx_server.mctx != nullptr);
+ inputs.push_back(std::move(tmp));
+ }
+ }
+
+ tasks.reserve(inputs.size());
+ for (size_t i = 0; i < inputs.size(); i++) {
  server_task task = server_task(type);

  task.id = ctx_server.queue_tasks.get_new_id();
  task.index = i;

- task.prompt_tokens = std::move(tokenized_prompts[i]);
+ task.prompt_tokens = std::move(inputs[i]);
  task.params = server_task::params_from_json_cmpl(
- ctx_server.ctx,
- ctx_server.params_base,
- data);
+ ctx_server.ctx,
+ ctx_server.params_base,
+ data);
  task.id_selected_slot = json_value(data, "id_slot", -1);

  // OAI-compat
@@ -3896,18 +4165,18 @@ int main(int argc, char ** argv) {
  task.params.oaicompat_cmpl_id = completion_id;
  // oaicompat_model is already populated by params_from_json_cmpl

- tasks.push_back(task);
+ tasks.push_back(std::move(task));
  }
+
+ task_ids = server_task::get_list_id(tasks);
+ ctx_server.queue_results.add_waiting_tasks(tasks);
+ ctx_server.queue_tasks.post(std::move(tasks));
  } catch (const std::exception & e) {
  res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
  return;
  }

- ctx_server.queue_results.add_waiting_tasks(tasks);
- ctx_server.queue_tasks.post(tasks);
-
  bool stream = json_value(data, "stream", false);
- const auto task_ids = server_task::get_list_id(tasks);

  if (!stream) {
  ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
@@ -3966,9 +4235,11 @@ int main(int argc, char ** argv) {

  const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
  json data = json::parse(req.body);
- return handle_completions_impl(
+ std::vector<raw_buffer> files; // dummy
+ handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
  data,
+ files,
  req.is_connection_closed,
  res,
  OAICOMPAT_TYPE_NONE);
@@ -3976,9 +4247,11 @@ int main(int argc, char ** argv) {

  const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
  json data = oaicompat_completion_params_parse(json::parse(req.body));
- return handle_completions_impl(
+ std::vector<raw_buffer> files; // dummy
+ handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
  data,
+ files,
  req.is_connection_closed,
  res,
  OAICOMPAT_TYPE_COMPLETION);
@@ -4053,9 +4326,11 @@ int main(int argc, char ** argv) {
  tokenized_prompts[0]
  );

- return handle_completions_impl(
+ std::vector<raw_buffer> files; // dummy
+ handle_completions_impl(
  SERVER_TASK_TYPE_INFILL,
  data,
+ files,
  req.is_connection_closed,
  res,
  OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
@@ -4069,11 +4344,20 @@ int main(int argc, char ** argv) {
  }

  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
-
- return handle_completions_impl(
+ std::vector<raw_buffer> files;
+ json data = oaicompat_completion_params_parse(
+ body,
+ params.use_jinja,
+ params.prefill_assistant,
+ params.reasoning_format,
+ ctx_server.chat_templates.get(),
+ ctx_server.mctx,
+ files);
+
+ handle_completions_impl(
  SERVER_TASK_TYPE_COMPLETION,
  data,
+ files,
  req.is_connection_closed,
  res,
  OAICOMPAT_TYPE_CHAT);
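With `oaicompat_completion_params_parse` now receiving the multimodal context and a `files` vector, image data supplied in the OpenAI-style chat body can reach `handle_completions_impl`. A hedged TypeScript sketch of what such a request could look like from a client, assuming the OpenAI-compatible `image_url` content part with a base64 data URI is accepted (the exact accepted shapes are defined by the parsing code referenced in this diff, not shown here), and assuming a local server started with an mmproj:

```ts
// Sketch: send one image plus a question to /v1/chat/completions.
import { readFile } from "node:fs/promises";

async function askAboutImage(imagePath: string, question: string) {
  const b64 = (await readFile(imagePath)).toString("base64");
  const res = await fetch("http://127.0.0.1:8080/v1/chat/completions", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      messages: [{
        role: "user",
        content: [
          { type: "text", text: question },
          { type: "image_url", image_url: { url: `data:image/jpeg;base64,${b64}` } },
        ],
      }],
    }),
  });
  const out = await res.json();
  console.log(out.choices?.[0]?.message?.content);
}

askAboutImage("photo.jpg", "What is in this picture?");
```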
@@ -4082,20 +4366,34 @@ int main(int argc, char ** argv) {
  // same with handle_chat_completions, but without inference part
  const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
  auto body = json::parse(req.body);
- json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get());
+ std::vector<raw_buffer> files; // dummy, unused
+ json data = oaicompat_completion_params_parse(
+ body,
+ params.use_jinja,
+ params.prefill_assistant,
+ params.reasoning_format,
+ ctx_server.chat_templates.get(),
+ ctx_server.mctx,
+ files);
  res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
  };

- const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+ const auto handle_models = [&params, &ctx_server, &state, &res_ok](const httplib::Request &, httplib::Response & res) {
+ server_state current_state = state.load();
+ json model_meta = nullptr;
+ if (current_state == SERVER_STATE_READY) {
+ model_meta = ctx_server.model_meta();
+ }
+
  json models = {
  {"object", "list"},
  {"data", {
  {
- {"id", params.model_alias.empty() ? params.model : params.model_alias},
+ {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
  {"object", "model"},
  {"created", std::time(0)},
  {"owned_by", "llamacpp"},
- {"meta", ctx_server.model_meta()}
+ {"meta", model_meta},
  },
  }}
  };
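Together with the earlier routing change that lets `/models` and `/v1/models` through while the model is still loading, the updated `handle_models` reports `meta: null` until the server reaches `SERVER_STATE_READY`. A small TypeScript sketch of polling this endpoint for readiness; the base URL and polling interval are assumptions:

```ts
// Sketch: poll /v1/models until the model metadata is populated (assumes a local server).
async function waitForModel(baseUrl = "http://127.0.0.1:8080", intervalMs = 500) {
  for (;;) {
    const res = await fetch(`${baseUrl}/v1/models`);
    if (res.ok) {
      const body = await res.json() as { data?: { id: string; meta: unknown }[] };
      const model = body.data?.[0];
      if (model && model.meta !== null) {
        return model.id; // model fully loaded
      }
    }
    await new Promise(r => setTimeout(r, intervalMs)); // still loading
  }
}

waitForModel().then(id => console.log("ready:", id));
```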
@@ -4187,7 +4485,7 @@ int main(int argc, char ** argv) {
  }
  }

- std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
+ auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
  for (const auto & tokens : tokenized_prompts) {
  // this check is necessary for models that do not add BOS token to the input
  if (tokens.empty()) {
@@ -4199,6 +4497,7 @@ int main(int argc, char ** argv) {
  // create and queue the task
  json responses = json::array();
  bool error = false;
+ std::unordered_set<int> task_ids;
  {
  std::vector<server_task> tasks;
  for (size_t i = 0; i < tokenized_prompts.size(); i++) {
@@ -4206,32 +4505,31 @@ int main(int argc, char ** argv) {

  task.id = ctx_server.queue_tasks.get_new_id();
  task.index = i;
- task.prompt_tokens = std::move(tokenized_prompts[i]);
+ task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);

  // OAI-compat
  task.params.oaicompat = oaicompat;

- tasks.push_back(task);
+ tasks.push_back(std::move(task));
  }

+ task_ids = server_task::get_list_id(tasks);
  ctx_server.queue_results.add_waiting_tasks(tasks);
- ctx_server.queue_tasks.post(tasks);
-
- // get the result
- std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
+ ctx_server.queue_tasks.post(std::move(tasks));
+ }

- ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
- for (auto & res : results) {
- GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
- responses.push_back(res->to_json());
- }
- }, [&](const json & error_data) {
- res_error(res, error_data);
- error = true;
- }, req.is_connection_closed);
+ // get the result
+ ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+ for (auto & res : results) {
+ GGML_ASSERT(dynamic_cast<server_task_result_embd*>(res.get()) != nullptr);
+ responses.push_back(res->to_json());
+ }
+ }, [&](const json & error_data) {
+ res_error(res, error_data);
+ error = true;
+ }, req.is_connection_closed);

- ctx_server.queue_results.remove_waiting_task_ids(task_ids);
- }
+ ctx_server.queue_results.remove_waiting_task_ids(task_ids);

  if (error) {
  return;
@@ -4298,35 +4596,35 @@ int main(int argc, char ** argv) {
  // create and queue the task
  json responses = json::array();
  bool error = false;
+ std::unordered_set<int> task_ids;
  {
  std::vector<server_task> tasks;
- std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
+ auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
  tasks.reserve(tokenized_docs.size());
  for (size_t i = 0; i < tokenized_docs.size(); i++) {
+ auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
  server_task task = server_task(SERVER_TASK_TYPE_RERANK);
  task.id = ctx_server.queue_tasks.get_new_id();
  task.index = i;
- task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
- tasks.push_back(task);
+ task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr);
+ tasks.push_back(std::move(task));
  }

+ task_ids = server_task::get_list_id(tasks);
  ctx_server.queue_results.add_waiting_tasks(tasks);
- ctx_server.queue_tasks.post(tasks);
-
- // get the result
- std::unordered_set<int> task_ids = server_task::get_list_id(tasks);
-
- ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
- for (auto & res : results) {
- GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
- responses.push_back(res->to_json());
- }
- }, [&](const json & error_data) {
- res_error(res, error_data);
- error = true;
- }, req.is_connection_closed);
+ ctx_server.queue_tasks.post(std::move(tasks));
  }

+ ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
+ for (auto & res : results) {
+ GGML_ASSERT(dynamic_cast<server_task_result_rerank*>(res.get()) != nullptr);
+ responses.push_back(res->to_json());
+ }
+ }, [&](const json & error_data) {
+ res_error(res, error_data);
+ error = true;
+ }, req.is_connection_closed);
+
  if (error) {
  return;
  }
@@ -4362,14 +4660,19 @@ int main(int argc, char ** argv) {
  res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST));
  return;
  }
- server_task task(SERVER_TASK_TYPE_SET_LORA);
- task.id = ctx_server.queue_tasks.get_new_id();
- task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
- ctx_server.queue_results.add_waiting_task_id(task.id);
- ctx_server.queue_tasks.post(task);

- server_task_result_ptr result = ctx_server.queue_results.recv(task.id);
- ctx_server.queue_results.remove_waiting_task_id(task.id);
+ int task_id = ctx_server.queue_tasks.get_new_id();
+ {
+ server_task task(SERVER_TASK_TYPE_SET_LORA);
+ task.id = task_id;
+ task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
+ ctx_server.queue_results.add_waiting_task_id(task_id);
+ ctx_server.queue_tasks.post(std::move(task));
+ }
+
+ // get the result
+ server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(task_id);

  if (result->is_error()) {
  res_error(res, result->to_json());
@@ -4417,6 +4720,7 @@ int main(int argc, char ** argv) {
  svr->Get ("/metrics", handle_metrics);
  svr->Get ("/props", handle_props);
  svr->Post("/props", handle_props_change);
+ svr->Post("/api/show", handle_api_show);
  svr->Get ("/models", handle_models); // public endpoint (no API key check)
  svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
  svr->Post("/completion", handle_completions); // legacy
@@ -4453,21 +4757,31 @@ int main(int argc, char ** argv) {
  svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };

  // clean up function, to be called before exit
- auto clean_up = [&svr]() {
+ auto clean_up = [&svr, &ctx_server]() {
  SRV_INF("%s: cleaning up before exit...\n", __func__);
  svr->stop();
+ ctx_server.queue_results.terminate();
  llama_backend_free();
  };

- // bind HTTP listen port
  bool was_bound = false;
- if (params.port == 0) {
- int bound_port = svr->bind_to_any_port(params.hostname);
- if ((was_bound = (bound_port >= 0))) {
- params.port = bound_port;
- }
+ if (string_ends_with(std::string(params.hostname), ".sock")) {
+ LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+ svr->set_address_family(AF_UNIX);
+ // bind_to_port requires a second arg, any value other than 0 should
+ // simply get ignored
+ was_bound = svr->bind_to_port(params.hostname, 8080);
  } else {
- was_bound = svr->bind_to_port(params.hostname, params.port);
+ LOG_INF("%s: binding port with default address family\n", __func__);
+ // bind HTTP listen port
+ if (params.port == 0) {
+ int bound_port = svr->bind_to_any_port(params.hostname);
+ if ((was_bound = (bound_port >= 0))) {
+ params.port = bound_port;
+ }
+ } else {
+ was_bound = svr->bind_to_port(params.hostname, params.port);
+ }
  }

  if (!was_bound) {
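The binding logic above switches httplib to `AF_UNIX` when the configured hostname ends in `.sock`, so the server can listen on a UNIX domain socket instead of a TCP port. A hedged Node/TypeScript sketch of talking to such a socket with the built-in `http` module; the socket path is an assumption and must match whatever the server was started with (e.g. something like `--host /tmp/llama.sock`):

```ts
// Sketch: hit /health over a UNIX domain socket.
import http from "node:http";

function healthCheck(socketPath = "/tmp/llama.sock"): Promise<number> {
  return new Promise((resolve, reject) => {
    const req = http.request({ socketPath, path: "/health", method: "GET" }, res => {
      res.resume(); // drain the body
      resolve(res.statusCode ?? 0);
    });
    req.on("error", reject);
    req.end();
  });
}

healthCheck().then(code => console.log("HTTP status:", code));
```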
@@ -4487,7 +4801,7 @@ int main(int argc, char ** argv) {

  if (!ctx_server.load_model(params)) {
  clean_up();
- // t.join(); // FIXME: see below
+ t.join();
  LOG_ERR("%s: exiting due to model loading error\n", __func__);
  return 1;
  }
@@ -4502,8 +4816,8 @@ int main(int argc, char ** argv) {
  common_chat_templates_source(ctx_server.chat_templates.get()),
  common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());

- ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
- ctx_server.process_single_task(task);
+ ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
+ ctx_server.process_single_task(std::move(task));
  });

  ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
@@ -4535,7 +4849,7 @@ int main(int argc, char ** argv) {
  ctx_server.queue_tasks.start_loop();

  clean_up();
- // t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
+ t.join();

  return 0;
  }