llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
  3. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
  4. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  7. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
  10. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  12. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
  13. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  14. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  15. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  16. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  17. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  18. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  19. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
  20. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  22. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
  23. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  24. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
  25. vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
  26. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  27. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  28. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  29. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  30. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
  31. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
  32. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  33. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
  34. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
  35. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  36. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  37. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  38. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  39. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  40. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
  41. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
  42. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  113. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
  114. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  115. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  116. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  117. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  118. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  119. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  120. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  121. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  122. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  123. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
  124. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  125. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  126. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
  127. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  128. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
  129. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  130. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
  131. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
  132. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
  133. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
  134. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  135. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  136. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  137. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  138. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  139. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  140. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
  141. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
  142. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  143. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  144. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
  145. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  146. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  147. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  148. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  149. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  150. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
  151. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
  152. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
  153. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
  154. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  155. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
  156. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
  157. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
  158. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
  159. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
  160. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  161. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
  162. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  163. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  164. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  165. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  166. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  167. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  168. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  169. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  170. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  171. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  172. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  173. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  174. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  175. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
  176. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
  177. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  178. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  179. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  180. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  181. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  182. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
  183. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  184. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
  185. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  186. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  187. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  188. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  189. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  190. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  191. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
  192. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  193. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
  194. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
  195. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
  196. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  197. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
  198. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
  199. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
  200. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
  201. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  202. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  203. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  205. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
  206. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  207. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  208. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  209. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  210. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
  211. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  212. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
  213. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  214. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
  215. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  216. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  217. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
  218. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
  219. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  220. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  221. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
  222. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  223. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
  224. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  225. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  226. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  227. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
  228. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
  229. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  230. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  231. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  232. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
  233. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
  234. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
  235. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
  236. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
  237. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
  238. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
  239. llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
  240. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  241. {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  242. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp

@@ -33,6 +33,7 @@ int server_queue::post(server_task && task, bool front) {
     } else {
         queue_tasks.push_back(std::move(task));
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
     return task_id;
 }
@@ -54,6 +55,7 @@ int server_queue::post(std::vector<server_task> && tasks, bool front) {
             queue_tasks.push_back(std::move(task));
         }
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
     return 0;
 }
@@ -62,6 +64,7 @@ void server_queue::defer(server_task && task) {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     QUE_DBG("defer task, id = %d\n", task.id);
     queue_tasks_deferred.push_back(std::move(task));
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
 }
 
@@ -71,31 +74,52 @@ int server_queue::get_new_id() {
     return new_id;
 }
 
-void server_queue::on_new_task(std::function<void(server_task &&)> callback) {
-    callback_new_task = std::move(callback);
-}
-
-void server_queue::on_update_slots(std::function<void(void)> callback) {
-    callback_update_slots = std::move(callback);
-}
-
 void server_queue::pop_deferred_task() {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     if (!queue_tasks_deferred.empty()) {
         queue_tasks.emplace_front(std::move(queue_tasks_deferred.front()));
         queue_tasks_deferred.pop_front();
     }
+    time_last_task = ggml_time_ms();
     condition_tasks.notify_one();
 }
 
+void server_queue::wait_until_no_sleep() {
+    std::unique_lock<std::mutex> lock(mutex_tasks);
+    if (!sleeping) {
+        return;
+    } else {
+        if (!req_stop_sleeping) {
+            QUE_DBG("%s", "requesting to stop sleeping\n");
+            req_stop_sleeping = true;
+            condition_tasks.notify_one(); // only main thread is waiting on this
+        }
+        QUE_DBG("%s", "waiting until no sleep\n");
+        condition_tasks.wait(lock, [&]{
+            return !sleeping;
+        });
+    }
+}
+
 void server_queue::terminate() {
     std::unique_lock<std::mutex> lock(mutex_tasks);
     running = false;
     condition_tasks.notify_all();
 }
 
-void server_queue::start_loop() {
+void server_queue::start_loop(int64_t idle_sleep_ms) {
     running = true;
+    time_last_task = ggml_time_ms();
+
+    constexpr auto max_wait_time = std::chrono::seconds(1);
+    auto should_sleep = [&]() -> bool {
+        // caller must hold mutex_tasks
+        if (idle_sleep_ms < 0) {
+            return false;
+        }
+        int64_t now = ggml_time_ms();
+        return (now - time_last_task) >= idle_sleep_ms;
+    };
 
     while (true) {
         QUE_DBG("%s", "processing new tasks\n");
@@ -117,23 +141,53 @@ void server_queue::start_loop() {
             QUE_DBG("processing task, id = %d\n", task.id);
             callback_new_task(std::move(task));
         }
-
         // all tasks in the current loop is processed, slots data is now ready
         QUE_DBG("%s", "update slots\n");
 
+        // this will run the main inference process for all slots
         callback_update_slots();
+        {
+            // update_slots() may take a while to finish, we need to make sure it's not counted as idle
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            time_last_task = ggml_time_ms();
+        }
 
         QUE_DBG("%s", "waiting for new tasks\n");
-        {
+        while (true) {
             std::unique_lock<std::mutex> lock(mutex_tasks);
-            if (!running) {
-                QUE_DBG("%s", "terminate\n");
-                return;
+            if (!running || !queue_tasks.empty()) {
+                break; // go back to process new tasks or terminate
             }
-            if (queue_tasks.empty()) {
+
+            // no tasks, check for sleeping state
+            if (should_sleep()) {
+                QUE_INF("%s", "entering sleeping state\n");
+                sleeping = true;
+                callback_sleeping_state(true);
+                req_stop_sleeping = false;
+                // wait until we are requested to exit sleeping state
                 condition_tasks.wait(lock, [&]{
+                    return (!running || req_stop_sleeping);
+                });
+                if (!running) { // may changed during sleep
+                    break; // terminate
+                }
+                QUE_INF("%s", "exiting sleeping state\n");
+                req_stop_sleeping = false;
+                callback_sleeping_state(false);
+                sleeping = false;
+                time_last_task = ggml_time_ms();
+                condition_tasks.notify_all(); // notify wait_until_no_sleep()
+                break; // process new tasks
+            } else {
+                // wait for new tasks or timeout for checking sleeping condition
                bool res = condition_tasks.wait_for(lock, max_wait_time, [&]{
                     return (!queue_tasks.empty() || !running);
                 });
+                if (res) {
+                    break; // new task arrived or terminate
+                }
+                // otherwise, loop again to check sleeping condition
             }
         }
     }
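Editorial note: the rewritten start_loop() combines two waits: a bounded wait_for() so the loop can periodically re-evaluate should_sleep() without busy-polling, and an unbounded wait() once the queue has been idle for idle_sleep_ms. A minimal standalone sketch of that pattern follows (names, thresholds, and the std::chrono clock are illustrative, not from the package):

#include <chrono>
#include <condition_variable>
#include <deque>
#include <iostream>
#include <mutex>
#include <thread>

int main() {
    std::mutex m;
    std::condition_variable cv;
    std::deque<int> tasks;
    bool running = true;
    auto last_task = std::chrono::steady_clock::now();
    const auto idle_threshold = std::chrono::milliseconds(200);

    std::thread worker([&] {
        while (true) {
            std::unique_lock<std::mutex> lock(m);
            if (!running) break;
            if (!tasks.empty()) {
                // process one unit of work and refresh the idle clock
                tasks.pop_front();
                last_task = std::chrono::steady_clock::now();
                continue;
            }
            if (std::chrono::steady_clock::now() - last_task >= idle_threshold) {
                // idle long enough: unbounded wait (callback_sleeping_state(true) analogue)
                std::cout << "entering sleep state\n";
                cv.wait(lock, [&] { return !running || !tasks.empty(); });
                std::cout << "leaving sleep state\n";
                last_task = std::chrono::steady_clock::now();
            } else {
                // bounded wait: wake on new work, or time out and re-check idleness
                cv.wait_for(lock, std::chrono::milliseconds(50),
                            [&] { return !running || !tasks.empty(); });
            }
        }
    });

    { std::lock_guard<std::mutex> g(m); tasks.push_back(1); }
    cv.notify_one();
    std::this_thread::sleep_for(std::chrono::milliseconds(500)); // let the worker go idle
    { std::lock_guard<std::mutex> g(m); running = false; }
    cv.notify_all();
    worker.join();
}

The bounded wait is what keeps idle detection responsive: each wakeup either delivers work or times out and re-checks the idle clock, exactly as max_wait_time does above.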
@@ -271,23 +325,25 @@ void server_response::terminate() {
 // server_response_reader
 //
 
-void server_response_reader::post_task(server_task && task) {
+void server_response_reader::post_task(server_task && task, bool front) {
     GGML_ASSERT(id_tasks.empty() && "post_task() can only be called once per reader");
+    task.index = 0;
     id_tasks.insert(task.id);
     states.push_back(task.create_state());
     queue_results.add_waiting_task_id(task.id);
-    queue_tasks.post(std::move(task));
+    queue_tasks.post(std::move(task), front);
 }
 
-void server_response_reader::post_tasks(std::vector<server_task> && tasks) {
+void server_response_reader::post_tasks(std::vector<server_task> && tasks, bool front) {
     GGML_ASSERT(id_tasks.empty() && "post_tasks() can only be called once per reader");
     id_tasks = server_task::get_list_id(tasks);
     states.reserve(tasks.size());
     for (size_t i = 0; i < tasks.size(); i++) {
+        tasks[i].index = i;
         states.push_back(tasks[i].create_state());
     }
     queue_results.add_waiting_tasks(tasks);
-    queue_tasks.post(std::move(tasks));
+    queue_tasks.post(std::move(tasks), front);
 }
 
 bool server_response_reader::has_next() const {
@@ -313,7 +369,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()> & should_stop) {
     }
     if (!states.empty()) {
         // update the generation state if needed
-        size_t idx = result->get_index();
+        const size_t idx = result->index;
         GGML_ASSERT(idx < states.size());
         result->update(states[idx]);
     }
@@ -329,6 +385,7 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()> & should_stop) {
 
 server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) {
     batch_response batch_res;
+    batch_res.results.clear();
     batch_res.results.resize(id_tasks.size());
     while (has_next()) {
         auto res = next(should_stop);
@@ -340,7 +397,7 @@ server_response_reader::batch_response server_response_reader::wait_for_all(const std::function<bool()> & should_stop) {
             batch_res.error = std::move(res);
             return batch_res;
         }
-        const size_t idx = res->get_index();
+        const size_t idx = res->index;
         GGML_ASSERT(idx < batch_res.results.size() && "index out of range");
         GGML_ASSERT(batch_res.results[idx] == nullptr && "duplicate result received");
         batch_res.results[idx] = std::move(res);
vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h

@@ -5,6 +5,7 @@
 #include <condition_variable>
 #include <deque>
 #include <mutex>
+#include <vector>
 #include <unordered_set>
 
 // struct for managing server tasks
@@ -12,7 +13,10 @@
 struct server_queue {
 private:
     int id = 0;
-    bool running;
+    bool running = false;
+    bool sleeping = false;
+    bool req_stop_sleeping = false;
+    int64_t time_last_task = 0;
 
     // queues
     std::deque<server_task> queue_tasks;
@@ -24,6 +28,7 @@ private:
     // callback functions
     std::function<void(server_task &&)> callback_new_task;
     std::function<void(void)> callback_update_slots;
+    std::function<void(bool)> callback_sleeping_state;
 
 public:
     // Add a new task to the end of the queue
@@ -38,15 +43,18 @@ public:
     // Get the next id for creating a new task
     int get_new_id();
 
-    // Register function to process a new task
-    void on_new_task(std::function<void(server_task &&)> callback);
-
-    // Register the function to be called when all slots data is ready to be processed
-    void on_update_slots(std::function<void(void)> callback);
-
     // Call when the state of one slot is changed, it will move one task from deferred to main queue
     void pop_deferred_task();
 
+    // if sleeping, request exiting sleep state and wait until it is done
+    // returns immediately if not sleeping
+    void wait_until_no_sleep();
+
+    bool is_sleeping() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return sleeping;
+    }
+
     // end the start_loop routine
     void terminate();
 
@@ -56,8 +64,15 @@ public:
      * - Process the task (i.e. maybe copy data into slot)
     * - Check if multitask is finished
      * - Update all slots
+     *
+     * Sleeping procedure (disabled if idle_sleep_ms < 0):
+     * - If there is no task after idle_sleep_ms, enter sleeping state
+     * - Call callback_sleeping_state(true)
+     * - Wait until req_stop_sleeping is set to true
+     * - Call callback_sleeping_state(false)
+     * - Exit sleeping state
      */
-    void start_loop();
+    void start_loop(int64_t idle_sleep_ms = -1);
 
     // for metrics
     size_t queue_tasks_deferred_size() {
@@ -65,6 +80,27 @@ public:
         return queue_tasks_deferred.size();
     }
 
+    //
+    // Functions below are not thread-safe, must only be used before start_loop() is called
+    //
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(server_task &&)> callback) {
+        callback_new_task = std::move(callback);
+    }
+
+    // Register the function to be called when all slots data is ready to be processed
+    void on_update_slots(std::function<void(void)> callback) {
+        callback_update_slots = std::move(callback);
+    }
+
+    // Register callback for sleeping state change
+    // note: when entering sleeping state, the callback is called AFTER sleeping is set to true
+    //       when leaving sleeping state, the callback is called BEFORE sleeping is set to false
+    void on_sleeping_state(std::function<void(bool)> callback) {
+        callback_sleeping_state = std::move(callback);
+    }
+
 private:
     void cleanup_pending_task(int id_target);
 };
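Editorial note: a hypothetical wiring fragment (the lambda bodies are invented placeholders, not package code) showing how the relocated registration functions and the new start_loop() parameter fit together; per the comment above, all on_*() calls must happen before start_loop():

server_queue queue;
queue.on_new_task([&](server_task && task) {
    // route the task to a slot (placeholder)
});
queue.on_update_slots([&]() {
    // run one inference step across all slots (placeholder)
});
queue.on_sleeping_state([&](bool sleeping) {
    // e.g. release heavy resources on true, re-acquire on false (placeholder)
});
queue.start_loop(/*idle_sleep_ms =*/ 60 * 1000); // sleep after 60 s without tasks; -1 (the default) disables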
@@ -138,8 +174,10 @@ struct server_response_reader {
     int get_new_id() {
         return queue_tasks.get_new_id();
     }
-    void post_task(server_task && task);
-    void post_tasks(std::vector<server_task> && tasks);
+
+    // if front = true, the task will be posted to the front of the queue (high priority)
+    void post_task(server_task && task, bool front = false);
+    void post_tasks(std::vector<server_task> && tasks, bool front = false);
     bool has_next() const;
 
     // return nullptr if should_stop() is true before receiving a result
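Editorial note: the new front flag gives callers a priority lane, letting a reader's task jump ahead of work already queued instead of waiting behind it in FIFO order. Which call sites actually pass front = true is not visible in this diff.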
vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp

@@ -32,8 +32,8 @@ json task_params::to_json(bool only_metrics) const {
     }
 
     json lora = json::array();
-    for (size_t i = 0; i < this->lora.size(); ++i) {
-        lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
+    for (auto & it : this->lora) {
+        lora.push_back({{"id", it.first}, {"scale", it.second}});
     }
 
     if (only_metrics) {
@@ -78,6 +78,7 @@
             {"speculative.p_min", speculative.p_min},
             {"timings_per_token", timings_per_token},
             {"post_sampling_probs", post_sampling_probs},
+            {"backend_sampling", sampling.backend_sampling},
             {"lora", lora},
         };
     }
@@ -136,6 +137,7 @@
         {"speculative.p_min", speculative.p_min},
         {"timings_per_token", timings_per_token},
         {"post_sampling_probs", post_sampling_probs},
+        {"backend_sampling", sampling.backend_sampling},
         {"lora", lora},
     };
 }
@@ -145,12 +147,10 @@
 //
 
 task_params server_task::params_from_json_cmpl(
-        const llama_context * ctx,
+        const llama_vocab * vocab,
         const common_params & params_base,
+        const int n_ctx_slot,
         const json & data) {
-    const llama_model * model = llama_get_model(ctx);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
     task_params params;
 
     // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
@@ -206,6 +206,7 @@
     params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
     params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
     params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
+    params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
     params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
 
     params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
@@ -223,12 +224,12 @@
 
     if (data.contains("lora")) {
         if (data.at("lora").is_array()) {
-            params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
+            params.lora = parse_lora_request(data.at("lora"));
         } else {
            throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
         }
     } else {
-        params.lora = params_base.lora_adapters;
+        params.lora = {};
     }
 
     // TODO: add more sanity checks for the input parameters
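Editorial note: a sketch of a request body the new parser accepts, based on the error message above ("'lora' must be an array of objects with 'id' and 'scale' fields"); adapters are now keyed by integer ID into a std::map<int, float>, and at this layer omitting "lora" yields an empty per-request map rather than copying the server-wide adapter list. Built with nlohmann::json, which the server code itself uses; everything beyond the "lora" field is illustrative:

#include <nlohmann/json.hpp>
#include <iostream>
using json = nlohmann::ordered_json;

int main() {
    // hypothetical completion request enabling two adapters at different scales
    json body = {
        {"prompt", "Hello"},
        {"lora", json::array({
            {{"id", 0}, {"scale", 0.5}},
            {{"id", 1}, {"scale", 1.0}},
        })},
    };
    std::cout << body.dump(2) << std::endl;
}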
@@ -243,11 +244,11 @@
 
     if (params.sampling.penalty_last_n == -1) {
         // note: should be the slot's context and not the full context, but it's ok
-        params.sampling.penalty_last_n = llama_n_ctx(ctx);
+        params.sampling.penalty_last_n = n_ctx_slot;
     }
 
     if (params.sampling.dry_penalty_last_n == -1) {
-        params.sampling.dry_penalty_last_n = llama_n_ctx(ctx);
+        params.sampling.dry_penalty_last_n = n_ctx_slot;
     }
 
     if (params.sampling.dry_base < 1.0f) {
@@ -1153,7 +1154,7 @@ json server_task_result_rerank::to_json() {
 json server_task_result_cmpl_partial::to_json_anthropic() {
     json events = json::array();
     bool first = (n_decoded == 1);
-    static bool text_block_started = false;
+    bool text_block_started = false;
 
     if (first) {
         text_block_started = false;
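Editorial note: the old text_block_started was a function-local static, i.e. a single flag shared by every streaming response (and thread) in the process, so one request's Anthropic block state could leak into a concurrent one. Dropping static makes it an ordinary per-call local, confined to a single invocation.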
@@ -1324,6 +1325,30 @@ json server_task_result_slot_erase::to_json() {
     };
 }
 
+//
+// server_task_result_get_lora
+//
+
+json server_task_result_get_lora::to_json() {
+    json result = json::array();
+    for (size_t i = 0; i < loras.size(); ++i) {
+        auto & lora = loras[i];
+        json entry = {
+            {"id", i},
+            {"path", lora.info.path},
+            {"scale", lora.info.scale},
+            {"task_name", lora.info.task_name},
+            {"prompt_prefix", lora.info.prompt_prefix},
+        };
+        if (!lora.alora_invocation_tokens.empty()) {
+            entry["alora_invocation_string"] = lora.alora_invocation_string;
+            entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
+        }
+        result.push_back(std::move(entry));
+    }
+    return result;
+}
+
 //
 // server_task_result_apply_lora
 //
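Editorial note: per the loop above, the new GET_LORA result serializes to an array indexed by adapter position; an illustration of the shape (all values invented), where the two alora_* fields appear only when alora_invocation_tokens is non-empty:

[
  {
    "id": 0,
    "path": "adapter.gguf",
    "scale": 0.5,
    "task_name": "example-task",
    "prompt_prefix": "",
    "alora_invocation_string": "<invocation>",
    "alora_invocation_tokens": [1, 2, 3]
  }
]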
vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h

@@ -6,6 +6,7 @@
 #include <string>
 #include <unordered_set>
 #include <list>
+#include <map>
 
 // TODO: prevent including the whole server-common.h as we only use server_tokens
 #include "server-common.h"
@@ -23,6 +24,7 @@ enum server_task_type {
     SERVER_TASK_TYPE_SLOT_SAVE,
     SERVER_TASK_TYPE_SLOT_RESTORE,
     SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_GET_LORA,
     SERVER_TASK_TYPE_SET_LORA,
 };
 
@@ -60,7 +62,7 @@ struct task_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
-    std::vector<common_adapter_lora_info> lora;
+    std::map<int, float> lora; // mapping adapter ID -> scale
 
     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;
@@ -105,8 +107,10 @@ struct task_result_state {
 };
 
 struct server_task {
-    int id = -1; // to be filled by server_queue
-    int index = -1; // used when there are multiple prompts (batch request)
+    int id = -1; // to be filled by server_queue
+
+    // TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
+    size_t index = 0; // used when there are multiple prompts (batch request)
 
     // used by SERVER_TASK_TYPE_CANCEL
     int id_target = -1;
@@ -138,7 +142,7 @@ struct server_task {
     bool metrics_reset_bucket = false;
 
     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_adapter_lora_info> set_lora;
+    std::map<int, float> set_lora; // mapping adapter ID -> scale
 
     server_task() = default;
 
@@ -149,9 +153,10 @@ struct server_task {
     }
 
     static task_params params_from_json_cmpl(
-            const llama_context * ctx,
-            const common_params & params_base,
-            const json & data);
+            const llama_vocab * vocab,
+            const common_params & params_base,
+            const int n_ctx_slot,
+            const json & data);
 
     // utility function
     static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
@@ -162,10 +167,9 @@ struct server_task {
         return ids;
     }
 
-    server_task create_child(int id_parent, int id_child, int idx) const {
+    server_task create_child(int id_parent, int id_child) const {
         server_task copy;
         copy.id = id_child;
-        copy.index = idx;
         copy.id_parent = id_parent;
         copy.params = params;
         copy.type = type;
@@ -212,6 +216,10 @@ struct result_prompt_progress {
 struct server_task_result {
     int id = -1;
     int id_slot = -1;
+
+    // TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
+    size_t index = 0; // to be used for batched tasks
+
     virtual bool is_error() {
         // only used by server_task_result_error
         return false;
@@ -220,9 +228,6 @@ struct server_task_result {
         // only used by server_task_result_cmpl_*
         return true;
     }
-    virtual int get_index() {
-        return -1;
-    }
     virtual void update(task_result_state &) {
         // only used by server_task_result_cmpl_*
     }
@@ -255,8 +260,6 @@ struct completion_token_output {
 };
 
 struct server_task_result_cmpl_final : server_task_result {
-    int index = 0;
-
     std::string content;
     llama_tokens tokens;
 
@@ -289,10 +292,6 @@ struct server_task_result_cmpl_final : server_task_result {
     std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
     bool is_updated = false;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual bool is_stop() override {
         return true; // in stream mode, final responses are considered stop
     }
@@ -318,8 +317,6 @@ struct server_task_result_cmpl_final : server_task_result {
 };
 
 struct server_task_result_cmpl_partial : server_task_result {
-    int index = 0;
-
     std::string content;
     llama_tokens tokens;
 
@@ -340,10 +337,6 @@ struct server_task_result_cmpl_partial : server_task_result {
     std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
     bool is_updated = false;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual bool is_stop() override {
         return false; // in stream mode, partial responses are not considered stop
     }
@@ -365,7 +358,6 @@ struct server_task_result_cmpl_partial : server_task_result {
 };
 
 struct server_task_result_embd : server_task_result {
-    int index = 0;
     std::vector<std::vector<float>> embedding;
 
     int32_t n_tokens;
@@ -373,10 +365,6 @@ struct server_task_result_embd : server_task_result {
     // response formatting
     task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual json to_json() override;
 
     json to_json_non_oaicompat();
@@ -385,20 +373,14 @@ struct server_task_result_embd : server_task_result {
 };
 
 struct server_task_result_rerank : server_task_result {
-    int index = 0;
     float score = -1e6;
 
     int32_t n_tokens;
 
-    virtual int get_index() override {
-        return index;
-    }
-
     virtual json to_json() override;
 };
 
 struct server_task_result_error : server_task_result {
-    int index = 0;
     error_type err_type = ERROR_TYPE_SERVER;
     std::string err_msg;
 
@@ -460,6 +442,17 @@ struct server_task_result_slot_erase : server_task_result {
     virtual json to_json() override;
 };
 
+struct server_task_result_get_lora : server_task_result {
+    struct lora {
+        common_adapter_lora_info info;
+        std::string alora_invocation_string;
+        llama_tokens alora_invocation_tokens;
+    };
+    std::vector<lora> loras;
+
+    virtual json to_json() override;
+};
+
 struct server_task_result_apply_lora : server_task_result {
     virtual json to_json() override;
 };
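Editorial note: the net effect of these header changes is that the duplicated per-subclass index members and the repeated get_index() override boilerplate collapse into a single size_t index on the server_task_result base, which the reader code in server-queue.cpp now accesses directly as result->index.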
vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp

@@ -66,7 +66,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
     };
 }
 
-int main(int argc, char ** argv, char ** envp) {
+int main(int argc, char ** argv) {
     // own arguments required by this example
     common_params params;
 
@@ -119,14 +119,14 @@ int main(int argc, char ** argv, char ** envp) {
     //
 
     // register API routes
-    server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); });
+    server_routes routes(params, ctx_server);
 
     bool is_router_server = params.model.path.empty();
     std::optional<server_models_routes> models_routes{};
     if (is_router_server) {
         // setup server instances manager
         try {
-            models_routes.emplace(params, argc, argv, envp);
+            models_routes.emplace(params, argc, argv);
         } catch (const std::exception & e) {
             LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
             return 1;
@@ -252,7 +252,7 @@ int main(int argc, char ** argv, char ** envp) {
         return 1;
     }
 
-    ctx_server.init();
+    routes.update_meta(ctx_server);
     ctx_http.is_ready.store(true);
 
     LOG_INF("%s: model loaded\n", __func__);
@@ -309,7 +309,11 @@ int main(int argc, char ** argv, char ** envp) {
         if (monitor_thread.joinable()) {
             monitor_thread.join();
         }
-        llama_memory_breakdown_print(ctx_server.get_llama_context());
+
+        auto * ll_ctx = ctx_server.get_llama_context();
+        if (ll_ctx != nullptr) {
+            llama_memory_breakdown_print(ll_ctx);
+        }
     }
 
     return 0;
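Editorial note: the null guard appears tied to the router mode visible earlier in this file (is_router_server when params.model.path is empty): with no local model loaded, get_llama_context() can plausibly return nullptr, and the old unconditional llama_memory_breakdown_print() call would have dereferenced it at shutdown.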