llama-cpp-pydist 0.19.0-py3-none-any.whl → 0.21.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
  3. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
  4. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  7. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
  10. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  12. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
  13. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  14. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  15. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  16. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  17. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  18. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  19. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
  20. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  22. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
  23. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  24. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
  25. vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
  26. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  27. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  28. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  29. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  30. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
  31. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
  32. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  33. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
  34. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
  35. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  36. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  37. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  38. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  39. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  40. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
  41. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
  42. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  113. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
  114. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  115. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  116. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  117. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  118. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  119. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  120. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  121. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  122. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  123. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
  124. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  125. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  126. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
  127. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  128. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
  129. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  130. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
  131. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
  132. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
  133. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
  134. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  135. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  136. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  137. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  138. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  139. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  140. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
  141. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
  142. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  143. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  144. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
  145. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  146. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  147. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  148. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  149. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  150. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
  151. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
  152. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
  153. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
  154. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  155. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
  156. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
  157. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
  158. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
  159. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
  160. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  161. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
  162. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  163. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  164. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  165. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  166. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  167. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  168. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  169. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  170. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  171. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  172. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  173. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  174. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  175. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
  176. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
  177. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  178. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  179. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  180. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  181. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  182. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
  183. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  184. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
  185. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  186. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  187. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  188. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  189. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  190. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  191. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
  192. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  193. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
  194. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
  195. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
  196. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  197. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
  198. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
  199. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
  200. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
  201. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  202. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  203. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  205. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
  206. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  207. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  208. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  209. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  210. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
  211. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  212. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
  213. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  214. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
  215. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  216. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  217. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
  218. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
  219. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  220. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  221. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
  222. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  223. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
  224. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  225. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  226. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  227. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
  228. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
  229. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  230. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  231. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  232. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
  233. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
  234. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
  235. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
  236. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
  237. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
  238. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
  239. llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
  240. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  241. {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  242. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
tools/server/tests/unit/test_chat_completion.py
@@ -434,8 +434,8 @@ def test_context_size_exceeded_stream():
 @pytest.mark.parametrize(
     "n_batch,batch_count,reuse_cache",
     [
-        (64, 3, False),
-        (64, 1, True),
+        (64, 4, False),
+        (64, 2, True),
     ]
 )
 def test_return_progress(n_batch, batch_count, reuse_cache):
@@ -462,10 +462,18 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
     res = make_cmpl_request()
     last_progress = None
     total_batch_count = 0
+
     for data in res:
         cur_progress = data.get("prompt_progress", None)
         if cur_progress is None:
             continue
+        if total_batch_count == 0:
+            # first progress report must have n_cache == n_processed
+            assert cur_progress["total"] > 0
+            assert cur_progress["cache"] == cur_progress["processed"]
+            if reuse_cache:
+                # when reusing cache, we expect some cached tokens
+                assert cur_progress["cache"] > 0
         if last_progress is not None:
             assert cur_progress["total"] == last_progress["total"]
             assert cur_progress["cache"] == last_progress["cache"]
@@ -473,6 +481,7 @@ def test_return_progress(n_batch, batch_count, reuse_cache):
         total_batch_count += 1
         last_progress = cur_progress
 
+    # last progress should indicate completion (all tokens processed)
     assert last_progress is not None
     assert last_progress["total"] > 0
     assert last_progress["processed"] == last_progress["total"]
tools/server/tests/unit/test_sleep.py
@@ -0,0 +1,39 @@
+import pytest
+import time
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    server = ServerPreset.tinyllama2()
+
+
+def test_server_sleep():
+    global server
+    server.sleep_idle_seconds = 1
+    server.start()
+
+    # wait a bit so that server can go to sleep
+    time.sleep(2)
+
+    # make sure these endpoints are still responsive after sleep
+    res = server.make_request("GET", "/health")
+    assert res.status_code == 200
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert res.body["is_sleeping"] == True
+
+    # make a generation request to wake up the server
+    res = server.make_request("POST", "/completion", data={
+        "n_predict": 1,
+        "prompt": "Hello",
+    })
+    assert res.status_code == 200
+
+    # it should no longer be sleeping
+    res = server.make_request("GET", "/props")
+    assert res.status_code == 200
+    assert res.body["is_sleeping"] == False
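
The test drives this through the harness; against a manually started server the same sleep/wake behavior can be observed directly. A rough sketch, assuming `llama-server` was launched with the new `--sleep-idle-seconds` flag (the flag name comes from the harness change below; host and port are assumptions):

    import time
    import requests  # assumed HTTP client

    BASE = "http://localhost:8080"  # hypothetical; match your --port

    time.sleep(6)  # exceed an idle window of e.g. --sleep-idle-seconds 5
    assert requests.get(f"{BASE}/props").json()["is_sleeping"] is True

    # any generation request wakes the server back up
    requests.post(f"{BASE}/completion", json={"prompt": "Hi", "n_predict": 1})
    assert requests.get(f"{BASE}/props").json()["is_sleeping"] is False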
tools/server/tests/utils.py
@@ -100,6 +100,7 @@ class ServerProcess:
     server_path: str | None = None
     mmproj_url: str | None = None
     media_path: str | None = None
+    sleep_idle_seconds: int | None = None
 
     # session variables
     process: subprocess.Popen | None = None
@@ -230,6 +231,8 @@ class ServerProcess:
             server_args.extend(["--mmproj-url", self.mmproj_url])
         if self.media_path:
             server_args.extend(["--media-path", self.media_path])
+        if self.sleep_idle_seconds is not None:
+            server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
 
         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"tests: starting server with: {' '.join(args)}")
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte
@@ -89,6 +89,7 @@
   const fallbackToolCalls = $derived(typeof toolCallContent === 'string' ? toolCallContent : null);
 
   const processingState = useProcessingState();
+
   let currentConfig = $derived(config());
   let isRouter = $derived(isRouterMode());
   let displayedModel = $derived((): string | null => {
@@ -116,6 +117,12 @@
     }
   });
 
+  $effect(() => {
+    if (isLoading() && !message?.content?.trim()) {
+      processingState.startMonitoring();
+    }
+  });
+
   function formatToolCallBadge(toolCall: ApiChatCompletionToolCall, index: number) {
     const callNumber = index + 1;
     const functionName = toolCall.function?.name?.trim();
@@ -186,7 +193,7 @@
   <div class="mt-6 w-full max-w-[48rem]" in:fade>
     <div class="processing-container">
       <span class="processing-text">
-        {processingState.getProcessingMessage()}
+        {processingState.getPromptProgressText() ?? processingState.getProcessingMessage()}
       </span>
     </div>
   </div>
@@ -263,6 +270,23 @@
         predictedTokens={message.timings.predicted_n}
         predictedMs={message.timings.predicted_ms}
       />
+    {:else if isLoading() && currentConfig.showMessageStats}
+      {@const liveStats = processingState.getLiveProcessingStats()}
+      {@const genStats = processingState.getLiveGenerationStats()}
+      {@const promptProgress = processingState.processingState?.promptProgress}
+      {@const isStillProcessingPrompt =
+        promptProgress && promptProgress.processed < promptProgress.total}
+
+      {#if liveStats || genStats}
+        <ChatMessageStatistics
+          isLive={true}
+          isProcessingPrompt={!!isStillProcessingPrompt}
+          promptTokens={liveStats?.tokensProcessed}
+          promptMs={liveStats?.timeMs}
+          predictedTokens={genStats?.tokensGenerated}
+          predictedMs={genStats?.timeMs}
+        />
+      {/if}
     {/if}
   </div>
 {/if}
tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte
@@ -5,21 +5,64 @@
   import { ChatMessageStatsView } from '$lib/enums';
 
   interface Props {
-    predictedTokens: number;
-    predictedMs: number;
+    predictedTokens?: number;
+    predictedMs?: number;
     promptTokens?: number;
     promptMs?: number;
+    // Live mode: when true, shows stats during streaming
+    isLive?: boolean;
+    // Whether prompt processing is still in progress
+    isProcessingPrompt?: boolean;
+    // Initial view to show (defaults to READING in live mode)
+    initialView?: ChatMessageStatsView;
   }
 
-  let { predictedTokens, predictedMs, promptTokens, promptMs }: Props = $props();
+  let {
+    predictedTokens,
+    predictedMs,
+    promptTokens,
+    promptMs,
+    isLive = false,
+    isProcessingPrompt = false,
+    initialView = ChatMessageStatsView.GENERATION
+  }: Props = $props();
 
-  let activeView: ChatMessageStatsView = $state(ChatMessageStatsView.GENERATION);
+  let activeView: ChatMessageStatsView = $state(initialView);
+  let hasAutoSwitchedToGeneration = $state(false);
 
-  let tokensPerSecond = $derived((predictedTokens / predictedMs) * 1000);
-  let timeInSeconds = $derived((predictedMs / 1000).toFixed(2));
+  // In live mode: auto-switch to GENERATION tab when prompt processing completes
+  $effect(() => {
+    if (isLive) {
+      // Auto-switch to generation tab only when prompt processing is done (once)
+      if (
+        !hasAutoSwitchedToGeneration &&
+        !isProcessingPrompt &&
+        predictedTokens &&
+        predictedTokens > 0
+      ) {
+        activeView = ChatMessageStatsView.GENERATION;
+        hasAutoSwitchedToGeneration = true;
+      } else if (!hasAutoSwitchedToGeneration) {
+        // Stay on READING while prompt is still being processed
+        activeView = ChatMessageStatsView.READING;
+      }
+    }
+  });
+
+  let hasGenerationStats = $derived(
+    predictedTokens !== undefined &&
+      predictedTokens > 0 &&
+      predictedMs !== undefined &&
+      predictedMs > 0
+  );
+
+  let tokensPerSecond = $derived(hasGenerationStats ? (predictedTokens! / predictedMs!) * 1000 : 0);
+  let timeInSeconds = $derived(
+    predictedMs !== undefined ? (predictedMs / 1000).toFixed(2) : '0.00'
+  );
 
   let promptTokensPerSecond = $derived(
-    promptTokens !== undefined && promptMs !== undefined
+    promptTokens !== undefined && promptMs !== undefined && promptMs > 0
       ? (promptTokens / promptMs) * 1000
       : undefined
   );
@@ -34,11 +77,14 @@
     promptTokensPerSecond !== undefined &&
     promptTimeInSeconds !== undefined
   );
+
+  // In live mode, generation tab is disabled until we have generation stats
+  let isGenerationDisabled = $derived(isLive && !hasGenerationStats);
 </script>
 
 <div class="inline-flex items-center text-xs text-muted-foreground">
   <div class="inline-flex items-center rounded-sm bg-muted-foreground/15 p-0.5">
-    {#if hasPromptStats}
+    {#if hasPromptStats || isLive}
       <Tooltip.Root>
         <Tooltip.Trigger>
           <button
@@ -65,25 +111,32 @@
             class="inline-flex h-5 w-5 items-center justify-center rounded-sm transition-colors {activeView ===
             ChatMessageStatsView.GENERATION
               ? 'bg-background text-foreground shadow-sm'
-              : 'hover:text-foreground'}"
-            onclick={() => (activeView = ChatMessageStatsView.GENERATION)}
+              : isGenerationDisabled
+                ? 'cursor-not-allowed opacity-40'
+                : 'hover:text-foreground'}"
+            onclick={() => !isGenerationDisabled && (activeView = ChatMessageStatsView.GENERATION)}
+            disabled={isGenerationDisabled}
           >
             <Sparkles class="h-3 w-3" />
             <span class="sr-only">Generation</span>
           </button>
         </Tooltip.Trigger>
         <Tooltip.Content>
-          <p>Generation (token output)</p>
+          <p>
+            {isGenerationDisabled
+              ? 'Generation (waiting for tokens...)'
+              : 'Generation (token output)'}
+          </p>
         </Tooltip.Content>
       </Tooltip.Root>
     </div>
 
     <div class="flex items-center gap-1 px-2">
-      {#if activeView === ChatMessageStatsView.GENERATION}
+      {#if activeView === ChatMessageStatsView.GENERATION && hasGenerationStats}
         <BadgeChatStatistic
           class="bg-transparent"
           icon={WholeWord}
-          value="{predictedTokens} tokens"
+          value="{predictedTokens?.toLocaleString()} tokens"
           tooltipLabel="Generated tokens"
         />
         <BadgeChatStatistic
tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte
@@ -185,6 +185,11 @@
         key: 'samplers',
         label: 'Samplers',
         type: 'input'
+      },
+      {
+        key: 'backend_sampling',
+        label: 'Backend sampling',
+        type: 'checkbox'
       }
     ]
   },
tools/server/webui/src/lib/constants/settings-config.ts
@@ -21,6 +21,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
   autoMicOnEmpty: false,
   // make sure these default values are in sync with `common.h`
   samplers: 'top_k;typ_p;top_p;min_p;temperature',
+  backend_sampling: false,
   temperature: 0.8,
   dynatemp_range: 0.0,
   dynatemp_exponent: 1.0,
@@ -57,6 +58,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
     'When copying a message with text attachments, combine them into a single plain text string instead of a special format that can be pasted back as attachments.',
   samplers:
     'The order at which samplers are applied, in simplified way. Default is "top_k;typ_p;top_p;min_p;temperature": top_k->typ_p->top_p->min_p->temperature',
+  backend_sampling:
+    'Enable backend-based samplers. When enabled, supported samplers run on the accelerator backend for faster sampling.',
   temperature:
     'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
   dynatemp_range:
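
Since the webui forwards `backend_sampling` as a plain top-level request field (see the `ChatService` change further below), any client can opt in without the UI. A hedged sketch; the endpoint path and port are assumptions:

    import requests  # assumed HTTP client

    requests.post(
        "http://localhost:8080/v1/chat/completions",  # hypothetical local llama-server
        json={
            "messages": [{"role": "user", "content": "Hello"}],
            # run supported samplers on the accelerator backend (new in this release)
            "backend_sampling": True,
        },
    )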
tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts
@@ -1,10 +1,27 @@
 import { activeProcessingState } from '$lib/stores/chat.svelte';
 import { config } from '$lib/stores/settings.svelte';
 
+export interface LiveProcessingStats {
+  tokensProcessed: number;
+  totalTokens: number;
+  timeMs: number;
+  tokensPerSecond: number;
+  etaSecs?: number;
+}
+
+export interface LiveGenerationStats {
+  tokensGenerated: number;
+  timeMs: number;
+  tokensPerSecond: number;
+}
+
 export interface UseProcessingStateReturn {
   readonly processingState: ApiProcessingState | null;
   getProcessingDetails(): string[];
   getProcessingMessage(): string;
+  getPromptProgressText(): string | null;
+  getLiveProcessingStats(): LiveProcessingStats | null;
+  getLiveGenerationStats(): LiveGenerationStats | null;
   shouldShowDetails(): boolean;
   startMonitoring(): void;
   stopMonitoring(): void;
@@ -29,6 +46,7 @@ export interface UseProcessingStateReturn {
 export function useProcessingState(): UseProcessingStateReturn {
   let isMonitoring = $state(false);
   let lastKnownState = $state<ApiProcessingState | null>(null);
+  let lastKnownProcessingStats = $state<LiveProcessingStats | null>(null);
 
   // Derive processing state reactively from chatStore's direct state
   const processingState = $derived.by(() => {
@@ -46,6 +64,34 @@ export function useProcessingState(): UseProcessingStateReturn {
     }
   });
 
+  // Track last known processing stats for when promptProgress disappears
+  $effect(() => {
+    if (processingState?.promptProgress) {
+      const { processed, total, time_ms, cache } = processingState.promptProgress;
+      const actualProcessed = processed - cache;
+      const actualTotal = total - cache;
+
+      if (actualProcessed > 0 && time_ms > 0) {
+        const tokensPerSecond = actualProcessed / (time_ms / 1000);
+        lastKnownProcessingStats = {
+          tokensProcessed: actualProcessed,
+          totalTokens: actualTotal,
+          timeMs: time_ms,
+          tokensPerSecond
+        };
+      }
+    }
+  });
+
+  function getETASecs(done: number, total: number, elapsedMs: number): number | undefined {
+    const elapsedSecs = elapsedMs / 1000;
+    const progressETASecs =
+      done === 0 || elapsedSecs < 0.5
+        ? undefined // can be the case for the 0% progress report
+        : elapsedSecs * (total / done - 1);
+    return progressETASecs;
+  }
+
   function startMonitoring(): void {
     if (isMonitoring) return;
     isMonitoring = true;
@@ -59,28 +105,25 @@ export function useProcessingState(): UseProcessingStateReturn {
     const currentConfig = config();
     if (!currentConfig.keepStatsVisible) {
       lastKnownState = null;
+      lastKnownProcessingStats = null;
     }
   }
 
   function getProcessingMessage(): string {
-    const state = processingState;
-    if (!state) {
+    if (!processingState) {
       return 'Processing...';
     }
 
-    switch (state.status) {
+    switch (processingState.status) {
       case 'initializing':
         return 'Initializing...';
       case 'preparing':
-        if (state.progressPercent !== undefined) {
-          return `Processing (${state.progressPercent}%)`;
+        if (processingState.progressPercent !== undefined) {
+          return `Processing (${processingState.progressPercent}%)`;
         }
         return 'Preparing response...';
       case 'generating':
-        if (state.tokensDecoded > 0) {
-          return `Generating... (${state.tokensDecoded} tokens)`;
-        }
-        return 'Generating...';
+        return '';
       default:
         return 'Processing...';
     }
@@ -131,8 +174,76 @@ export function useProcessingState(): UseProcessingStateReturn {
   }
 
   function shouldShowDetails(): boolean {
-    const state = processingState;
-    return state !== null && state.status !== 'idle';
+    return processingState !== null && processingState.status !== 'idle';
+  }
+
+  /**
+   * Returns a short progress message with percent
+   */
+  function getPromptProgressText(): string | null {
+    if (!processingState?.promptProgress) return null;
+
+    const { processed, total, cache } = processingState.promptProgress;
+
+    const actualProcessed = processed - cache;
+    const actualTotal = total - cache;
+    const percent = Math.round((actualProcessed / actualTotal) * 100);
+    const eta = getETASecs(actualProcessed, actualTotal, processingState.promptProgress.time_ms);
+
+    if (eta !== undefined) {
+      const etaSecs = Math.ceil(eta);
+      return `Processing ${percent}% (ETA: ${etaSecs}s)`;
+    }
+
+    return `Processing ${percent}%`;
+  }
+
+  /**
+   * Returns live processing statistics for display (prompt processing phase)
+   * Returns last known stats when promptProgress becomes unavailable
+   */
+  function getLiveProcessingStats(): LiveProcessingStats | null {
+    if (processingState?.promptProgress) {
+      const { processed, total, time_ms, cache } = processingState.promptProgress;
+
+      const actualProcessed = processed - cache;
+      const actualTotal = total - cache;
+
+      if (actualProcessed > 0 && time_ms > 0) {
+        const tokensPerSecond = actualProcessed / (time_ms / 1000);
+
+        return {
+          tokensProcessed: actualProcessed,
+          totalTokens: actualTotal,
+          timeMs: time_ms,
+          tokensPerSecond
+        };
+      }
+    }
+
+    // Return last known stats if promptProgress is no longer available
+    return lastKnownProcessingStats;
+  }
+
+  /**
+   * Returns live generation statistics for display (token generation phase)
+   */
+  function getLiveGenerationStats(): LiveGenerationStats | null {
+    if (!processingState) return null;
+
+    const { tokensDecoded, tokensPerSecond } = processingState;
+
+    if (tokensDecoded <= 0) return null;
+
+    // Calculate time from tokens and speed
+    const timeMs =
+      tokensPerSecond && tokensPerSecond > 0 ? (tokensDecoded / tokensPerSecond) * 1000 : 0;
+
+    return {
+      tokensGenerated: tokensDecoded,
+      timeMs,
+      tokensPerSecond: tokensPerSecond || 0
+    };
   }
 
   return {
@@ -141,6 +252,9 @@ export function useProcessingState(): UseProcessingStateReturn {
     },
     getProcessingDetails,
     getProcessingMessage,
+    getPromptProgressText,
+    getLiveProcessingStats,
+    getLiveGenerationStats,
     shouldShowDetails,
     startMonitoring,
     stopMonitoring
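
The `getETASecs` helper above is a constant-rate estimate: if `done` tokens took `elapsed` seconds, the remaining `total - done` tokens should take `elapsed * (total / done - 1)` seconds. The same arithmetic in Python, for illustration only:

    def eta_secs(done: int, total: int, elapsed_ms: float) -> float | None:
        """Constant-rate ETA, mirroring getETASecs above (illustrative sketch)."""
        elapsed = elapsed_ms / 1000
        if done == 0 or elapsed < 0.5:  # too early for a stable estimate
            return None
        return elapsed * (total / done - 1)

    # 512 of 2048 prompt tokens in 4 s -> 3x the work still left -> 12 s remaining
    assert eta_secs(512, 2048, 4000) == 12.0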
tools/server/webui/src/lib/services/chat.ts
@@ -86,6 +86,7 @@ export class ChatService {
       dry_penalty_last_n,
       // Other parameters
       samplers,
+      backend_sampling,
       custom,
       timings_per_token,
       // Config options
@@ -117,7 +118,8 @@ export class ChatService {
         role: msg.role,
         content: msg.content
       })),
-      stream
+      stream,
+      return_progress: stream ? true : undefined
     };
 
     // Include model in request if provided (required in ROUTER mode)
@@ -158,6 +160,8 @@ export class ChatService {
         : samplers;
     }
 
+    if (backend_sampling !== undefined) requestBody.backend_sampling = backend_sampling;
+
     if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;
 
     if (custom) {
@@ -271,7 +275,7 @@ export class ChatService {
     onReasoningChunk?: (chunk: string) => void,
     onToolCallChunk?: (chunk: string) => void,
     onModel?: (model: string) => void,
-    onTimings?: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
+    onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
     conversationId?: string,
     abortSignal?: AbortSignal
   ): Promise<void> {
@@ -366,11 +370,13 @@ export class ChatService {
           onModel?.(chunkModel);
         }
 
-        if (timings || promptProgress) {
+        if (promptProgress) {
+          ChatService.notifyTimings(undefined, promptProgress, onTimings);
+        }
+
+        if (timings) {
           ChatService.notifyTimings(timings, promptProgress, onTimings);
-          if (timings) {
-            lastTimings = timings;
-          }
+          lastTimings = timings;
         }
 
         if (content) {
@@ -768,10 +774,11 @@ export class ChatService {
     timings: ChatMessageTimings | undefined,
     promptProgress: ChatMessagePromptProgress | undefined,
     onTimingsCallback:
-      | ((timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
+      | ((timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void)
       | undefined
   ): void {
-    if (!timings || !onTimingsCallback) return;
+    if (!onTimingsCallback || (!timings && !promptProgress)) return;
+
     onTimingsCallback(timings, promptProgress);
   }
 }
tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -303,11 +303,17 @@ class ChatStore {
     const currentConfig = config();
     const outputTokensMax = currentConfig.max_tokens || -1;
 
+    // Note: for timings data, the n_prompt does NOT include cache tokens
     const contextUsed = promptTokens + cacheTokens + predictedTokens;
     const outputTokensUsed = predictedTokens;
 
+    // Note: for prompt progress, the "processed" DOES include cache tokens
+    // we need to exclude them to get the real prompt tokens processed count
+    const progressCache = promptProgress?.cache || 0;
+    const progressActualDone = (promptProgress?.processed ?? 0) - progressCache;
+    const progressActualTotal = (promptProgress?.total ?? 0) - progressCache;
     const progressPercent = promptProgress
-      ? Math.round((promptProgress.processed / promptProgress.total) * 100)
+      ? Math.round((progressActualDone / progressActualTotal) * 100)
       : undefined;
 
     return {
@@ -324,6 +330,7 @@ class ChatStore {
       topP: currentConfig.top_p ?? 0.95,
       speculative: false,
       progressPercent,
+      promptProgress,
       promptTokens,
       promptMs,
       cacheTokens
@@ -534,7 +541,7 @@ class ChatStore {
         conversationsStore.updateMessageAtIndex(idx, { toolCalls: streamedToolCallContent });
       },
       onModel: (modelName: string) => recordModel(modelName),
-      onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
+      onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
        const tokensPerSecond =
          timings?.predicted_ms && timings?.predicted_n
            ? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1032,7 +1039,7 @@ class ChatStore {
         });
       },
 
-      onTimings: (timings: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
+      onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
        const tokensPerSecond =
          timings?.predicted_ms && timings?.predicted_n
            ? (timings.predicted_n / timings.predicted_ms) * 1000
@@ -1454,6 +1461,8 @@ class ChatStore {
     if (hasValue(currentConfig.dry_penalty_last_n))
       apiOptions.dry_penalty_last_n = Number(currentConfig.dry_penalty_last_n);
     if (currentConfig.samplers) apiOptions.samplers = currentConfig.samplers;
+    if (currentConfig.backend_sampling)
+      apiOptions.backend_sampling = currentConfig.backend_sampling;
     if (currentConfig.custom) apiOptions.custom = currentConfig.custom;
 
     return apiOptions;
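
The cache-exclusion fix above changes what the percentage means: `processed` and `total` both count cached tokens, so a prompt that is mostly cache used to jump straight to a high percentage. Worked numbers mirroring the fixed computation (illustrative only):

    def progress_percent(processed: int, total: int, cache: int) -> int:
        """Mirrors the fixed webui computation: count only newly processed tokens."""
        return round((processed - cache) / (total - cache) * 100)

    # 64 cached tokens, raw progress 80/96: only 16 of 32 new tokens are done
    assert progress_percent(80, 96, 64) == 50   # fixed: 50%
    assert round(80 / 96 * 100) == 83           # old computation: misleading 83%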
tools/server/webui/src/lib/stores/settings.svelte.ts
@@ -294,15 +294,14 @@ class SettingsStore {
    * This sets up the default values from /props endpoint
    */
   syncWithServerDefaults(): void {
-    const serverParams = serverStore.defaultParams;
-    if (!serverParams) {
-      console.warn('No server parameters available for initialization');
+    const propsDefaults = this.getServerDefaults();
+
+    if (Object.keys(propsDefaults).length === 0) {
+      console.warn('No server defaults available for initialization');
 
       return;
     }
 
-    const propsDefaults = this.getServerDefaults();
-
     for (const [key, propsValue] of Object.entries(propsDefaults)) {
       const currentValue = getConfigValue(this.config, key);
 
tools/server/webui/src/lib/types/api.d.ts
@@ -149,6 +149,7 @@ export interface ApiLlamaCppServerProps {
   reasoning_in_content: boolean;
   thinking_forced_open: boolean;
   samplers: string[];
+  backend_sampling: boolean;
   'speculative.n_max': number;
   'speculative.n_min': number;
   'speculative.p_min': number;
@@ -186,6 +187,7 @@ export interface ApiChatCompletionRequest {
   }>;
   stream?: boolean;
   model?: string;
+  return_progress?: boolean;
   // Reasoning parameters
   reasoning_format?: string;
   // Generation parameters
@@ -211,6 +213,7 @@ export interface ApiChatCompletionRequest {
   dry_penalty_last_n?: number;
   // Sampler configuration
   samplers?: string[];
+  backend_sampling?: boolean;
   // Custom parameters (JSON string)
   custom?: Record<string, unknown>;
   timings_per_token?: boolean;
@@ -311,6 +314,7 @@ export interface ApiSlotData {
   reasoning_in_content: boolean;
   thinking_forced_open: boolean;
   samplers: string[];
+  backend_sampling: boolean;
   'speculative.n_max': number;
   'speculative.n_min': number;
   'speculative.p_min': number;
@@ -341,6 +345,7 @@ export interface ApiProcessingState {
   tokensPerSecond?: number;
   // Progress information from prompt_progress
   progressPercent?: number;
+  promptProgress?: ChatMessagePromptProgress;
   promptTokens?: number;
   promptMs?: number;
   cacheTokens?: number;