llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
  3. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
  4. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  7. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
  10. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  12. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
  13. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  14. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  15. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  16. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  17. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  18. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  19. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
  20. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  22. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
  23. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  24. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
  25. vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
  26. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  27. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  28. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  29. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  30. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
  31. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
  32. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  33. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
  34. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
  35. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  36. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  37. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  38. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  39. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  40. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
  41. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
  42. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  113. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
  114. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  115. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  116. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  117. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  118. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  119. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  120. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  121. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  122. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  123. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
  124. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  125. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  126. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
  127. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  128. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
  129. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  130. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
  131. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
  132. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
  133. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
  134. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  135. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  136. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  137. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  138. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  139. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  140. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
  141. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
  142. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  143. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  144. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
  145. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  146. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  147. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  148. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  149. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  150. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
  151. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
  152. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
  153. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
  154. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  155. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
  156. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
  157. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
  158. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
  159. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
  160. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  161. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
  162. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  163. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  164. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  165. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  166. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  167. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  168. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  169. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  170. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  171. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  172. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  173. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  174. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  175. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
  176. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
  177. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  178. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  179. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  180. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  181. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  182. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
  183. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  184. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
  185. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  186. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  187. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  188. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  189. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  190. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  191. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
  192. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  193. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
  194. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
  195. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
  196. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  197. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
  198. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
  199. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
  200. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
  201. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  202. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  203. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  205. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
  206. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  207. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  208. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  209. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  210. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
  211. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  212. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
  213. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  214. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
  215. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  216. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  217. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
  218. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
  219. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  220. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  221. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
  222. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  223. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
  224. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  225. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  226. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  227. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
  228. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
  229. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  230. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  231. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  232. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
  233. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
  234. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
  235. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
  236. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
  237. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
  238. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
  239. llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
  240. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  241. {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  242. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp

@@ -818,6 +818,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_GLMA:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
             } break;
@@ -845,6 +846,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_glm4v>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
+            } break;
         default:
             GGML_ABORT("missing cgraph builder");
     }
@@ -1158,6 +1163,20 @@ struct clip_model_loader {
                     LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                 }
             } break;
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                hparams.n_merge = 2;
+                get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                std::vector<int> wa_layer_indexes_vec;
+                get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
+                for (auto & layer : wa_layer_indexes_vec) {
+                    hparams.wa_layer_indexes.insert(layer);
+                }
+                // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens
+                hparams.set_limit_image_tokens(1, 62500);
+                hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
+            } break;
         case PROJECTOR_TYPE_GLM4V:
             {
                 hparams.rope_theta = 10000.0f;
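
The comment in the hunk above compresses the arithmetic behind the 62500 limit: 8000 px / 16 px per patch / 2 (spatial merge) gives 250 merged positions per side, so a maximal 8000 x 8000 image yields 250 x 250 = 62500 image tokens. A standalone sanity check of that figure (not part of the package; constants mirror the quoted comment):

    #include <cstdio>

    int main() {
        const int max_side   = 8000; // supported max height/width in pixels
        const int patch_size = 16;   // ViT patch edge
        const int merge_size = 2;    // 2x2 spatial merge in the patch merger
        const int per_side   = max_side / patch_size / merge_size; // 250
        printf("max image tokens: %d\n", per_side * per_side);     // prints 62500
        return 0;
    }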
@@ -1176,6 +1195,7 @@ struct clip_model_loader {
         case PROJECTOR_TYPE_QWEN2A:
         case PROJECTOR_TYPE_GLMA:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
                                      model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
@@ -1225,7 +1245,14 @@ struct clip_model_loader {
         LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
         LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
         LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
-        LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+        LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+        if (!hparams.wa_layer_indexes.empty()) {
+            LOG_INF("%s: wa_layer_indexes: ", __func__);
+            for (auto & layer : hparams.wa_layer_indexes) {
+                LOG_INF("%d ", layer);
+            }
+            LOG_INF("\n");
+        }
         if (hparams.image_min_pixels > 0) {
             LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
         }
@@ -1493,6 +1520,14 @@ struct clip_model_loader {
                 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
             } break;
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
+                model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
+                model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+            } break;
         case PROJECTOR_TYPE_GLM4V:
             {
                 model.projection = get_tensor(TN_MM_PROJECTOR);
@@ -1517,6 +1552,14 @@ struct clip_model_loader {
                 model.projection = get_tensor(TN_MM_PROJECTOR);
             } break;
         case PROJECTOR_TYPE_LFM2:
+            {
+                model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
+                model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+                model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+            } break;
         case PROJECTOR_TYPE_KIMIVL:
             {
                 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
@@ -1576,6 +1619,17 @@ struct clip_model_loader {
                 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
                 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
             } break;
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+            {
+                model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
+                model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
+            } break;
         case PROJECTOR_TYPE_INTERNVL:
             {
                 model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -2684,6 +2738,57 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 // res_imgs->data[0] = *res;
                 res_imgs->entries.push_back(std::move(img_f32));
             } break;
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                const int patch_size = params.patch_size;       // typically 16
+                const int merge_size = params.n_merge;          // typically 2
+                const int align_size = patch_size * merge_size; // 32
+
+                const int max_num_patches = params.image_max_pixels > 0 ?
+                    params.image_max_pixels / (patch_size * patch_size) : 256;
+
+                // Linear search for optimal scale to fit within max_num_patches
+                float scale = 1.0f;
+                int target_height = original_size.height;
+                int target_width = original_size.width;
+
+                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
+                    float scaled_size = size * scale;
+                    // Round up to nearest multiple of align_size
+                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
+                    // Ensure at least one patch
+                    return std::max(align_size, aligned);
+                };
+
+                // Linear search with 0.02 step size
+                while (scale > 0.0f) {
+                    target_height = get_scaled_image_size(scale, original_size.height);
+                    target_width = get_scaled_image_size(scale, original_size.width);
+
+                    int num_patches_h = target_height / patch_size;
+                    int num_patches_w = target_width / patch_size;
+                    int num_patches = num_patches_h * num_patches_w;
+
+                    if (num_patches > max_num_patches) {
+                        scale -= 0.02f;
+                    } else {
+                        break;
+                    }
+                }
+
+                clip_image_size new_size = {target_width, target_height};
+
+                // Resize the image
+                clip_image_u8 resized;
+                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
+
+                // Normalize to float32
+                clip_image_f32_ptr img_f32(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
+
+                // Add to results
+                res_imgs->entries.push_back(std::move(img_f32));
+            } break;

        case PROJECTOR_TYPE_IDEFICS3:
            {
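
The YOUTUVL preprocessing added above snaps both target dimensions up to multiples of patch_size * merge_size (32 with the typical values) and walks the scale down in 0.02 steps until the patch count fits the budget. A minimal standalone re-run of that loop, assuming the fallback budget of 256 patches and a hypothetical 1024 x 768 input:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // round up to a multiple of align, but keep at least one aligned unit
    static int scaled_size(float scale, int size, int align) {
        int aligned = static_cast<int>(std::ceil(size * scale / align)) * align;
        return std::max(align, aligned);
    }

    int main() {
        const int patch = 16, merge = 2, align = patch * merge; // align = 32
        const int max_patches = 256; // fallback when image_max_pixels is unset
        const int w = 1024, h = 768; // sample input image

        float scale = 1.0f;
        int tw = w, th = h;
        while (scale > 0.0f) {
            tw = scaled_size(scale, w, align);
            th = scaled_size(scale, h, align);
            if ((tw / patch) * (th / patch) > max_patches) {
                scale -= 0.02f; // same step size as the package code
            } else {
                break;
            }
        }
        // prints: 1024x768 -> 288x224 (252 patches)
        printf("%dx%d -> %dx%d (%d patches)\n", w, h, tw, th, (tw / patch) * (th / patch));
        return 0;
    }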
@@ -2916,6 +3021,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
+        case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
         default:
             break;
@@ -2931,6 +3037,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
+        case PROJECTOR_TYPE_YOUTUVL:
             return (img->ny / params.patch_size) / 2;
         default:
             break;
@@ -2991,6 +3098,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
+        case PROJECTOR_TYPE_YOUTUVL:
             {
                 // dynamic size (2 conv, so double patch size)
                 int x_patch = img->nx / (params.patch_size * 2);
@@ -3031,6 +3139,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             {
                 n_patches = img->nx;

@@ -3117,7 +3226,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int pos_w = image_size_width / patch_size;
     const int pos_h = image_size_height / patch_size;

-    const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl

     auto get_inp_tensor = [&gf](const char * name) {
         ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
@@ -3266,9 +3374,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 set_input_i32("positions", positions);
             } break;
         case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_YOUTUVL:
             {
                 // pw * ph = number of tokens output by ViT after apply patch merger
                 // ipw * ipw = number of vision token been processed inside ViT
+                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
                 const int merge_ratio = 2;
                 const int pw = image_size_width / patch_size / merge_ratio;
                 const int ph = image_size_height / patch_size / merge_ratio;
@@ -3279,7 +3389,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 std::vector<int> inv_idx(ph * pw);

                 if (use_window_attn) {
-                    const int attn_window_size = 112;
+                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
                     const int grid_window = attn_window_size / patch_size / merge_ratio;
                     int dst = 0;
                     // [num_vision_tokens, num_vision_tokens] attention mask tensor
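
The two hunks above decouple the window-attention machinery from Qwen2.5-VL: the window size now comes from the GGUF (falling back to the previously hard-coded 112), and YOUTUVL marks full-attention layers with an explicit index set instead of a repeating pattern. A condensed sketch of both decisions (stand-in struct; the Qwen2.5-VL pattern rule mirrors its existing builder, and the example values are hypothetical):

    #include <cstdio>
    #include <set>

    struct vit_hparams {
        int n_wa_pattern = 0;           // Qwen2.5-VL: every n-th layer uses full attention
        int attn_window_size = 0;       // YOUTUVL: window size read from the GGUF
        std::set<int> wa_layer_indexes; // YOUTUVL: explicit full-attention layers
    };

    static int effective_window_size(const vit_hparams & hp) {
        return hp.attn_window_size > 0 ? hp.attn_window_size : 112; // old fixed value
    }

    static bool layer_uses_full_attn(const vit_hparams & hp, int il, bool is_qwen25vl) {
        if (is_qwen25vl) {
            return hp.n_wa_pattern > 0 && (il + 1) % hp.n_wa_pattern == 0;
        }
        return hp.wa_layer_indexes.count(il) > 0; // as in the youtuvl.cpp layer loop
    }

    int main() {
        vit_hparams hp;
        hp.attn_window_size  = 256;             // hypothetical GGUF value
        hp.wa_layer_indexes  = {7, 15, 23, 31}; // hypothetical layer set
        printf("window=%d, layer 15 full=%d, layer 16 full=%d\n",
               effective_window_size(hp),
               (int) layer_uses_full_attn(hp, 15, false),
               (int) layer_uses_full_attn(hp, 16, false));
        return 0;
    }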
@@ -3403,6 +3513,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_LFM2:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_COGVLM:
             {
@@ -3516,6 +3627,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN25VL:
         case PROJECTOR_TYPE_JANUS_PRO:
+        case PROJECTOR_TYPE_YOUTUVL:
             return ctx->model.mm_1_b->ne[0];
         case PROJECTOR_TYPE_QWEN3VL:
             // main path + deepstack paths
@@ -3526,6 +3638,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.projection->ne[1];
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_INTERNVL:
             return ctx->model.mm_3_w->ne[1];
@@ -3587,7 +3700,8 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
         || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
         || ctx->proj_type() == PROJECTOR_TYPE_GLMA
-        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
+        || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
+        || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
 }

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h

@@ -2,6 +2,11 @@

 #include "../clip-graph.h"

+/*
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
+ */
+
 struct clip_graph_siglip : clip_graph {
     clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
@@ -22,6 +27,11 @@ struct clip_graph_qwen3vl : clip_graph {
     ggml_cgraph * build() override;
 };

+struct clip_graph_youtuvl : clip_graph {
+    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
 struct clip_graph_minicpmv : clip_graph {
     clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
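
Together with the dispatch hunk in clip.cpp above, these declarations show the recipe for wiring in a new vision graph: subclass clip_graph, override build(), and add one case to clip_image_build_graph. A reduced sketch of that shape (all types here are stand-ins for the real ones in clip-graph.h):

    #include <memory>

    struct clip_ctx;       // stand-in forward declarations
    struct clip_image_f32;
    struct ggml_cgraph;

    struct clip_graph {
        clip_graph(clip_ctx *, const clip_image_f32 &) {}
        virtual ~clip_graph() = default;
        virtual ggml_cgraph * build() = 0; // each model builds its own compute graph
    };

    struct clip_graph_youtuvl : clip_graph {
        using clip_graph::clip_graph;
        ggml_cgraph * build() override { return nullptr; } // real body in youtuvl.cpp
    };

    // clip_image_build_graph picks the builder with a switch on the projector type
    std::unique_ptr<clip_graph> make_builder(clip_ctx * ctx, const clip_image_f32 & img) {
        return std::make_unique<clip_graph_youtuvl>(ctx, img);
    }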
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp

@@ -50,10 +50,15 @@ ggml_cgraph * clip_graph_siglip::build() {
     const int scale_factor = model.hparams.n_merge;
     cur = build_patch_merge_permute(cur, scale_factor);

-    // projection
-    cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
-    cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
-    cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+    // projection, in LFM2-VL input norm is optional
+    if (model.mm_input_norm_w) {
+        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+    }
+
+    if (model.mm_input_norm_b) {
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+    }

     cur = build_ffn(cur,
         model.mm_1_w, model.mm_1_b,
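
This guard pairs with the LFM2 loader hunk earlier: tensors fetched with get_tensor(name, false) stay null when a checkpoint omits them, and the graph simply skips the corresponding ops. A toy sketch of that convention (plain stand-ins, not the ggml API):

    #include <cstdio>

    struct tensor { const char * name; };

    // mirrors the guards above: only emit ops whose weights were actually loaded
    static void apply_optional_input_norm(const tensor * w, const tensor * b) {
        if (w) printf("layernorm + scale by %s\n", w->name);
        if (b) printf("shift by %s\n", b->name);
    }

    int main() {
        apply_optional_input_norm(nullptr, nullptr); // checkpoint without input norm: no-op
        tensor w{"mm_input_norm_w"}, b{"mm_input_norm_b"};
        apply_optional_input_norm(&w, &b);           // checkpoint with norm: both ops emitted
        return 0;
    }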
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp

@@ -86,6 +86,15 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
             FFN_GELU_ERF,
             -1);

+    } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+        // projector
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU_ERF,
+            -1);
+
     } else if (proj_type == PROJECTOR_TYPE_GLMA) {
         cur = ggml_norm(ctx0, cur, hparams.eps);
         cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp (new file)

@@ -0,0 +1,179 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_youtuvl::build() {
+    GGML_ASSERT(model.class_embedding == nullptr);
+    const int batch_size = 1;
+    const bool use_window_attn = !hparams.wa_layer_indexes.empty();
+    const int n_pos = n_patches;
+    const int num_position_ids = n_pos * 4;
+    const int m = 2;
+    const int Wp = n_patches_x;
+    const int Hp = n_patches_y;
+    const int Hm = Hp / m;
+    const int Wm = Wp / m;
+    norm_type norm_t = NORM_TYPE_NORMAL;
+
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+    ggml_tensor * inp = build_inp_raw();
+
+    // change conv3d to linear
+    // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
+    {
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            Wm * m * patch_size, m * patch_size, Hm, 3);
+        inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            m * patch_size * 3, Wm, m * patch_size, Hm);
+
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            m * patch_size * 3, patch_size, m, Hm * Wm);
+
+        inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            patch_size, 3, patch_size, Hm * Wm * m * m);
+
+        inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            3 * patch_size * patch_size, Hm * Wm * m * m, 1);
+    }
+    inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+
+    if (model.patch_bias) {
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }
+
+    inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+
+    ggml_tensor * inpL = inp;
+    ggml_tensor * window_mask = nullptr;
+    ggml_tensor * window_idx = nullptr;
+    ggml_tensor * inv_window_idx = nullptr;
+
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+    }
+    if (use_window_attn) {
+        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+        ggml_set_name(inv_window_idx, "inv_window_idx");
+        ggml_set_input(inv_window_idx);
+        // mask for window attention
+        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+        ggml_set_name(window_mask, "window_mask");
+        ggml_set_input(window_mask);
+
+        // if flash attn is used, we need to pad the mask and cast to f16
+        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+        }
+
+        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+        GGML_ASSERT(batch_size == 1);
+        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        const auto & layer = model.layers[il];
+        const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true;
+
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        // self-attention
+        {
+            ggml_tensor * Qcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+            ggml_tensor * Kcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+            ggml_tensor * Vcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+            Qcur = ggml_rope_multi(
+                ctx0, Qcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            Kcur = ggml_rope_multi(
+                ctx0, Kcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+        }
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            nullptr, nullptr,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+
+        inpL = cur;
+    }
+
+    ggml_tensor * embeddings = inpL;
+    if (use_window_attn) {
+        const int spatial_merge_unit = 4;
+        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit);
+        ggml_set_name(window_idx, "window_idx");
+        ggml_set_input(window_idx);
+        GGML_ASSERT(batch_size == 1);
+        embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit);
+        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size);
+        cb(embeddings, "window_order_restored", -1);
+    }
+
+    // post-layernorm (part of Siglip2VisionTransformer, applied after encoder)
+    if (model.post_ln_w) {
+        embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+    }
+
+    // Now apply merger (VLPatchMerger):
+    // 1. Apply RMS norm (ln_q in VLPatchMerger)
+    embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
+    cb(embeddings, "merger_normed", -1);
+
+    // 2. First reshape for spatial merge (merge 2x2 patches)
+    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+    cb(embeddings, "merger_reshaped", -1);
+
+    embeddings = build_ffn(embeddings,
+        model.mm_0_w, model.mm_0_b,
+        nullptr, nullptr,
+        model.mm_1_w, model.mm_1_b,
+        FFN_GELU,
+        -1);
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
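
The final reshape in the merger above, [n_embd, n_pos] to [n_embd * 4, n_pos / 4], is a pure reinterpretation of memory: four consecutive patch rows are concatenated channel-wise into one merged token, which is why the earlier inv_window_idx / window_idx gathers must leave each 2 x 2 spatial block contiguous. A toy illustration with plain arrays standing in for ggml tensors:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd = 2; // toy embedding width
        const int n_pos  = 8; // toy patch count, divisible by 4

        // row-major [n_pos][n_embd]; patch p holds values {p*10, p*10 + 1}
        std::vector<float> emb(n_pos * n_embd);
        for (int p = 0; p < n_pos; ++p)
            for (int e = 0; e < n_embd; ++e)
                emb[p * n_embd + e] = p * 10.0f + e;

        // the reshape is just a new view: merged token t covers patches 4t .. 4t+3
        for (int t = 0; t < n_pos / 4; ++t) {
            printf("merged token %d:", t);
            for (int k = 0; k < 4 * n_embd; ++k)
                printf(" %g", emb[t * 4 * n_embd + k]);
            printf("\n");
        }
        return 0;
    }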
vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp

@@ -283,7 +283,7 @@ struct mtmd_context {
         // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
         img_end = "[IMG_END]";

-    } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
+    } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
         // <|vision_start|> ... (image embeddings) ... <|vision_end|>
         img_beg = "<|vision_start|>";
         img_end = "<|vision_end|>";
@@ -330,6 +330,7 @@ struct mtmd_context {
         case PROJECTOR_TYPE_ULTRAVOX:
         case PROJECTOR_TYPE_VOXTRAL:
         case PROJECTOR_TYPE_GLMA:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
             audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
             break;
         case PROJECTOR_TYPE_LFM2A:
@@ -352,6 +353,9 @@ struct mtmd_context {
         // [BEGIN_AUDIO] ... (embeddings) ...
         aud_beg = "[BEGIN_AUDIO]";

+    } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
+        // <sound> ... (embeddings) ...
+        aud_beg = "<sound>";
     }
 }

vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h

@@ -27,6 +27,9 @@
  * - Make sure the C API is aligned with the libllama C API (as in llama.h)
  * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
  * - Keep the API minimal, do not expose internal details unless necessary
+ *
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
  */

 #ifdef LLAMA_SHARED
vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp

@@ -12,6 +12,7 @@
 #include <cmath>
 #include <cctype>
 #include <algorithm>
+#include <filesystem>

 struct quant_option {
     std::string name;
@@ -643,6 +644,11 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
+        fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
+        return 1;
+    }
+
     print_build_info();

     fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
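
The new guard above relies on the non-throwing overload of std::filesystem::equivalent, which resolves both paths and reports whether they name the same underlying file, setting ec instead of throwing when a path does not exist. A minimal standalone demonstration with hypothetical file names:

    #include <cstdio>
    #include <filesystem>
    #include <string>
    #include <system_error>

    int main() {
        const std::string fname_inp = "model-f16.gguf";
        const std::string fname_out = "./model-f16.gguf"; // same file, different spelling

        std::error_code ec;
        if (std::filesystem::equivalent(fname_inp, fname_out, ec)) {
            fprintf(stderr, "error: input and output files are the same: '%s'\n", fname_inp.c_str());
            return 1;
        }
        // equivalent() only returns true when both paths exist and resolve to the
        // same file; if either is missing it reports through ec instead of throwing
        return 0;
    }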
vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt

@@ -38,14 +38,6 @@ set(TARGET_SRCS
     server-http.h
     server-models.cpp
     server-models.h
-    server-task.cpp
-    server-task.h
-    server-queue.cpp
-    server-queue.h
-    server-common.cpp
-    server-common.h
-    server-context.cpp
-    server-context.h
 )
 set(PUBLIC_ASSETS
     index.html.gz
vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md

@@ -107,6 +107,8 @@ For detailed instructions, see the [test documentation](./tests/README.md).
 - Large-scale code base split into smaller files: https://github.com/ggml-org/llama.cpp/pull/17362
 - Introduction of router mode: https://github.com/ggml-org/llama.cpp/pull/17470
 - Speculative decoding: https://github.com/ggml-org/llama.cpp/pull/17808 and rework in https://github.com/ggml-org/llama.cpp/pull/17808
+- INI presets: https://github.com/ggml-org/llama.cpp/pull/17859 (+ refactoring: https://github.com/ggml-org/llama.cpp/pull/18169)
+- Sleeping mode: https://github.com/ggml-org/llama.cpp/pull/18228