llama-cpp-pydist 0.19.0__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. llama_cpp/binaries/{llama-b7488-bin-win-cpu-x64.zip → llama-b7631-bin-win-cpu-x64.zip} +0 -0
  2. llama_cpp_pydist-0.21.0.dist-info/METADATA +4684 -0
  3. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/RECORD +240 -222
  4. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/WHEEL +1 -1
  5. vendor_llama_cpp_pydist/llama.cpp/.devops/cuda-new.Dockerfile +95 -0
  6. vendor_llama_cpp_pydist/llama.cpp/.gemini/settings.json +1 -0
  7. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +2 -1
  8. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/011-bug-results.yml +13 -2
  9. vendor_llama_cpp_pydist/llama.cpp/.github/ISSUE_TEMPLATE/019-bug-misc.yml +13 -2
  10. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/build.yml +18 -6
  11. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/docker.yml +25 -13
  12. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/release.yml +9 -5
  13. vendor_llama_cpp_pydist/llama.cpp/.github/workflows/server.yml +18 -0
  14. vendor_llama_cpp_pydist/llama.cpp/AGENTS.md +81 -0
  15. vendor_llama_cpp_pydist/llama.cpp/CLAUDE.md +1 -0
  16. vendor_llama_cpp_pydist/llama.cpp/CONTRIBUTING.md +34 -5
  17. vendor_llama_cpp_pydist/llama.cpp/ci/run.sh +2 -1
  18. vendor_llama_cpp_pydist/llama.cpp/common/CMakeLists.txt +4 -3
  19. vendor_llama_cpp_pydist/llama.cpp/common/arg.cpp +46 -14
  20. vendor_llama_cpp_pydist/llama.cpp/common/arg.h +1 -0
  21. vendor_llama_cpp_pydist/llama.cpp/common/chat-parser.cpp +11 -0
  22. vendor_llama_cpp_pydist/llama.cpp/common/chat.cpp +36 -7
  23. vendor_llama_cpp_pydist/llama.cpp/common/chat.h +1 -0
  24. vendor_llama_cpp_pydist/llama.cpp/common/common.cpp +42 -23
  25. vendor_llama_cpp_pydist/llama.cpp/common/common.h +7 -2
  26. vendor_llama_cpp_pydist/llama.cpp/common/llguidance.cpp +10 -6
  27. vendor_llama_cpp_pydist/llama.cpp/common/regex-partial.cpp +13 -13
  28. vendor_llama_cpp_pydist/llama.cpp/common/sampling.cpp +58 -14
  29. vendor_llama_cpp_pydist/llama.cpp/common/sampling.h +3 -1
  30. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py +424 -103
  31. vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf_update.py +5 -0
  32. vendor_llama_cpp_pydist/llama.cpp/docs/backend/CANN.md +4 -0
  33. vendor_llama_cpp_pydist/llama.cpp/docs/backend/OPENCL.md +51 -1
  34. vendor_llama_cpp_pydist/llama.cpp/docs/backend/SYCL.md +1 -1
  35. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/README.md +5 -5
  36. vendor_llama_cpp_pydist/llama.cpp/docs/backend/hexagon/developer.md +1 -1
  37. vendor_llama_cpp_pydist/llama.cpp/docs/build.md +21 -2
  38. vendor_llama_cpp_pydist/llama.cpp/docs/development/parsing.md +2 -2
  39. vendor_llama_cpp_pydist/llama.cpp/docs/ops/Metal.csv +360 -322
  40. vendor_llama_cpp_pydist/llama.cpp/docs/ops.md +1 -1
  41. vendor_llama_cpp_pydist/llama.cpp/ggml/CMakeLists.txt +13 -1
  42. vendor_llama_cpp_pydist/llama.cpp/ggml/include/ggml-backend.h +1 -1
  43. vendor_llama_cpp_pydist/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  44. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-backend.cpp +11 -11
  45. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +303 -19
  46. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +17 -0
  47. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/common.h +153 -9
  48. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +51 -158
  49. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  50. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  51. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  52. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  53. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  54. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  55. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +66 -1
  56. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cu +48 -27
  57. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/argsort.cuh +16 -0
  58. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/common.cuh +45 -9
  59. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cpy.cu +117 -103
  60. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/cumsum.cu +105 -35
  61. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +3 -1
  62. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +2 -2
  63. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +83 -33
  64. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mean.cu +3 -0
  65. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mma.cuh +21 -0
  66. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cu +34 -8
  67. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +168 -13
  68. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cu +151 -0
  69. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/quantize.cuh +14 -0
  70. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/softmax.cu +203 -6
  71. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cu +96 -0
  72. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/top-k.cuh +3 -0
  73. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +17 -2
  74. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +6 -1
  75. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  76. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  77. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  78. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +224 -758
  79. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +316 -164
  80. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +5 -11
  81. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +46 -15
  82. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +9 -3
  83. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +2 -1
  84. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +20 -20
  85. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +1 -0
  86. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  87. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-impl.h +0 -4
  88. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +57 -0
  89. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +2 -0
  90. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +5 -0
  91. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +20 -0
  92. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +71 -2
  93. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +1 -0
  94. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +73 -6
  95. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +134 -13
  96. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +21 -0
  97. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +13 -0
  98. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +14 -7
  99. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +42 -1
  100. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +742 -315
  101. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_experts.comp +51 -0
  102. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum.comp +28 -14
  103. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass1.comp +60 -0
  104. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cumsum_multipass2.comp +66 -0
  105. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.glsl +1 -7
  106. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.glsl +2 -0
  107. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +17 -4
  108. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +42 -24
  109. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +11 -0
  110. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq_funcs.glsl +115 -0
  111. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +10 -4
  112. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +29 -18
  113. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +19 -16
  114. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
  115. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +10 -4
  116. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +8 -8
  117. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +11 -4
  118. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +4 -1
  119. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +4 -1
  120. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +4 -1
  121. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +1 -0
  122. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +4 -1
  123. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +57 -22
  124. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.glsl +312 -6
  125. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +54 -0
  126. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -2
  127. vendor_llama_cpp_pydist/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +35 -0
  128. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/constants.py +99 -0
  129. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/gguf_writer.py +38 -2
  130. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/tensor_mapping.py +26 -0
  131. vendor_llama_cpp_pydist/llama.cpp/gguf-py/gguf/utility.py +0 -8
  132. vendor_llama_cpp_pydist/llama.cpp/grammars/README.md +3 -0
  133. vendor_llama_cpp_pydist/llama.cpp/include/llama.h +99 -12
  134. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-cli.sh +9 -9
  135. vendor_llama_cpp_pydist/llama.cpp/scripts/snapdragon/adb/run-completion.sh +53 -0
  136. vendor_llama_cpp_pydist/llama.cpp/scripts/sync-ggml.last +1 -1
  137. vendor_llama_cpp_pydist/llama.cpp/src/CMakeLists.txt +4 -0
  138. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.cpp +12 -3
  139. vendor_llama_cpp_pydist/llama.cpp/src/llama-adapter.h +7 -1
  140. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.cpp +76 -0
  141. vendor_llama_cpp_pydist/llama.cpp/src/llama-arch.h +7 -0
  142. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.cpp +11 -0
  143. vendor_llama_cpp_pydist/llama.cpp/src/llama-chat.h +1 -0
  144. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.cpp +625 -40
  145. vendor_llama_cpp_pydist/llama.cpp/src/llama-context.h +43 -1
  146. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.cpp +40 -13
  147. vendor_llama_cpp_pydist/llama.cpp/src/llama-grammar.h +2 -0
  148. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.cpp +166 -2
  149. vendor_llama_cpp_pydist/llama.cpp/src/llama-graph.h +71 -6
  150. vendor_llama_cpp_pydist/llama.cpp/src/llama-hparams.h +6 -5
  151. vendor_llama_cpp_pydist/llama.cpp/src/llama-kv-cache.h +1 -1
  152. vendor_llama_cpp_pydist/llama.cpp/src/llama-mmap.cpp +11 -4
  153. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.cpp +23 -0
  154. vendor_llama_cpp_pydist/llama.cpp/src/llama-model-loader.h +2 -0
  155. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.cpp +329 -26
  156. vendor_llama_cpp_pydist/llama.cpp/src/llama-model.h +13 -2
  157. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.cpp +1259 -186
  158. vendor_llama_cpp_pydist/llama.cpp/src/llama-sampling.h +19 -7
  159. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.cpp +101 -33
  160. vendor_llama_cpp_pydist/llama.cpp/src/llama-vocab.h +2 -0
  161. vendor_llama_cpp_pydist/llama.cpp/src/llama.cpp +53 -38
  162. vendor_llama_cpp_pydist/llama.cpp/src/models/afmoe.cpp +9 -5
  163. vendor_llama_cpp_pydist/llama.cpp/src/models/bert.cpp +4 -2
  164. vendor_llama_cpp_pydist/llama.cpp/src/models/cogvlm.cpp +5 -3
  165. vendor_llama_cpp_pydist/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  166. vendor_llama_cpp_pydist/llama.cpp/src/models/deepseek2.cpp +1 -1
  167. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  168. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  169. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3.cpp +3 -4
  170. vendor_llama_cpp_pydist/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  171. vendor_llama_cpp_pydist/llama.cpp/src/models/llama-iswa.cpp +6 -2
  172. vendor_llama_cpp_pydist/llama.cpp/src/models/llama.cpp +19 -6
  173. vendor_llama_cpp_pydist/llama.cpp/src/models/maincoder.cpp +117 -0
  174. vendor_llama_cpp_pydist/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  175. vendor_llama_cpp_pydist/llama.cpp/src/models/models.h +18 -0
  176. vendor_llama_cpp_pydist/llama.cpp/src/models/modern-bert.cpp +116 -0
  177. vendor_llama_cpp_pydist/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  178. vendor_llama_cpp_pydist/llama.cpp/src/models/plamo3.cpp +128 -0
  179. vendor_llama_cpp_pydist/llama.cpp/src/models/smallthinker.cpp +11 -5
  180. vendor_llama_cpp_pydist/llama.cpp/src/unicode.cpp +23 -14
  181. vendor_llama_cpp_pydist/llama.cpp/tests/CMakeLists.txt +12 -2
  182. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-ops.cpp +286 -65
  183. vendor_llama_cpp_pydist/llama.cpp/tests/test-backend-sampler.cpp +1237 -0
  184. vendor_llama_cpp_pydist/llama.cpp/tests/test-chat.cpp +29 -3
  185. vendor_llama_cpp_pydist/llama.cpp/tests/test-grammar-llguidance.cpp +3 -0
  186. vendor_llama_cpp_pydist/llama.cpp/tests/test-regex-partial.cpp +14 -14
  187. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-0.cpp +1 -1
  188. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-bpe.cpp +1 -1
  189. vendor_llama_cpp_pydist/llama.cpp/tests/test-tokenizer-1-spm.cpp +1 -1
  190. vendor_llama_cpp_pydist/llama.cpp/tools/batched-bench/batched-bench.cpp +11 -0
  191. vendor_llama_cpp_pydist/llama.cpp/tools/cli/README.md +187 -1
  192. vendor_llama_cpp_pydist/llama.cpp/tools/cli/cli.cpp +1 -3
  193. vendor_llama_cpp_pydist/llama.cpp/tools/completion/README.md +179 -7
  194. vendor_llama_cpp_pydist/llama.cpp/tools/completion/completion.cpp +4 -1
  195. vendor_llama_cpp_pydist/llama.cpp/tools/fit-params/fit-params.cpp +3 -3
  196. vendor_llama_cpp_pydist/llama.cpp/tools/llama-bench/llama-bench.cpp +18 -1
  197. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/CMakeLists.txt +1 -0
  198. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-impl.h +12 -7
  199. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip-model.h +3 -1
  200. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/clip.cpp +118 -4
  201. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/models.h +10 -0
  202. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/siglip.cpp +9 -4
  203. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/whisper-enc.cpp +9 -0
  204. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/models/youtuvl.cpp +179 -0
  205. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.cpp +5 -1
  206. vendor_llama_cpp_pydist/llama.cpp/tools/mtmd/mtmd.h +3 -0
  207. vendor_llama_cpp_pydist/llama.cpp/tools/quantize/quantize.cpp +6 -0
  208. vendor_llama_cpp_pydist/llama.cpp/tools/server/CMakeLists.txt +0 -8
  209. vendor_llama_cpp_pydist/llama.cpp/tools/server/README-dev.md +2 -0
  210. vendor_llama_cpp_pydist/llama.cpp/tools/server/README.md +27 -14
  211. vendor_llama_cpp_pydist/llama.cpp/tools/server/public/index.html.gz +0 -0
  212. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.cpp +22 -24
  213. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-common.h +2 -3
  214. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.cpp +453 -267
  215. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-context.h +52 -15
  216. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-http.cpp +16 -10
  217. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.cpp +174 -62
  218. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-models.h +14 -5
  219. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.cpp +78 -21
  220. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-queue.h +48 -10
  221. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.cpp +36 -11
  222. vendor_llama_cpp_pydist/llama.cpp/tools/server/server-task.h +28 -35
  223. vendor_llama_cpp_pydist/llama.cpp/tools/server/server.cpp +9 -5
  224. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_chat_completion.py +11 -2
  225. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/unit/test_sleep.py +39 -0
  226. vendor_llama_cpp_pydist/llama.cpp/tools/server/tests/utils.py +3 -0
  227. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte +25 -1
  228. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte +66 -13
  229. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte +5 -0
  230. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts +3 -0
  231. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts +125 -11
  232. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/services/chat.ts +15 -8
  233. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts +12 -3
  234. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts +4 -5
  235. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/api.d.ts +5 -0
  236. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/types/settings.d.ts +2 -1
  237. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts +1 -4
  238. vendor_llama_cpp_pydist/llama.cpp/tools/server/webui/src/routes/+layout.svelte +1 -1
  239. llama_cpp_pydist-0.19.0.dist-info/METADATA +0 -2506
  240. vendor_llama_cpp_pydist/llama.cpp/.github/copilot-instructions.md +0 -262
  241. {llama_cpp_pydist-0.19.0.dist-info/licenses → llama_cpp_pydist-0.21.0.dist-info}/LICENSE +0 -0
  242. {llama_cpp_pydist-0.19.0.dist-info → llama_cpp_pydist-0.21.0.dist-info}/top_level.txt +0 -0
vendor_llama_cpp_pydist/llama.cpp/convert_hf_to_gguf.py
@@ -141,16 +141,24 @@ class ModelBase:
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
 
-        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
+        # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
+        # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
         if self.ftype == gguf.LlamaFileType.GUESSED:
-            # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
-            _, first_tensor = next(self.get_tensors())
-            if first_tensor.dtype == torch.float16:
-                logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
-                self.ftype = gguf.LlamaFileType.MOSTLY_F16
+            for _, tensor in self.get_tensors():
+                if tensor.dim() < 2:
+                    continue
+
+                if tensor.dtype == torch.bfloat16:
+                    self.ftype = gguf.LlamaFileType.MOSTLY_BF16
+                    logger.info("heuristics detected bfloat16 tensor dtype, setting --outtype bf16")
+                    break
+                elif tensor.dtype == torch.float16:
+                    self.ftype = gguf.LlamaFileType.MOSTLY_F16
+                    logger.info("heuristics detected float16 tensor dtype, setting --outtype f16")
+                    break
             else:
-                logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
-                self.ftype = gguf.LlamaFileType.MOSTLY_BF16
+                self.ftype = gguf.LlamaFileType.MOSTLY_F16
+                logger.info("heuristics unable to detect tensor dtype, defaulting to --outtype f16")
 
         self.dequant_model()
 
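Note on the hunk above: with the output type left as GUESSED, the converter now scans tensors in order, skips anything with fewer than two dimensions, and keys the choice off the first bf16 or f16 tensor it finds, falling back to f16. The snippet below is a standalone sketch of that selection rule only; the function name guess_outtype and the tensor list are made up for illustration, and the real code sets self.ftype and logs instead of returning a string.

    import torch

    def guess_outtype(named_tensors):
        # First tensor with >= 2 dims and a bf16/f16 dtype decides the output type.
        for _, tensor in named_tensors:
            if tensor.dim() < 2:
                continue  # skip 1-D tensors such as norms and biases
            if tensor.dtype == torch.bfloat16:
                return "bf16"
            if tensor.dtype == torch.float16:
                return "f16"
        return "f16"  # nothing matched (e.g. an all-f32 checkpoint), default to f16

    # Illustrative, made-up tensor list:
    example = [
        ("output_norm.weight", torch.ones(8)),                              # 1-D, skipped
        ("blk.0.attn_q.weight", torch.zeros(8, 8, dtype=torch.bfloat16)),   # decides bf16
    ]
    print(guess_outtype(example))  # -> bf16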
@@ -763,9 +771,14 @@ class TextModel(ModelBase):
 
         self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
 
+        rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
+        local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
+
         # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
         if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
-            if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
+            if local_rope_theta is not None:
+                self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
+            if "rope_theta" not in self.rope_parameters and rope_theta is not None:
                 self.rope_parameters["rope_theta"] = rope_theta
             if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
                 self.rope_parameters["rope_type"] = rope_type
@@ -831,6 +844,7 @@ class TextModel(ModelBase):
             self.gguf_writer.add_head_count_kv(n_head_kv)
             logger.info(f"gguf: key-value head count = {n_head_kv}")
 
+        # TODO: Handle "sliding_attention" similarly when models start implementing it
         rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
         if (rope_type := rope_params.get("rope_type")) is not None:
             rope_factor = rope_params.get("factor")
@@ -877,6 +891,9 @@ class TextModel(ModelBase):
         if (rope_theta := rope_params.get("rope_theta")) is not None:
             self.gguf_writer.add_rope_freq_base(rope_theta)
             logger.info(f"gguf: rope theta = {rope_theta}")
+        if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
+            self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
+            logger.info(f"gguf: rope theta swa = {local_rope_theta}")
         if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
             self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
             logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
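Taken together, the two rope-theta hunks above mirror a model's global and local (sliding-window) RoPE bases into rope_parameters and then write them out as rope_freq_base and rope_freq_base_swa. Below is a minimal sketch of the mirroring step using a made-up hparams dict and a plain dict in place of the gguf writer:

    hparams = {"rope_theta": 1000000.0, "local_rope_theta": 10000.0}  # made-up values

    rope_parameters = {}
    rope_theta = hparams.get("rope_theta")
    local_rope_theta = hparams.get("local_rope_theta")

    if "full_attention" not in rope_parameters and "sliding_attention" not in rope_parameters:
        if local_rope_theta is not None:
            rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
        if "rope_theta" not in rope_parameters and rope_theta is not None:
            rope_parameters["rope_theta"] = rope_theta

    # rope_freq_base     <- rope_parameters["rope_theta"]                       (1000000.0)
    # rope_freq_base_swa <- rope_parameters["sliding_attention"]["rope_theta"]  (10000.0)
    print(rope_parameters)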
@@ -1054,6 +1071,9 @@ class TextModel(ModelBase):
         if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
             # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
             res = "grok-2"
+        if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
+            # ref: https://huggingface.co/aari1995/German_Semantic_V3
+            res = "jina-v2-de"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -1204,6 +1224,9 @@ class TextModel(ModelBase):
         if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
             # ref: https://huggingface.co/JetBrains/Mellum-4b-base
             res = "mellum"
+        if chkhsh == "a0b64b4385f123663873756336c085744376d015ff328bb1d901598f63c44152":
+            # ref: https://huggingface.co/answerdotai/ModernBERT-base
+            res = "modern-bert"
         if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
             # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
             res = "afmoe"
@@ -1219,6 +1242,12 @@ class TextModel(ModelBase):
         if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
             # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
             res = "kormo"
+        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
+            # ref: https://huggingface.co/tencent/Youtu-LLM-2B
+            res = "youtu"
+        if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
+            # ref: https://huggingface.co/upstage/Solar-Open-100B
+            res = "solar-open"
 
         if res is None:
             logger.warning("\n")
@@ -1685,6 +1714,84 @@ class TextModel(ModelBase):
         if template is not None:
             self.gguf_writer.add_chat_template(template)
 
+    def _set_vocab_plamo(self):
+        # PLaMo models use a custom tokenizer with a .jsonl file
+        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
+        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
+
+        if not tokenizer_jsonl_path.is_file():
+            raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}")
+
+        # Load tokenizer config
+        with open(tokenizer_config_path, "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # Load tokens from JSONL file (actually a list format)
+        tokens = []
+        scores = []
+        toktypes = []
+
+        with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f:
+            for line_num, line in enumerate(f):
+                if line.strip():
+                    token_data = json.loads(line)
+                    # Format: [token, score, type, ?, ?, ?, ?]
+                    token = token_data[0].encode("utf-8")
+                    score = float(token_data[1])
+                    token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
+
+                    tokens.append(token)
+                    scores.append(score)
+
+                    if token_type_str == "UNKNOWN":
+                        toktypes.append(gguf.TokenType.UNKNOWN)
+                    elif token_type_str == "CONTROL":
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    elif token_type_str == "BYTE":
+                        toktypes.append(gguf.TokenType.BYTE)
+                    else:
+                        token_str = token_data[0]
+                        if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
+                            toktypes.append(gguf.TokenType.CONTROL)
+                        else:
+                            toktypes.append(gguf.TokenType.NORMAL)
+
+        vocab_size = self.hparams["vocab_size"]
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("plamo2")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
+            self.gguf_writer.add_bos_token_id(token_id)
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
+            self.gguf_writer.add_eos_token_id(token_id)
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
+            token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
+            self.gguf_writer.add_pad_token_id(token_id)
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
+            token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
+            self.gguf_writer.add_sep_token_id(token_id)
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
+            token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
+            self.gguf_writer.add_unk_token_id(token_id)
+
+        # Add <|plamo:op|> as EOT to ensure appropriate end of generation
+        self.gguf_writer.add_eot_token_id(4)
+
+        self.gguf_writer.add_add_space_prefix(False)
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
@@ -2397,6 +2504,7 @@ class StableLMModel(TextModel):
                     "VLlama3ForCausalLM",
                     "LlavaForConditionalGeneration",
                     "VoxtralForConditionalGeneration",
+                    "IQuestCoderForCausalLM",
                     "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
@@ -3414,7 +3522,7 @@ class QwenModel(TextModel):
         self._set_vocab_qwen()
 
 
-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
 class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2
 
@@ -4787,87 +4895,7 @@ class Plamo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.PLAMO2
 
     def set_vocab(self):
-        # PLaMo 2 uses a custom tokenizer with a .jsonl file
-        # We need to handle this specially
-        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
-        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
-
-        if not tokenizer_jsonl_path.is_file():
-            raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
-
-        # Load tokenizer config
-        with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
-            tokenizer_config = json.load(f)
-
-        # Load tokens from JSONL file (actually a list format)
-        tokens = []
-        scores = []
-        toktypes = []
-
-        with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f):
-                if line.strip():
-                    token_data = json.loads(line)
-                    # Format: [token, score, type, ?, ?, ?, ?]
-                    token = token_data[0].encode("utf-8")
-                    score = float(token_data[1])
-                    token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
-
-                    tokens.append(token)
-                    scores.append(score)
-
-                    # Map token type strings to GGUF token types
-                    if token_type_str == "UNKNOWN":
-                        toktypes.append(gguf.TokenType.UNKNOWN)
-                    elif token_type_str == "CONTROL":
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    elif token_type_str == "BYTE":
-                        toktypes.append(gguf.TokenType.BYTE)
-                    else:
-                        # Check for PLaMo-2 special tokens
-                        token_str = token_data[0]
-                        if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
-                            toktypes.append(gguf.TokenType.CONTROL)
-                        else:
-                            toktypes.append(gguf.TokenType.NORMAL)
-
-        vocab_size = self.hparams["vocab_size"]
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(gguf.TokenType.UNUSED)
-
-        # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
-        self.gguf_writer.add_tokenizer_model("plamo2")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        # Add special tokens from config
-        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
-            token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
-            self.gguf_writer.add_bos_token_id(token_id)
-        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
-            token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
-            self.gguf_writer.add_eos_token_id(token_id)
-        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
-            token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
-            self.gguf_writer.add_pad_token_id(token_id)
-        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
-            token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
-            self.gguf_writer.add_sep_token_id(token_id)
-        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
-            token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
-            self.gguf_writer.add_unk_token_id(token_id)
-
-        # Add <|plamo:op|> as EOT to ensure appropriate end of generation
-        self.gguf_writer.add_eot_token_id(4)
-
-        self.gguf_writer.add_add_space_prefix(False)
+        self._set_vocab_plamo()
 
     def set_gguf_parameters(self):
         hparams = self.hparams
@@ -4955,6 +4983,55 @@ class Plamo2Model(TextModel):
         return [(new_name, data_torch)]
 
 
+@ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM")
+class Plamo3Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.PLAMO3
+
+    def set_vocab(self):
+        self._set_vocab_plamo()
+
+        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
+        tokenizer_config = {}
+
+        if tokenizer_config_path.is_file():
+            with open(tokenizer_config_path, encoding="utf-8") as f:
+                tokenizer_config = json.load(f)
+
+        chat_template = tokenizer_config.get("chat_template")
+        chat_template_jinja = self.dir_model / "chat_template.jinja"
+
+        if chat_template_jinja.is_file():
+            with open(chat_template_jinja, encoding="utf-8") as f:
+                chat_template = f.read()
+
+        if chat_template:
+            self.gguf_writer.add_chat_template(chat_template)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
+            self.gguf_writer.add_sliding_window(sliding_window)
+        self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        if name.endswith(".pre_mixer_norm.weight"):
+            data_torch = data_torch + 1.0
+        elif name.endswith(".post_mixer_norm.weight"):
+            data_torch = data_torch + 1.0 / 5
+        elif name.endswith(".pre_mlp_norm.weight"):
+            data_torch = data_torch + 1.0
+        elif name.endswith(".post_mlp_norm.weight"):
+            data_torch = data_torch + 1.0 / (5**1.5)
+        elif name.endswith((".mixer.q_norm.weight", ".mixer.k_norm.weight")):
+            data_torch = data_torch + 1.0
+        elif name.endswith(".norm.weight"):
+            data_torch = data_torch + 1.0
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register("CodeShellForCausalLM")
 class CodeShellModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CODESHELL
@@ -5225,13 +5302,14 @@ class BertModel(TextModel):
         self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 
         # convert to phantom space vocab
-        def phantom(tok):
-            if tok.startswith("[") and tok.endswith("]"):
+        def phantom(tok, toktype):
+            if toktype == gguf.TokenType.CONTROL:
                 return tok
             if tok.startswith("##"):
                 return tok[2:]
             return "\u2581" + tok
-        tokens = list(map(phantom, tokens))
+        assert len(tokens) == len(toktypes)
+        tokens = list(map(phantom, tokens, toktypes))
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
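The BertModel change above switches the phantom-space conversion from a bracket-name heuristic to the token type: control tokens pass through untouched, "##" continuations are stripped, and every other token gets a U+2581 prefix. A small self-contained sketch of that mapping; the CONTROL and NORMAL constants below are stand-ins for gguf.TokenType members, chosen only for the example:

    CONTROL, NORMAL = "control", "normal"  # stand-ins for gguf.TokenType values

    def phantom(tok, toktype):
        if toktype == CONTROL:
            return tok
        if tok.startswith("##"):
            return tok[2:]
        return "\u2581" + tok

    print(phantom("[CLS]", CONTROL))  # '[CLS]'       (control token kept verbatim)
    print(phantom("##ing", NORMAL))   # 'ing'          (WordPiece continuation, prefix stripped)
    print(phantom("hello", NORMAL))   # '\u2581hello'  (word-start token gets the phantom space)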
@@ -6345,6 +6423,17 @@ class ARwkv7Model(Rwkv7Model):
         self.gguf_writer.add_head_count(0)
 
 
+@ModelBase.register("MaincoderForCausalLM")
+class MaincoderModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.MAINCODER
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_rope_dimension_count(head_dim)
+
+
 @ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MAMBA
@@ -7122,6 +7211,7 @@ class DeepseekModel(TextModel):
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
     "KimiVLForConditionalGeneration",
+    "YoutuForCausalLM",
 )
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -7188,7 +7278,15 @@ class DeepseekV2Model(TextModel):
         super().set_gguf_parameters()
         hparams = self.hparams
 
-        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        # first_k_dense_replace: number of leading layers using dense FFN instead of MoE
+        # For non-MoE models (like Youtu), set to n_layer to use dense FFN for all layers
+        # For MoE models (like DeepSeek-V2), this is the number of leading non-MoE layers
+        has_moe = hparams.get("n_routed_experts") is not None
+        first_k_dense_replace = hparams.get("first_k_dense_replace")
+        if first_k_dense_replace is None:
+            # Default: if no MoE, all layers are dense; if MoE, none are dense
+            first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
+        self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
             self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
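To make the defaulting rule above concrete, here is the same logic as a standalone function, exercised with two made-up configurations (one dense Youtu-style model, one DeepSeek-V2-style MoE config that sets the field explicitly):

    def leading_dense_blocks(hparams):
        has_moe = hparams.get("n_routed_experts") is not None
        first_k_dense_replace = hparams.get("first_k_dense_replace")
        if first_k_dense_replace is None:
            # no MoE -> every layer is dense; MoE without the field -> no leading dense layers
            first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
        return first_k_dense_replace

    print(leading_dense_blocks({"num_hidden_layers": 24}))                 # 24: dense-only model
    print(leading_dense_blocks({"num_hidden_layers": 60,
                                "n_routed_experts": 160,
                                "first_k_dense_replace": 1}))              # 1: explicit MoE config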
@@ -7200,11 +7298,24 @@ class DeepseekV2Model(TextModel):
         self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
 
-        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
-        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
-        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+        # MoE parameters (required by C++ code for DEEPSEEK2 arch)
+        # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
+        moe_intermediate_size = self.find_hparam(["moe_intermediate_size", "intermediate_size"], optional=False)
+        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+
+        if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
+
+        # expert_shared_count is required by C++ code, default to 0 for non-MoE models
+        n_shared_experts = hparams.get("n_shared_experts", 0)
+        self.gguf_writer.add_expert_shared_count(n_shared_experts)
+
+        # When not set, C++ code will use scale_w = false to skip the no-op scaling
+        if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+
+        if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
@@ -7220,10 +7331,17 @@ class DeepseekV2Model(TextModel):
         # skip vision tensors and remove "language_model." for Kimi-VL
         if "vision_tower" in name or "multi_modal_projector" in name:
             return []
-
+        if name.startswith("siglip2.") or name.startswith("merger."):
+            return []
         if name.startswith("language_model."):
             name = name.replace("language_model.", "")
 
+        # skip lm_head.weight if tie_word_embeddings is True
+        if self.hparams.get("tie_word_embeddings", False):
+            if name == "lm_head.weight" or name == "model.lm_head.weight":
+                logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
+                return []
+
         # rename e_score_correction_bias tensors
         if name.endswith("e_score_correction_bias"):
             name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -7351,6 +7469,89 @@ class MiniMaxM2Model(TextModel):
         return super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("MiMoV2FlashForCausalLM")
+class MimoV2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MIMO2
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        assert self.hparams["swa_head_dim"] == self.hparams["head_dim"]
+        assert self.hparams["swa_num_attention_heads"] == self.hparams["num_attention_heads"]
+        assert self.hparams["swa_v_head_dim"] == self.hparams["v_head_dim"]
+        assert self.hparams["topk_method"] == "noaux_tc"
+
+        n_head_kv = self.hparams["num_key_value_heads"]
+        n_head_kv_swa = self.hparams["swa_num_key_value_heads"]
+        n_head_kv_arr = [n_head_kv_swa if use_swa == 1 else n_head_kv for use_swa in self.hparams["hybrid_layer_pattern"]]
+        self.gguf_writer.add_head_count_kv(n_head_kv_arr)
+
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+        self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
+        self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
+        self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+
+        rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch, name, bid):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        if "attention_sink" in name and not name.endswith(".weight"):
+            name += ".weight"
+
+        # TODO: mimo v2 does not indicate the number of next-token-prediction layers, therefore we cannot do the same way as GLM4_MOE
+        if "model.mtp." in name:
+            return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("PanguEmbeddedForCausalLM")
 class PanguEmbeddedModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PANGU_EMBED
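The expert-merging loop in the MimoV2Model hunk above collects each layer's per-expert 2-D projection weights and stacks them into a single [n_expert, rows, cols] tensor before mapping the merged name. A minimal sketch of just that stacking step, with made-up shapes and a single gate_proj projection:

    import torch

    n_experts, rows, cols = 4, 8, 16
    # Made-up per-expert weights, keyed the same way the converter sees them.
    collected = {
        f"model.layers.0.mlp.experts.{xid}.gate_proj.weight": torch.randn(rows, cols)
        for xid in range(n_experts)
    }

    datas = [collected[f"model.layers.0.mlp.experts.{xid}.gate_proj.weight"]
             for xid in range(n_experts)]
    merged = torch.stack(datas, dim=0)  # one 3-D tensor per projection, experts on dim 0
    print(merged.shape)  # torch.Size([4, 8, 16])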
@@ -8684,6 +8885,11 @@ class NemotronHModel(GraniteHybridModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("LlamaBidirectionalModel")
+class LlamaEmbedNemotronModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA_EMBED
+
+
 @ModelBase.register("BailingMoeForCausalLM")
 class BailingMoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BAILINGMOE
@@ -9144,6 +9350,19 @@ class VoxtralWhisperEncoderModel(WhisperEncoderModel):
         self.gguf_writer.add_audio_stack_factor(4)  # == intermediate_size // hidden_size
 
 
+@ModelBase.register("AudioFlamingo3ForConditionalGeneration")
+class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:
+            # Was trained in BF16, being safe, avoiding quantizing to FP16
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+
 @ModelBase.register("FalconH1ForCausalLM")
 class FalconH1Model(Mamba2Model):
     model_arch = gguf.MODEL_ARCH.FALCON_H1
@@ -9991,6 +10210,35 @@ class SmallThinkerModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
+class ModernBertModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.MODERN_BERT
+
+    def set_vocab(self):
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+        self.gguf_writer.add_add_sep_token(True)
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
+        if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
+            self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # these layers act as MLM head, so we don't need them
+        if name.startswith("decoder."):
+            return []
+
+        if name.startswith("model."):
+            name = name[6:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("ApertusForCausalLM")
 class ApertusModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.APERTUS
@@ -10426,6 +10674,79 @@ class JanusProVisionModel(MmprojModel):
         return []
 
 
+@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
+class YOUTUVLVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.YOUTUVL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+
+        # Handle activation function
+        hidden_act = str(self.hparams.get("hidden_act", "gelu_pytorch_tanh")).lower()
+        if hidden_act in ("gelu", "gelu_pytorch_tanh", "gelu_fast", "gelu_new", "gelu_accurate"):
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+        else:
+            raise ValueError(f"Unsupported activation function for YOUTUVL: {hidden_act}")
+
+        self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
+
+        window_size = self.hparams.get("window_size")
+        if window_size is not None:
+            self.gguf_writer.add_vision_window_size(window_size)
+            # fullatt_block_indexes contains explicit layer indices that use full attention
+            # e.g., [2, 5, 8, 11] means layers 2, 5, 8, 11 use full attention
+            # All other layers use window attention
+            fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
+            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for youtuvl"
+            # Store the explicit layer indices for YoutuVL (irregular pattern approach)
+            self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Skip language model tensors
+        skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.')
+        if name.startswith(skip_prefixes):
+            return []
+
+        # Try to map the tensor using TensorNameMap (handles vision encoder and projector)
+        try:
+            new_name = self.map_tensor_name(name)
+            return [(new_name, data_torch)]
+        except ValueError:
+            # If mapping fails, log warning and skip
+            logger.warning(f"Cannot map tensor: {name}")
+            return []
+
+
+@ModelBase.register("SolarOpenForCausalLM")
+class SolarOpenModel(Glm4MoeModel):
+    model_arch = gguf.MODEL_ARCH.GLM4_MOE
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<unk>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|startoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+
 ###### CONVERSION LOGIC ######
 
 
@@ -10557,8 +10878,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",