xinference 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xinference might be problematic.

Files changed (328)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +473 -31
  3. xinference/client/restful/async_restful_client.py +178 -8
  4. xinference/client/restful/restful_client.py +151 -3
  5. xinference/core/supervisor.py +99 -53
  6. xinference/core/worker.py +10 -0
  7. xinference/deploy/cmdline.py +15 -0
  8. xinference/model/audio/core.py +21 -6
  9. xinference/model/audio/indextts2.py +166 -0
  10. xinference/model/audio/model_spec.json +58 -21
  11. xinference/model/image/model_spec.json +159 -90
  12. xinference/model/image/stable_diffusion/core.py +13 -4
  13. xinference/model/llm/__init__.py +6 -2
  14. xinference/model/llm/llm_family.json +1299 -174
  15. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  16. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  17. xinference/model/llm/sglang/core.py +44 -11
  18. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
  19. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  20. xinference/model/llm/transformers/chatglm.py +3 -0
  21. xinference/model/llm/transformers/core.py +129 -36
  22. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  23. xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
  24. xinference/model/llm/transformers/utils.py +23 -0
  25. xinference/model/llm/utils.py +48 -32
  26. xinference/model/llm/vllm/core.py +207 -72
  27. xinference/model/utils.py +74 -31
  28. xinference/thirdparty/audiotools/__init__.py +10 -0
  29. xinference/thirdparty/audiotools/core/__init__.py +4 -0
  30. xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
  31. xinference/thirdparty/audiotools/core/display.py +194 -0
  32. xinference/thirdparty/audiotools/core/dsp.py +390 -0
  33. xinference/thirdparty/audiotools/core/effects.py +647 -0
  34. xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
  35. xinference/thirdparty/audiotools/core/loudness.py +320 -0
  36. xinference/thirdparty/audiotools/core/playback.py +252 -0
  37. xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
  38. xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
  39. xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
  40. xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
  41. xinference/thirdparty/audiotools/core/util.py +671 -0
  42. xinference/thirdparty/audiotools/core/whisper.py +97 -0
  43. xinference/thirdparty/audiotools/data/__init__.py +3 -0
  44. xinference/thirdparty/audiotools/data/datasets.py +517 -0
  45. xinference/thirdparty/audiotools/data/preprocess.py +81 -0
  46. xinference/thirdparty/audiotools/data/transforms.py +1592 -0
  47. xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
  48. xinference/thirdparty/audiotools/metrics/distance.py +131 -0
  49. xinference/thirdparty/audiotools/metrics/quality.py +159 -0
  50. xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
  51. xinference/thirdparty/audiotools/ml/__init__.py +5 -0
  52. xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
  53. xinference/thirdparty/audiotools/ml/decorators.py +440 -0
  54. xinference/thirdparty/audiotools/ml/experiment.py +90 -0
  55. xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
  56. xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
  57. xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
  58. xinference/thirdparty/audiotools/post.py +140 -0
  59. xinference/thirdparty/audiotools/preference.py +600 -0
  60. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  61. xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
  62. xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
  63. xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
  64. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
  65. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
  66. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
  67. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
  68. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  69. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
  70. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
  71. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
  72. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
  73. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
  74. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
  75. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
  76. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
  77. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
  78. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
  79. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
  80. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
  81. xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
  82. xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
  83. xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
  84. xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
  85. xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
  86. xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
  87. xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
  88. xinference/thirdparty/indextts/__init__.py +0 -0
  89. xinference/thirdparty/indextts/cli.py +65 -0
  90. xinference/thirdparty/indextts/gpt/__init__.py +0 -0
  91. xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
  92. xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
  93. xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
  94. xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
  95. xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
  96. xinference/thirdparty/indextts/gpt/model.py +713 -0
  97. xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
  98. xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
  99. xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
  100. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
  101. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
  102. xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
  103. xinference/thirdparty/indextts/infer.py +690 -0
  104. xinference/thirdparty/indextts/infer_v2.py +739 -0
  105. xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
  106. xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
  107. xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
  108. xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
  109. xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
  110. xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
  111. xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
  112. xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
  113. xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
  114. xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
  115. xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
  116. xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
  117. xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
  118. xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
  119. xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
  120. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
  121. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
  122. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
  123. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
  124. xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
  125. xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
  126. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
  127. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
  128. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  129. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
  130. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
  131. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
  132. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
  133. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
  134. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
  135. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
  136. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
  137. xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
  138. xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
  139. xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
  140. xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
  141. xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
  142. xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
  143. xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
  144. xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
  145. xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
  146. xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
  147. xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
  148. xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
  149. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
  150. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
  151. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
  152. xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
  153. xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
  154. xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
  155. xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
  156. xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
  157. xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
  158. xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
  159. xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
  160. xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
  161. xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
  162. xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
  163. xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
  164. xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
  165. xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
  166. xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
  167. xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
  168. xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
  169. xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
  170. xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
  171. xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
  172. xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
  173. xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
  174. xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
  175. xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
  176. xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
  177. xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
  178. xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
  179. xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
  180. xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
  181. xinference/thirdparty/indextts/utils/__init__.py +0 -0
  182. xinference/thirdparty/indextts/utils/arch_util.py +120 -0
  183. xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
  184. xinference/thirdparty/indextts/utils/common.py +121 -0
  185. xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
  186. xinference/thirdparty/indextts/utils/front.py +536 -0
  187. xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
  188. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
  189. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
  190. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
  191. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
  192. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
  193. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
  194. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
  195. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
  196. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
  197. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
  198. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
  199. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
  200. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
  201. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
  202. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
  203. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
  204. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
  205. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
  206. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
  207. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
  208. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
  209. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
  210. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
  211. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
  212. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
  213. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
  214. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
  215. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
  216. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
  217. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
  218. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
  219. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
  220. xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
  221. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
  222. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
  223. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
  224. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
  225. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
  226. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
  227. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
  228. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
  229. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
  230. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
  231. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
  232. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
  233. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
  234. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
  235. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
  236. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
  237. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
  238. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
  239. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
  240. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
  241. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
  242. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
  243. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
  244. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
  245. xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
  246. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
  247. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
  248. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
  249. xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
  250. xinference/thirdparty/indextts/utils/text_utils.py +41 -0
  251. xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
  252. xinference/thirdparty/indextts/utils/utils.py +93 -0
  253. xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
  254. xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
  255. xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
  256. xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
  257. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  258. xinference/types.py +9 -0
  259. xinference/ui/gradio/media_interface.py +66 -8
  260. xinference/ui/web/ui/build/asset-manifest.json +6 -6
  261. xinference/ui/web/ui/build/index.html +1 -1
  262. xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
  263. xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
  264. xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
  265. xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
  266. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
  267. xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
  268. xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
  269. xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
  270. xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
  271. xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
  272. xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
  273. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
  274. xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
  275. xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
  276. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
  277. xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
  278. xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
  279. xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
  280. xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
  281. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  282. xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
  283. xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
  284. xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
  285. xinference/ui/web/ui/package-lock.json +0 -34
  286. xinference/ui/web/ui/package.json +0 -1
  287. xinference/ui/web/ui/src/locales/en.json +9 -3
  288. xinference/ui/web/ui/src/locales/ja.json +9 -3
  289. xinference/ui/web/ui/src/locales/ko.json +9 -3
  290. xinference/ui/web/ui/src/locales/zh.json +9 -3
  291. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
  292. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
  293. xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
  294. xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
  295. xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
  296. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
  297. xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
  298. xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
  299. xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
  300. xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
  301. xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
  302. xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
  303. xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
  304. xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
  305. xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
  306. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
  307. xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
  308. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
  309. xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
  310. xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
  311. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
  312. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
  313. xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
  314. xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
  315. xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
  316. xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
  317. xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
  318. xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
  319. xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
  320. xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
  321. xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
  322. xinference/ui/web/ui/node_modules/select/bower.json +0 -13
  323. xinference/ui/web/ui/node_modules/select/package.json +0 -29
  324. xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
  325. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
  326. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
  327. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
  328. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
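
Among the 328 changed files, the most user-visible additions are the new IndexTTS2 audio model (xinference/model/audio/indextts2.py, backed by the vendored xinference/thirdparty/indextts and audiotools trees) and the new MiniCPM-V-4.5 multimodal model (xinference/model/llm/transformers/multimodal/minicpmv45.py, shown in full in the first hunk below). A minimal client-side sketch of launching the new multimodal model follows; the endpoint URL is a placeholder, the model name is read off the registration decorators in the diff, and the exact launch_model arguments may differ by deployment:

# Hedged sketch: launch the newly added MiniCPM-V-4.5 model via the
# xinference RESTful client. URL, engine, and registered model name are
# assumptions taken from the diff's registration decorators.
from xinference.client import RESTfulClient

client = RESTfulClient("http://127.0.0.1:9997")  # assumed local endpoint
model_uid = client.launch_model(
    model_name="MiniCPM-V-4.5",
    model_type="LLM",
    model_engine="transformers",
)
model = client.get_model(model_uid)
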
xinference/model/llm/transformers/multimodal/minicpmv45.py (new file)
@@ -0,0 +1,340 @@
+# Copyright 2022-2025 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import torch
+from PIL import Image
+
+from .....core.model import register_batching_multimodal_models
+from .....model.utils import select_device
+from .....types import PytorchModelConfig
+from ....scheduler.request import InferenceRequest
+from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
+from ...utils import _decode_image, parse_messages
+from ..core import register_non_default_model
+from .core import PytorchMultiModalModel
+
+logger = logging.getLogger(__name__)
+
+
+@register_batching_multimodal_models("MiniCPM-V-4.5")
+@register_transformer
+@register_non_default_model("MiniCPM-V-4.5")
+class MiniCPMV45Model(PytorchMultiModalModel):
+    @classmethod
+    def match_json(
+        cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
+    ) -> bool:
+        family = model_family.model_family or model_family.model_name
+        if "MiniCPM-V-4.5".lower() in family.lower():
+            return True
+        return False
+
+    def _sanitize_model_config(
+        self, pytorch_model_config: Optional[PytorchModelConfig]
+    ) -> PytorchModelConfig:
+        pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
+        assert pytorch_model_config is not None
+        # Configure pixel parameters for MiniCPM-V-4.5
+        pytorch_model_config.setdefault("min_pixels", 256 * 28 * 28)
+        pytorch_model_config.setdefault("max_pixels", 1280 * 28 * 28)
+        return pytorch_model_config
+
+    def decide_device(self):
+        device = self._pytorch_model_config.get("device", "auto")
+        self._device = select_device(device)
+        self._device = (
+            "auto"
+            if self._device == "cuda" and self.quantization is None
+            else self._device
+        )
+
+    def load_processor(self):
+        from transformers import AutoProcessor, AutoTokenizer
+
+        min_pixels = self._pytorch_model_config.get("min_pixels")
+        max_pixels = self._pytorch_model_config.get("max_pixels")
+        self._processor = AutoProcessor.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+        self._tokenizer = AutoTokenizer.from_pretrained(
+            self.model_path, trust_remote_code=True
+        )
+
+    def load_multimodal_model(self):
+        from transformers import AutoModel
+        from transformers.generation import GenerationConfig
+
+        if "int4" in self.model_path:
+            model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
+        else:
+            kwargs = self.apply_bnb_quantization()
+            model = AutoModel.from_pretrained(
+                self.model_path,
+                trust_remote_code=True,
+                torch_dtype=torch.float16,
+                device_map=self._device,
+                **kwargs,
+            )
+        self._model = model.eval()
+        # Specify hyperparameters for generation
+        self._model.generation_config = GenerationConfig.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+        )
+        self._device = self._model.device
+
+    def _message_content_to_chat(self, content):
+        MAX_NUM_FRAMES = 64
+
+        def encode_video(video_path):
+            from decord import VideoReader, cpu
+
+            def uniform_sample(l, n):
+                gap = len(l) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [l[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+            frame_idx = [i for i in range(0, len(vr), sample_fps)]
+            if len(frame_idx) > MAX_NUM_FRAMES:
+                frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+            frames = vr.get_batch(frame_idx).asnumpy()
+            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+            logger.info(
+                f"Num frames: {len(frames)} when decoding video for {self.model_uid}"
+            )
+            return frames
+
+        def _load_video(_url):
+            frames = None
+            if _url.startswith("data:"):
+                raise RuntimeError("Only video url format is supported")
+            else:
+                frames = encode_video(_url)
+            return frames
+
+        if not isinstance(content, str):
+            texts = []
+            image_urls = []
+            video_urls = []
+            for c in content:
+                c_type = c.get("type")
+                if c_type == "text":
+                    texts.append(c["text"])
+                elif c_type == "image_url":
+                    image_urls.append(c["image_url"]["url"])
+                elif c_type == "video_url":
+                    video_urls.append(c["video_url"]["url"])
+            image_futures = []
+            with ThreadPoolExecutor() as executor:
+                for image_url in image_urls:
+                    fut = executor.submit(_decode_image, image_url)
+                    image_futures.append(fut)
+            images = [fut.result() for fut in image_futures]
+            frames = []
+            if len(video_urls) > 1:
+                raise RuntimeError("Only one video per message is supported")
+            for v in video_urls:
+                frames = _load_video(v)
+            text = " ".join(texts)
+            return text, images, frames
+        return content, [], []
+
+    def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
+        video_existed = False
+        prompt, _, chat_history = parse_messages(messages)
+
+        content, images_chat, video_frames = self._message_content_to_chat(prompt)
+        if len(video_frames) > 0:
+            video_existed = True
+            images_chat = video_frames
+
+        msgs = []
+        query_to_response: List[Dict] = []
+        for h in chat_history or []:
+            images_history = []
+            role = h["role"]
+            content_h, images_tmp, video_frames_h = self._message_content_to_chat(
+                h["content"]
+            )
+            if images_tmp != []:
+                images_history = images_tmp
+            if len(video_frames_h) > 0:
+                video_existed = True
+                images_history = video_frames_h
+            if len(query_to_response) == 0 and role == "user":
+                query_to_response.append(
+                    {"role": "user", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 1 and role == "assistant":
+                query_to_response.append(
+                    {"role": "assistant", "content": images_history + [content_h]}
+                )
+            if len(query_to_response) == 2:
+                msgs.extend(query_to_response)
+                query_to_response = []
+        msgs.append({"role": "user", "content": images_chat + [content]})
+        return msgs, video_existed
+
+    def build_inputs_from_messages(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ):
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        # Set decode params for video
+        params = {}
+        if video_existed:
+            params = {"use_image_id": False, "max_slice_nums": 1}
+        return dict(msgs=msgs, image=None, **params)
+
+    def build_generate_kwargs(
+        self,
+        generate_config: Dict,
+    ) -> Dict[str, Any]:
+        return dict(**generate_config)
+
+    def build_streaming_iter(
+        self,
+        messages: List[Dict],
+        generate_config: Dict,
+    ) -> Tuple[Iterator, int]:
+        inputs = self.build_inputs_from_messages(messages, generate_config)
+        config = self.build_generate_kwargs(generate_config)
+        chat_iter = self._model.chat(
+            **inputs, **config, tokenizer=self._tokenizer, sampling=True
+        )
+
+        return chat_iter, -1
+
+    def prepare_sanitize_generate_config(self, req: InferenceRequest):
+        """
+        Refer to MiniCPM-V-4.5 documentation for generation parameters
+        """
+        raw_config = req.inference_kwargs.get("raw_params", {})
+        temperature = raw_config.get("temperature", None)
+        if temperature is None:
+            raw_config["temperature"] = 0.7
+        top_p = raw_config.get("top_p", None)
+        if top_p is None:
+            raw_config["top_p"] = 0.8
+        top_k = raw_config.get("top_k", None)
+        if top_k is None:
+            raw_config["top_k"] = 100
+        repetition_penalty = raw_config.get("repetition_penalty", None)
+        if repetition_penalty is None:
+            raw_config["repetition_penalty"] = 1.05
+        return raw_config
+
+    def _handle_input_ids_and_images(self, msgs: List[Dict]) -> Dict:
+        """
+        Handle input IDs and images for MiniCPM-V-4.5
+        Based on MiniCPM-V-2.6 implementation with adaptations for 4.5
+        """
+        from copy import deepcopy
+
+        copy_msgs = deepcopy(msgs)
+
+        images = []
+        for i, msg in enumerate(copy_msgs):
+            role = msg["role"]
+            content = msg["content"]
+            assert role in ["user", "assistant"]
+            if i == 0:
+                assert role == "user", "The role of first msg should be user"
+            if isinstance(content, str):
+                content = [content]
+            cur_msgs = []
+            for c in content:
+                if isinstance(c, Image.Image):
+                    images.append(c)
+                    cur_msgs.append("(<image>./</image>)")
+                elif isinstance(c, str):
+                    cur_msgs.append(c)
+            msg["content"] = "\n".join(cur_msgs)
+
+        return {
+            "prompt": self._processor.tokenizer.apply_chat_template(
+                copy_msgs, tokenize=False, add_generation_prompt=True
+            ),
+            "input_image": images,
+        }
+
+    def _get_full_prompt(self, messages: List[Dict], tools, generate_config: dict):  # type: ignore
+        msgs, video_existed = self._convert_to_specific_style(messages)
+        if video_existed:
+            raise RuntimeError(
+                f"Continuous batching does not support video inputs for this model: {self.model_uid}"
+            )
+        return self._handle_input_ids_and_images(msgs)
+
+    def build_prefill_kwargs(self, prompts: List, req_list: List[InferenceRequest]):
+        prompts_lists = [x["prompt"] for x in prompts]
+        input_images_lists = [x["input_image"] for x in prompts]
+        inputs = self._processor(
+            prompts_lists,
+            input_images_lists,
+            max_slice_nums=None,
+            use_image_id=None,
+            return_tensors="pt",
+            max_length=8192,
+        ).to(self._model.device)
+        inputs.pop("image_sizes")
+
+        masked_input_ids = inputs["input_ids"] * inputs["attention_mask"]
+        for i in range(masked_input_ids.shape[0]):
+            non_zero_values = masked_input_ids[i][masked_input_ids[i] != 0].tolist()
+            req_list[i].prompt_tokens = non_zero_values
+            req_list[i].extra_kwargs["attention_mask_seq_len"] = len(non_zero_values)
+            req_list[i].padding_len = masked_input_ids.shape[1] - len(non_zero_values)
+
+        model_inputs = {
+            "input_ids": inputs["input_ids"],
+            "image_bound": inputs["image_bound"],
+            "pixel_values": inputs["pixel_values"],
+            "tgt_sizes": inputs["tgt_sizes"],
+        }
+        model_inputs["inputs_embeds"], _ = self._model.get_vllm_embedding(model_inputs)
+
+        return {
+            "inputs_embeds": model_inputs["inputs_embeds"],
+            "attention_mask": inputs["attention_mask"],
+        }
+
+    def build_decode_position_ids(
+        self, batch_size: int, seq_length: int, reqs: List[InferenceRequest]
+    ):
+        return None
+
+    def batch_inference(self, req_list: List[InferenceRequest]):
+        """
+        This method is rewritten
+        because the specific inference process is performed by `self._model.llm`,
+        not `self._model` itself
+        """
+        from ..utils import batch_inference_one_step
+
+        self.prepare_batch_inference(req_list)
+        batch_inference_one_step(
+            self, req_list, self.model_uid, self._model.llm, self._tokenizer
+        )
+        self.handle_batch_inference_results(req_list)
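
As the new file shows, `_message_content_to_chat` accepts OpenAI-style content parts of type `text`, `image_url`, and `video_url`, allows at most one video per message, and rejects data-URI videos. A hedged sketch of a chat call exercising the image path, reusing the `model` handle from the launch sketch above (the image URL is a placeholder):

# Hedged sketch: send a mixed text+image message; the message shape mirrors
# what _message_content_to_chat parses above.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/cat.png"},  # placeholder
            },
        ],
    }
]
response = model.chat(messages=messages, generate_config={"max_tokens": 512})
print(response["choices"][0]["message"]["content"])
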
xinference/model/llm/transformers/multimodal/qwen2_vl.py
@@ -27,11 +27,19 @@ logger = logging.getLogger(__name__)
 
 
 @register_batching_multimodal_models(
-    "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 @register_transformer
 @register_non_default_model(
-    "qwen2-vl-instruct", "qwen2.5-vl-instruct", "QvQ-72B-Preview"
+    "qwen2-vl-instruct",
+    "qwen2.5-vl-instruct",
+    "QvQ-72B-Preview",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
 )
 class Qwen2VLChatModel(PytorchMultiModalModel):
     def _sanitize_model_config(
@@ -47,7 +55,7 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
     def match_json(
         cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb"]:
+        if model_spec.model_format not in ["pytorch", "gptq", "awq", "bnb", "fp8"]:
             return False
         llm_family = model_family.model_family or model_family.model_name
         if "qwen2-vl-instruct".lower() in llm_family.lower():
@@ -56,6 +64,8 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
             return True
         if "qvq-72b-preview".lower() in llm_family.lower():
             return True
+        if "qwen3-vl" in llm_family.lower():
+            return True
         return False
 
     def decide_device(self):
@@ -85,13 +95,19 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
         except ImportError:
             Qwen2_5_VLForConditionalGeneration = None
 
+        try:
+            from transformers import AutoModelForImageTextToText
+        except ImportError:
+            AutoModelForImageTextToText = None
+
         kwargs = self.apply_bnb_quantization()
         llm_family = self.model_family.model_family or self.model_family.model_name
-        model_cls = (
-            Qwen2_5_VLForConditionalGeneration
-            if "qwen2.5" in llm_family
-            else Qwen2VLForConditionalGeneration
-        )
+        if "qwen2.5" in llm_family:
+            model_cls = Qwen2_5_VLForConditionalGeneration
+        elif "qwen3" in llm_family:
+            model_cls = AutoModelForImageTextToText
+        else:
+            model_cls = Qwen2VLForConditionalGeneration
         if model_cls is None:
             raise ImportError("`transformers` version is too old, please upgrade it")
         device = "auto" if self._device == "cuda" else self._device
@@ -118,6 +134,16 @@ class Qwen2VLChatModel(PytorchMultiModalModel):
                 torch_dtype="float16",
                 **kwargs,
             ).eval()
+        elif device == "mps":
+            # MacOS special, see https://github.com/QwenLM/Qwen2.5-VL/issues/761
+            self._model = model_cls.from_pretrained(
+                self.model_path,
+                torch_dtype="bfloat16",
+                device_map=device,
+                attn_implementation="eager",
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            ).eval()
         else:
             self._model = model_cls.from_pretrained(
                 self.model_path,
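
The new `mps` branch in `load_multimodal_model` applies a macOS-specific workaround (QwenLM/Qwen2.5-VL issue 761): bfloat16 weights, eager attention, and low CPU memory usage. A standalone sketch of the same loading pattern outside xinference, with the checkpoint path as a placeholder:

# Hedged sketch: load a Qwen2.5-VL checkpoint on Apple silicon with the same
# settings the diff's mps branch uses. The local path is a placeholder.
from transformers import Qwen2_5_VLForConditionalGeneration

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "/path/to/Qwen2.5-VL-7B-Instruct",  # placeholder checkpoint path
    torch_dtype="bfloat16",             # bf16 rather than fp16 on MPS
    device_map="mps",
    attn_implementation="eager",        # avoid sdpa/flash kernels on MPS
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval()
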
xinference/model/llm/transformers/utils.py
@@ -281,11 +281,34 @@ def _batch_inference_one_step_internal(
             r.append_new_token(token)
 
     if decode_reqs:
+        # Ensure all decode requests have the same kv_cache reference
+        # This prevents batch size mismatches during merging
         decode_kv = decode_reqs[0].kv_cache
+
+        # Verify that all decode requests share the same kv_cache
+        for req in decode_reqs[1:]:
+            if req.kv_cache is not decode_kv:
+                logger.warning(
+                    "Inconsistent kv_cache references detected in decode requests. "
+                    "This may indicate a batching synchronization issue."
+                )
+                # Use the first decode_kv as the reference to maintain consistency
+                req.kv_cache = decode_kv
+
         # prefill and decode kv cache need to be merged at `batch_size` and `seq_len` dimensions.
         merged_kv_cache = xinf_model_obj.merge_kv_cache(decode_kv, past_key_values)
+        # Update sequence length information after KV cache merge
+        _, merged_seq_len = get_batch_size_and_seq_len_from_kv_cache(
+            merged_kv_cache, xinf_model_obj
+        )
         for r in valid_req_list:
             r.kv_cache = merged_kv_cache
+            # Update attention mask sequence length to match merged KV cache
+            if "attention_mask_seq_len" in r.extra_kwargs:
+                # Ensure the attention mask length doesn't exceed the merged sequence length
+                r.extra_kwargs["attention_mask_seq_len"] = min(
+                    r.extra_kwargs["attention_mask_seq_len"], merged_seq_len - 1
+                )
         empty_cache()
     else:
         for r in valid_req_list:
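
The hunk above hardens KV-cache merging for continuous batching: all decode requests are forced onto one shared cache reference, and each request's `attention_mask_seq_len` is clamped after the merge. A toy illustration of the clamp invariant, with hypothetical names and a `[batch, heads, seq_len, head_dim]` cache layout assumed:

# Toy sketch of the invariant enforced above: after a prefill cache and a
# decode cache are merged along batch/seq_len, a request's attention-mask
# length must not exceed the merged sequence length minus one.
import torch

def merged_seq_len(decode_k: torch.Tensor, prefill_k: torch.Tensor) -> int:
    # hypothetical layout: [batch, heads, seq_len, head_dim]
    return max(decode_k.shape[2], prefill_k.shape[2])

decode_k = torch.zeros(2, 8, 40, 64)   # two running requests, 40 cached steps
prefill_k = torch.zeros(1, 8, 25, 64)  # one new request, 25 prompt tokens
seq_len = merged_seq_len(decode_k, prefill_k)

attention_mask_seq_len = 41  # stale value from before the merge
attention_mask_seq_len = min(attention_mask_seq_len, seq_len - 1)
assert attention_mask_seq_len == 39
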
xinference/model/llm/utils.py
@@ -71,6 +71,12 @@ QWEN_TOOL_CALL_FAMILY = [
     "Qwen3-Thinking",
     "Qwen3-Instruct",
     "Qwen3-Coder",
+    "Qwen3-VL-Instruct",
+    "Qwen3-VL-Thinking",
+    "Qwen3-Next-Instruct",
+    "Qwen3-Next-Thinking",
+    "Qwen3-Omni-Instruct",
+    "Qwen3-Omni-Thinking",
 ]
 
 GLM4_TOOL_CALL_FAMILY = [
@@ -96,7 +102,6 @@ QWEN_TOOL_CALL_SYMBOLS = ["<tool_call>", "</tool_call>"]
 
 
 class ChatModelMixin:
-
     def __init__(self):
         self.model_family = None
         self.model_uid = None
@@ -139,7 +144,7 @@ class ChatModelMixin:
         tokenize=False,
         **kwargs,
     ):
-        if "vision" not in self.model_family.model_ability:  # type: ignore
+        if "vision" not in self.model_family.model_ability and "audio" not in self.model_family.model_ability:  # type: ignore
             messages = self.convert_messages_with_content_list_to_str_conversion(
                 messages
             )
@@ -182,8 +187,7 @@ class ChatModelMixin:
                 return kwargs
             else:
                 raise TypeError(
-                    f"`chat_template_kwargs` but be a JSON parsable str "
-                    f"or dict, got: {kwargs}"
+                    f"`chat_template_kwargs` but be a JSON parsable str or dict, got: {kwargs}"
                 )
         elif reasoning_parser and not reasoning_parser.enable_thinking:
             # hybrid model like qwen3,
@@ -347,9 +351,7 @@ class ChatModelMixin:
         assert choices is not None
         usage = (
             chunk["usage"]
-            if choices[0]["finish_reason"] is not None
-            and reasoning_parser
-            and reasoning_parser.check_content_parser()
+            if choices and choices[0]["finish_reason"] is not None or not choices
             else None
         )
         chat_chunk = {
@@ -798,7 +800,11 @@
         chunk_id=None,
         previous_texts: List[str] = [""],
     ):
+        if not c.get("choices"):
+            return c
         _id = chunk_id if chunk_id is not None else str(uuid.uuid4())
+        tool_result = None
+        finish_reason = None
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls_streaming(
                 [],
@@ -847,15 +853,11 @@
                 "tool_calls": tool_calls,
             }
 
-        try:
+        # For tool completion chunks, use None for usage, actual values for stop
+        if finish_reason == "tool_calls":
+            usage = None
+        else:
             usage = c.get("usage")
-            assert "prompt_tokens" in usage
-        except Exception:
-            usage = {
-                "prompt_tokens": -1,
-                "completion_tokens": -1,
-                "total_tokens": -1,
-            }
         return {
             "id": "chat" + f"cmpl-{_id}",
             "model": model_uid,
@@ -880,25 +882,32 @@
     ):
         if not self.tool_parser:
             return self._get_final_chat_completion_chunk(c)
-        if self.reasoning_parser:
-            c = self.reasoning_parser.prepare_reasoning_content(c)
+
         _id = str(uuid.uuid4())
         reasoning_content = None
+        content = ""
+
+        # First, process reasoning content if reasoning parser exists
+        text = c["choices"][0]["text"]
         if self.reasoning_parser and self.reasoning_parser.check_content_parser():
-            text = c["choices"][0]["text"]
-            reasoning_content, content = (
+            # Extract reasoning content directly from the original text
+            reasoning_content, processed_content = (
                 self.reasoning_parser.extract_reasoning_content(text)
            )
-            c["choices"][0]["text"] = content
+            # Use the processed content (without thinking tags) for tool parsing
+            if processed_content:
+                text = processed_content
 
+        # Then, extract tool calls from the processed text (without thinking tags)
         tool_calls = []
         failed_contents = []
         if isinstance(self.tool_parser, Glm4ToolParser):
             tool_result = self.tool_parser.extract_tool_calls(c)
         else:
-            text = c["choices"][0]["text"]
             tool_result = self.tool_parser.extract_tool_calls(text)
-        for content, func, args in tool_result:
+
+        # Process tool results
+        for tool_content, func, args in tool_result:
             if func:
                 tool_calls.append(
                     {
@@ -911,25 +920,31 @@
                     }
                 )
             else:
-                if content:
-                    failed_contents.append(content)
-        finish_reason = "tool_calls" if tool_calls else "stop"
+                if tool_content:
+                    failed_contents.append(tool_content)
 
-        content = "".join(failed_contents) if failed_contents else None
+        # Determine the final content
+        if tool_calls:
+            # For tool calls, the main content should be empty or contain only non-tool parts
+            content = "".join(failed_contents) if failed_contents else ""
+        else:
+            # For non-tool calls, use the processed content from reasoning parser
+            content = text
+
+        finish_reason = "tool_calls" if tool_calls else "stop"
 
         m = {
             "role": "assistant",
-            "content": content if content else "",
+            "content": content,
             "tool_calls": tool_calls,
         }
         # add only reasoning_content is None
         if reasoning_content is not None:
             m["reasoning_content"] = reasoning_content
 
-        try:
-            usage = c.get("usage")
-            assert "prompt_tokens" in usage
-        except Exception:
+        # For tool completion chunks, use actual usage values when available
+        usage = c.get("usage")
+        if not usage or not isinstance(usage, dict) or "prompt_tokens" not in usage:
             usage = {
                 "prompt_tokens": -1,
                 "completion_tokens": -1,
@@ -1009,7 +1024,8 @@
             completion_chunk, self.reasoning_parser, previous_texts
         )
         if (
-            "reasoning_content" in chat_chunk["choices"][0]["delta"]
+            chat_chunk["choices"]
+            and "reasoning_content" in chat_chunk["choices"][0]["delta"]
             and chat_chunk["choices"][0]["delta"]["reasoning_content"] is not None
         ):
             yield chat_chunk
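
Taken together, the xinference/model/llm/utils.py hunks make streamed chunks safer to consume: `choices` may be empty, a delta may carry `reasoning_content`, `finish_reason` may be "tool_calls", and `usage` is only reliable on terminal chunks. A hedged consumer sketch against xinference's OpenAI-compatible endpoint (base URL and model uid are placeholders; `reasoning_content` is a non-standard field, so it is read defensively):

# Hedged sketch: consume a stream with the same guards the diff adds.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:9997/v1", api_key="not-needed")
stream = client.chat.completions.create(
    model="my-model-uid",  # placeholder
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    stream=True,
)
for chunk in stream:
    if not chunk.choices:  # usage-only or keep-alive chunk
        continue
    delta = chunk.choices[0].delta
    if getattr(delta, "reasoning_content", None):  # non-standard field
        print("[thinking]", delta.reasoning_content)
    if delta.content:
        print(delta.content, end="")
    if delta.tool_calls:
        print("tool call delta:", delta.tool_calls)
    if chunk.choices[0].finish_reason == "tool_calls":
        print("\n[finished with tool calls]")
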