xinference-1.10.0-py3-none-any.whl → xinference-1.11.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of xinference might be problematic.

Files changed (328)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +473 -31
  3. xinference/client/restful/async_restful_client.py +178 -8
  4. xinference/client/restful/restful_client.py +151 -3
  5. xinference/core/supervisor.py +99 -53
  6. xinference/core/worker.py +10 -0
  7. xinference/deploy/cmdline.py +15 -0
  8. xinference/model/audio/core.py +21 -6
  9. xinference/model/audio/indextts2.py +166 -0
  10. xinference/model/audio/model_spec.json +58 -21
  11. xinference/model/image/model_spec.json +159 -90
  12. xinference/model/image/stable_diffusion/core.py +13 -4
  13. xinference/model/llm/__init__.py +6 -2
  14. xinference/model/llm/llm_family.json +1299 -174
  15. xinference/model/llm/mlx/distributed_models/core.py +41 -0
  16. xinference/model/llm/mlx/distributed_models/qwen2.py +1 -2
  17. xinference/model/llm/sglang/core.py +44 -11
  18. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
  19. xinference/model/llm/tool_parsers/qwen_tool_parser.py +29 -4
  20. xinference/model/llm/transformers/chatglm.py +3 -0
  21. xinference/model/llm/transformers/core.py +129 -36
  22. xinference/model/llm/transformers/multimodal/minicpmv45.py +340 -0
  23. xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
  24. xinference/model/llm/transformers/utils.py +23 -0
  25. xinference/model/llm/utils.py +48 -32
  26. xinference/model/llm/vllm/core.py +207 -72
  27. xinference/model/utils.py +74 -31
  28. xinference/thirdparty/audiotools/__init__.py +10 -0
  29. xinference/thirdparty/audiotools/core/__init__.py +4 -0
  30. xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
  31. xinference/thirdparty/audiotools/core/display.py +194 -0
  32. xinference/thirdparty/audiotools/core/dsp.py +390 -0
  33. xinference/thirdparty/audiotools/core/effects.py +647 -0
  34. xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
  35. xinference/thirdparty/audiotools/core/loudness.py +320 -0
  36. xinference/thirdparty/audiotools/core/playback.py +252 -0
  37. xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
  38. xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
  39. xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
  40. xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
  41. xinference/thirdparty/audiotools/core/util.py +671 -0
  42. xinference/thirdparty/audiotools/core/whisper.py +97 -0
  43. xinference/thirdparty/audiotools/data/__init__.py +3 -0
  44. xinference/thirdparty/audiotools/data/datasets.py +517 -0
  45. xinference/thirdparty/audiotools/data/preprocess.py +81 -0
  46. xinference/thirdparty/audiotools/data/transforms.py +1592 -0
  47. xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
  48. xinference/thirdparty/audiotools/metrics/distance.py +131 -0
  49. xinference/thirdparty/audiotools/metrics/quality.py +159 -0
  50. xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
  51. xinference/thirdparty/audiotools/ml/__init__.py +5 -0
  52. xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
  53. xinference/thirdparty/audiotools/ml/decorators.py +440 -0
  54. xinference/thirdparty/audiotools/ml/experiment.py +90 -0
  55. xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
  56. xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
  57. xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
  58. xinference/thirdparty/audiotools/post.py +140 -0
  59. xinference/thirdparty/audiotools/preference.py +600 -0
  60. xinference/thirdparty/fish_speech/fish_speech/text/chn_text_norm/text.py +1 -1
  61. xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
  62. xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
  63. xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
  64. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
  65. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
  66. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
  67. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
  68. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  69. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
  70. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
  71. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
  72. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
  73. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
  74. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
  75. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
  76. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
  77. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
  78. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
  79. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
  80. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
  81. xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
  82. xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
  83. xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
  84. xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
  85. xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
  86. xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
  87. xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
  88. xinference/thirdparty/indextts/__init__.py +0 -0
  89. xinference/thirdparty/indextts/cli.py +65 -0
  90. xinference/thirdparty/indextts/gpt/__init__.py +0 -0
  91. xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
  92. xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
  93. xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
  94. xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
  95. xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
  96. xinference/thirdparty/indextts/gpt/model.py +713 -0
  97. xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
  98. xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
  99. xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
  100. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
  101. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
  102. xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
  103. xinference/thirdparty/indextts/infer.py +690 -0
  104. xinference/thirdparty/indextts/infer_v2.py +739 -0
  105. xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
  106. xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
  107. xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
  108. xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
  109. xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
  110. xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
  111. xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
  112. xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
  113. xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
  114. xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
  115. xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
  116. xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
  117. xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
  118. xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
  119. xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
  120. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
  121. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
  122. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
  123. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
  124. xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
  125. xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
  126. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
  127. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
  128. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  129. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
  130. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
  131. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
  132. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
  133. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
  134. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
  135. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
  136. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
  137. xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
  138. xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
  139. xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
  140. xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
  141. xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
  142. xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
  143. xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
  144. xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
  145. xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
  146. xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
  147. xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
  148. xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
  149. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
  150. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
  151. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
  152. xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
  153. xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
  154. xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
  155. xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
  156. xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
  157. xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
  158. xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
  159. xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
  160. xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
  161. xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
  162. xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
  163. xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
  164. xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
  165. xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
  166. xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
  167. xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
  168. xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
  169. xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
  170. xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
  171. xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
  172. xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
  173. xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
  174. xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
  175. xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
  176. xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
  177. xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
  178. xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
  179. xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
  180. xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
  181. xinference/thirdparty/indextts/utils/__init__.py +0 -0
  182. xinference/thirdparty/indextts/utils/arch_util.py +120 -0
  183. xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
  184. xinference/thirdparty/indextts/utils/common.py +121 -0
  185. xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
  186. xinference/thirdparty/indextts/utils/front.py +536 -0
  187. xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
  188. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
  189. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
  190. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
  191. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
  192. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
  193. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
  194. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
  195. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
  196. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
  197. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
  198. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
  199. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
  200. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
  201. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
  202. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
  203. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
  204. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
  205. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
  206. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
  207. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
  208. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
  209. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
  210. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
  211. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
  212. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
  213. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
  214. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
  215. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
  216. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
  217. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
  218. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
  219. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
  220. xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
  221. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
  222. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
  223. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
  224. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
  225. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
  226. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
  227. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
  228. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
  229. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
  230. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
  231. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
  232. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
  233. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
  234. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
  235. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
  236. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
  237. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
  238. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
  239. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
  240. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
  241. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
  242. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
  243. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
  244. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
  245. xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
  246. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
  247. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
  248. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
  249. xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
  250. xinference/thirdparty/indextts/utils/text_utils.py +41 -0
  251. xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
  252. xinference/thirdparty/indextts/utils/utils.py +93 -0
  253. xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
  254. xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
  255. xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
  256. xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
  257. xinference/thirdparty/melo/text/chinese_mix.py +2 -2
  258. xinference/types.py +9 -0
  259. xinference/ui/gradio/media_interface.py +66 -8
  260. xinference/ui/web/ui/build/asset-manifest.json +6 -6
  261. xinference/ui/web/ui/build/index.html +1 -1
  262. xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
  263. xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
  264. xinference/ui/web/ui/build/static/js/main.45e78536.js +3 -0
  265. xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.45e78536.js.LICENSE.txt} +0 -7
  266. xinference/ui/web/ui/build/static/js/main.45e78536.js.map +1 -0
  267. xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
  268. xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
  269. xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
  270. xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
  271. xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
  272. xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
  273. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
  274. xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
  275. xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
  276. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
  277. xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
  278. xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
  279. xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
  280. xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
  281. xinference/ui/web/ui/node_modules/.cache/babel-loader/ea2a26361204e70cf1018d6990fb6354bed82b3ac69690391e0f100385e7abb7.json +1 -0
  282. xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
  283. xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
  284. xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
  285. xinference/ui/web/ui/package-lock.json +0 -34
  286. xinference/ui/web/ui/package.json +0 -1
  287. xinference/ui/web/ui/src/locales/en.json +9 -3
  288. xinference/ui/web/ui/src/locales/ja.json +9 -3
  289. xinference/ui/web/ui/src/locales/ko.json +9 -3
  290. xinference/ui/web/ui/src/locales/zh.json +9 -3
  291. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/METADATA +24 -6
  292. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/RECORD +296 -77
  293. xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
  294. xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
  295. xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
  296. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
  297. xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
  298. xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
  299. xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
  300. xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
  301. xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
  302. xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
  303. xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
  304. xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
  305. xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
  306. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
  307. xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
  308. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
  309. xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
  310. xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
  311. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
  312. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
  313. xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
  314. xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
  315. xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
  316. xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
  317. xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
  318. xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
  319. xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
  320. xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
  321. xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
  322. xinference/ui/web/ui/node_modules/select/bower.json +0 -13
  323. xinference/ui/web/ui/node_modules/select/package.json +0 -29
  324. xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
  325. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/WHEEL +0 -0
  326. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/entry_points.txt +0 -0
  327. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/licenses/LICENSE +0 -0
  328. {xinference-1.10.0.dist-info → xinference-1.11.0.dist-info}/top_level.txt +0 -0
@@ -162,3 +162,44 @@ class DistributedModelMixin:
162
162
  self.layers = self.layers[: self.end_idx]
163
163
  self.layers[: self.start_idx] = [None] * self.start_idx
164
164
  self.num_layers = len(self.layers) - self.start_idx
165
+
166
+
167
+ class SafeKVCache:
168
+ """
169
+ A safe wrapper around mlx_lm's KVCache that handles None keys gracefully.
170
+ This is needed because mlx_lm's generate function accesses cache.state
171
+ before the cache is properly initialized.
172
+ """
173
+
174
+ def __init__(self):
175
+ from mlx_lm.models.cache import KVCache
176
+
177
+ self._cache = KVCache()
178
+
179
+ @property
180
+ def state(self):
181
+ # Safe access to state property
182
+ if self._cache.keys is None:
183
+ return None, None
184
+ if self._cache.offset == self._cache.keys.shape[2]:
185
+ return self._cache.keys, self._cache.values
186
+ else:
187
+ return (
188
+ self._cache.keys[..., : self._cache.offset, :],
189
+ self._cache.values[..., : self._cache.offset, :],
190
+ )
191
+
192
+ @state.setter
193
+ def state(self, v):
194
+ # Safe setter for state property
195
+ if v is None or v[0] is None:
196
+ self._cache.keys = None
197
+ self._cache.values = None
198
+ self._cache.offset = 0
199
+ else:
200
+ self._cache.keys, self._cache.values = v
201
+ self._cache.offset = self._cache.keys.shape[2]
202
+
203
+ def __getattr__(self, name):
204
+ # Delegate all other attributes and methods to the underlying cache
205
+ return getattr(self._cache, name)
@@ -46,11 +46,10 @@ class Qwen2Model(_Qwen2Model, DistributedModelMixin):
46
46
 
47
47
  pipeline_rank = self.rank
48
48
  pipeline_size = self.world_size
49
- if mask is None:
50
- mask = create_attention_mask(h, cache)
51
49
 
52
50
  if cache is None:
53
51
  cache = [None] * self.num_layers
52
+ mask = create_attention_mask(h, cache[0])
54
53
 
55
54
  # Receive from the previous process in the pipeline
56
55
 
@@ -73,6 +73,7 @@ class SGLANGGenerateConfig(TypedDict, total=False):
73
73
  stream: bool
74
74
  stream_options: Optional[Union[dict, None]]
75
75
  json_schema: Optional[dict]
76
+ response_format: dict
76
77
 
77
78
 
78
79
  try:
@@ -317,13 +318,16 @@ class SGLANGModel(LLM):
317
318
  stream_options = generate_config.get("stream_options")
318
319
  generate_config.setdefault("stream_options", stream_options)
319
320
  generate_config.setdefault("ignore_eos", False)
320
- json_schema = (
321
- generate_config.pop("response_format", {}) # type: ignore
322
- .pop("json_schema", {})
323
- .pop("schema", {})
324
- )
325
- if json_schema:
326
- generate_config.setdefault("json_schema", json.dumps(json_schema)) # type: ignore
321
+ response_format = generate_config.pop("response_format", None)
322
+ if response_format:
323
+ json_schema_config = response_format.pop("json_schema", None)
324
+ json_schema = None
325
+ if "schema_" in json_schema_config:
326
+ json_schema = json_schema_config.pop("schema_")
327
+ elif "schema" in json_schema_config:
328
+ json_schema = json_schema_config.pop("schema")
329
+ if json_schema:
330
+ generate_config.setdefault("json_schema", json.dumps(json_schema)) # type: ignore
327
331
 
328
332
  return generate_config
329
333
 
@@ -356,22 +360,38 @@ class SGLANGModel(LLM):
356
360
 
357
361
@staticmethod
def _convert_state_to_completion_chunk(
    request_id: str, model: str, output_text: str, meta_info: Dict
) -> CompletionChunk:
    """
    Build one streaming completion chunk from a piece of generated text.

    Args:
        request_id (str): Identifier stored in the chunk's ``id`` field.
        model (str): Model name stored in the chunk's ``model`` field.
        output_text (str): Text carried by the chunk's single choice.
        meta_info (Dict): Generation metadata. Must contain
            ``"prompt_tokens"`` and ``"completion_tokens"``; may contain
            ``"finish_reason"`` as a string or as a dict with a ``"type"`` key.

    Returns:
        CompletionChunk: Chunk with one choice (index 0, no logprobs), the
        normalized finish reason and the token usage.

    Raises:
        KeyError: If ``meta_info`` lacks one of the token counters.
    """
    # Normalize finish_reason: a plain string is used as-is, a dict
    # contributes its "type" entry (stringified), anything else means None.
    raw_reason = meta_info.get("finish_reason", None)
    finish_reason: Optional[str] = None
    if isinstance(raw_reason, str):
        finish_reason = raw_reason
    elif isinstance(raw_reason, dict) and "type" in raw_reason:
        reason_type = raw_reason["type"]
        if reason_type is not None:
            finish_reason = str(reason_type)

    only_choice = CompletionChoice(
        text=output_text,
        index=0,
        logprobs=None,
        finish_reason=finish_reason,
    )

    prompt_tokens = meta_info["prompt_tokens"]
    completion_tokens = meta_info["completion_tokens"]
    token_usage = CompletionUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        total_tokens=prompt_tokens + completion_tokens,
    )

    return CompletionChunk(
        id=request_id,
        object="text_completion",
        created=int(time.time()),
        model=model,
        choices=[only_choice],
        usage=token_usage,
    )
377
397
 
@@ -379,12 +399,22 @@ class SGLANGModel(LLM):
379
399
  def _convert_state_to_completion(
380
400
  request_id: str, model: str, output_text: str, meta_info: Dict
381
401
  ) -> Completion:
402
+ finish_reason_raw = meta_info.get("finish_reason", None)
403
+ finish_reason: Optional[str] = None
404
+ if isinstance(finish_reason_raw, dict) and "type" in finish_reason_raw:
405
+ finish_reason = (
406
+ str(finish_reason_raw["type"])
407
+ if finish_reason_raw["type"] is not None
408
+ else None
409
+ )
410
+ elif isinstance(finish_reason_raw, str):
411
+ finish_reason = finish_reason_raw
382
412
  choices = [
383
413
  CompletionChoice(
384
414
  text=output_text,
385
415
  index=0,
386
416
  logprobs=None,
387
- finish_reason=None,
417
+ finish_reason=finish_reason,
388
418
  )
389
419
  ]
390
420
 
@@ -513,7 +543,10 @@ class SGLANGModel(LLM):
513
543
  prompt, image_data, **sanitized_generate_config
514
544
  ):
515
545
  chunk = self._convert_state_to_completion_chunk(
516
- request_id, self.model_uid, output_text=out
546
+ request_id,
547
+ self.model_uid,
548
+ output_text=out,
549
+ meta_info=meta_info,
517
550
  )
518
551
  complete_response += out
519
552
  finish_reason = meta_info["finish_reason"]
@@ -23,12 +23,27 @@ class DeepseekR1ToolParser(ToolParser):
23
23
  Initialize the DeepSeek R1 tool parser.
24
24
  """
25
25
  super().__init__()
26
+
27
+ # Sentinel tokens for streaming mode
28
+ self.think_start_token: str = "<think>"
29
+ self.think_end_token: str = "</think>"
30
+ self.tool_call_start_token: str = "<|tool▁call▁begin|>"
31
+ self.tool_call_end_token: str = "<|tool▁call▁end|>"
32
+
26
33
  # Regex pattern to match DeepSeek R1 tool call format
27
34
  self.tool_calls_regex = (
28
35
  r"<\|tool▁call▁begin|>function<\|tool▁sep|>([^\n]+)\n"
29
36
  r"```json\n(.*?)\n```<\|tool▁call▁end|>"
30
37
  )
31
38
 
39
+ # Regex pattern to match the entire tool-calls wrapper block.
40
+ # We intentionally do NOT match <think> blocks here so that the
41
+ # "text before" chunk will include both the think block and any
42
+ # narrative text up to the tool calls wrapper, yielding exactly two
43
+ # blocks when there is a single tool calls section:
44
+ # [before_text_including_think, tool_calls_wrapper_block]
45
+ self.content_regex = r"(<\|tool▁calls▁begin|>.*?<\|tool▁calls▁end|>)"
46
+
32
47
def extract_tool_calls(
    self, model_output: str
) -> List[Tuple[Optional[str], Optional[str], Optional[dict]]]:
    """
    Extract tool calls and plain content from a complete model output.

    The output is split into blocks by ``self._get_function_calls``. A block
    containing both the tool-call start and end tokens is scanned with
    ``self.tool_calls_regex`` and every call found in it is returned; any
    other non-blank block is returned unchanged as content.

    Args:
        model_output (str): The complete model output to parse.

    Returns:
        List[Tuple[Optional[str], Optional[str], Optional[dict]]]: One
        ``(content, function_name, arguments)`` tuple per content block or
        tool call, in order of appearance:

        - ``(text, None, None)`` for plain content, for a tool-call block
          the regex cannot match, and for a call whose arguments are not a
          valid JSON object (``text`` is then the raw argument string).
        - ``(None, name, arguments)`` for a successfully parsed call.

        Identical calls (same name and same arguments) are reported once.
    """
    # If no tool call tokens, return original output as content
    if self.tool_call_start_token not in model_output:
        return [(model_output, None, None)]

    # Keys of the tool calls already emitted, used for deduplication
    seen_calls = set()
    results: List[Tuple[Optional[str], Optional[str], Optional[dict]]] = []

    for content_block in self._get_function_calls(model_output):
        is_tool_block = (
            self.tool_call_start_token in content_block
            and self.tool_call_end_token in content_block
        )
        if not is_tool_block:
            # Regular content (text or thinking block): keep non-empty text
            if content_block.strip():
                results.append((content_block, None, None))
            continue

        matches = re.findall(self.tool_calls_regex, content_block, re.DOTALL)
        if not matches:
            # Malformed tool call, treat as regular content
            results.append((content_block, None, None))
            continue

        # One wrapper block may hold several calls; handle all of them
        # rather than only the first match.
        for func_name, raw_json in matches:
            try:
                func_and_args = json.loads(raw_json)
                if not isinstance(func_and_args, dict):
                    raise ValueError("tool call arguments must be a JSON object")
            except ValueError as e:
                # JSON parsing failed, treat as raw content
                logger.warning(
                    f"Failed to parse tool call JSON: {raw_json}, error: {e}"
                )
                tool_call_tuple = (raw_json, None, None)
                dedup_key = raw_json
            else:
                tool_call_tuple = (None, func_name, func_and_args)
                # A canonical JSON dump is hashable even when argument
                # values are lists or nested objects, unlike
                # frozenset(items()), and ignores key order.
                dedup_key = (
                    func_name,
                    json.dumps(func_and_args, sort_keys=True),
                )

            # Add to results if not already seen
            if dedup_key not in seen_calls:
                seen_calls.add(dedup_key)
                results.append(tool_call_tuple)

    return results
101
136
 
137
def _get_function_calls(self, model_output: str) -> List[str]:
    """
    Split model output into ordered text and ``content_regex`` blocks.

    Each non-overlapping match of ``self.content_regex`` becomes its own
    block. The text before, between and after matches is kept as separate
    blocks, and empty gaps are dropped, so joining the result reproduces
    ``model_output``. Text that the pattern does not match (including any
    thinking section) is not split further and stays in the surrounding
    text block.

    Args:
        model_output (str): The complete model output to parse.

    Returns:
        List[str]: Content blocks in order of appearance; empty when
        ``model_output`` is empty.
    """
    blocks: List[str] = []
    cursor = 0  # end offset of the previous match
    for match in re.finditer(self.content_regex, model_output, re.DOTALL):
        # Add any text between the previous match and this one
        if match.start() > cursor:
            blocks.append(model_output[cursor : match.start()])
        # Add the matched block itself
        blocks.append(match.group(0))
        cursor = match.end()
    # Add any remaining text after the last match
    if cursor < len(model_output):
        blocks.append(model_output[cursor:])
    return blocks
163
+
102
164
  def extract_tool_calls_streaming(
103
165
  self, previous_text: List[str], current_text: str, delta_text: str
104
166
  ) -> Optional[Any]:
@@ -59,10 +59,28 @@ class QwenToolParser(ToolParser):
59
59
  Returns:
60
60
  str: Extracted JSON string or original string if no match found.
61
61
  """
62
+ # First try to find complete tool calls
62
63
  function_calls = self.tool_call_complete_regex.findall(function_call_str)
63
- if len(function_calls) == 0:
64
- return function_call_str
65
- return function_calls[-1]
64
+ if len(function_calls) > 0:
65
+ return function_calls[-1]
66
+
67
+ # If no complete tool calls found, try to extract from incomplete tool calls
68
+ # Handle cases like <tool_call><tool_call>_city
69
+ if self.tool_call_start_token in function_call_str:
70
+ # Extract content between the last tool_call start token and end of string
71
+ last_start = function_call_str.rfind(self.tool_call_start_token)
72
+ potential_json = function_call_str[
73
+ last_start + len(self.tool_call_start_token) :
74
+ ]
75
+ # Remove any trailing tool_call end tokens
76
+ if self.tool_call_end_token in potential_json:
77
+ potential_json = potential_json.split(self.tool_call_end_token)[0]
78
+ # Clean up any extra whitespace
79
+ potential_json = potential_json.strip()
80
+ if potential_json:
81
+ return potential_json
82
+
83
+ return function_call_str
66
84
 
67
85
  def _parse_json_function_call_stream(
68
86
  self,
@@ -229,7 +247,14 @@ class QwenToolParser(ToolParser):
229
247
  try:
230
248
  parsed_json = self._parse_json_function_call(function_call)
231
249
  res = json.loads(parsed_json, strict=False)
232
- results.append((None, res["name"], res["arguments"]))
250
+ # Validate that we have the required fields
251
+ if "name" in res and "arguments" in res:
252
+ results.append((None, res["name"], res["arguments"]))
253
+ else:
254
+ logger.warning(
255
+ "Invalid tool call format, missing required fields: %s", res
256
+ )
257
+ results.append((function_call, None, None))
233
258
  except Exception as e:
234
259
  logger.error(
235
260
  "Can't parse single qwen tool call output: %s. Error: %s",
@@ -472,6 +472,9 @@ class ChatglmPytorchChatModel(PytorchChatModel):
472
472
  r.prompt = self._process_messages(
473
473
  r.prompt, tools=tools, tool_choice=tool_choice
474
474
  )
475
+ assert isinstance(
476
+ r.prompt, list
477
+ ), "r.prompt must be a list after processing"
475
478
  r.full_prompt = self.get_full_context(
476
479
  r.prompt,
477
480
  self.model_family.chat_template, # type: ignore
@@ -48,6 +48,7 @@ from ..utils import (
48
48
  )
49
49
  from .utils import (
50
50
  _get_pad_param,
51
+ convert_to_cache_cls,
51
52
  get_context_length,
52
53
  get_max_src_len,
53
54
  pad_prefill_tokens,
@@ -548,31 +549,48 @@ class PytorchModel(LLM):
548
549
  So we need pad `0` on the left again.
549
550
  """
550
551
  data = []
551
- max_len = max(r.extra_kwargs["attention_mask_seq_len"] for r in reqs) + 1
552
+ # For decode phase, attention mask should match the full KV cache sequence length
553
+ # All requests in batch should have attention mask of length `seq_length`
554
+ for r in reqs:
555
+ # Get the actual sequence length for this request from its tracking
556
+ if "attention_mask_seq_len" not in r.extra_kwargs:
557
+ # Initialize with the current sequence length (full KV cache length)
558
+ r.extra_kwargs["attention_mask_seq_len"] = seq_length
559
+ else:
560
+ # Use the previously tracked length, but ensure it doesn't exceed current seq_length
561
+ tracked_len = r.extra_kwargs["attention_mask_seq_len"]
562
+ r.extra_kwargs["attention_mask_seq_len"] = min(tracked_len, seq_length)
563
+
564
+ # For decode phase after KV cache merge, all requests should have attention mask
565
+ # that matches the merged sequence length
552
566
  for r in reqs:
553
- r.extra_kwargs["attention_mask_seq_len"] += 1
554
567
  real_len = r.extra_kwargs["attention_mask_seq_len"]
555
- pad_len = max_len - real_len
556
568
 
557
- if self._tokenizer.padding_side == "left":
558
- x = torch.cat(
559
- [
560
- (
561
- torch.full((pad_len,), 0, dtype=torch.long)
562
- if pad_len > 0
563
- else torch.tensor([], dtype=torch.long)
564
- ),
565
- torch.ones((real_len,), dtype=torch.long),
566
- ]
567
- )
569
+ # The attention mask should cover the full sequence length
570
+ if real_len < seq_length:
571
+ # Pad with zeros on the left to reach full sequence length
572
+ pad_len = seq_length - real_len
573
+
574
+ if self._tokenizer.padding_side == "left":
575
+ x = torch.cat(
576
+ [
577
+ torch.full((pad_len,), 0, dtype=torch.long),
578
+ torch.ones((real_len,), dtype=torch.long),
579
+ ]
580
+ )
581
+ else:
582
+ x = torch.cat(
583
+ [
584
+ torch.ones((real_len,), dtype=torch.long),
585
+ torch.full((pad_len,), 0, dtype=torch.long),
586
+ ]
587
+ )
568
588
  else:
569
- x = torch.cat(
570
- [
571
- torch.ones((real_len,), dtype=torch.long),
572
- torch.full((pad_len,), 0, dtype=torch.long),
573
- ]
574
- )
589
+ # Already at correct length
590
+ x = torch.ones((real_len,), dtype=torch.long)
591
+
575
592
  data.append(x)
593
+
576
594
  return torch.stack(data).to(self._device)
577
595
 
578
596
  def build_prefill_position_ids(
@@ -713,30 +731,105 @@ class PytorchModel(LLM):
713
731
  from torch.nn.functional import pad
714
732
  from transformers import DynamicCache
715
733
 
734
+ # Handle case where past_cache is None
735
+ if past_cache is None:
736
+ return new_cache
737
+
738
+ # Convert both caches to DynamicCache if not already
739
+ if not isinstance(past_cache, DynamicCache):
740
+ past_cache = convert_to_cache_cls(past_cache)
741
+ if not isinstance(new_cache, DynamicCache):
742
+ new_cache = convert_to_cache_cls(new_cache)
743
+
716
744
  _, seq_len_idx = self.get_batch_size_and_seq_len_indexes_from_kv()
717
- past_seq_len = past_cache[0][0].shape[seq_len_idx]
718
- new_seq_len = new_cache[0][0].shape[seq_len_idx]
745
+
746
+ # Handle empty caches
747
+ if len(past_cache) == 0:
748
+ return new_cache
749
+ if len(new_cache) == 0:
750
+ return past_cache
751
+
752
+ # Get first layer seq_len safely
753
+ past_first = past_cache[0] if len(past_cache) > 0 else (None, None)
754
+ new_first = new_cache[0] if len(new_cache) > 0 else (None, None)
755
+
756
+ if past_first[0] is None or past_first[1] is None:
757
+ return new_cache
758
+ if new_first[0] is None or new_first[1] is None:
759
+ return past_cache
760
+
761
+ past_seq_len = past_first[0].shape[seq_len_idx]
762
+ new_seq_len = new_first[0].shape[seq_len_idx]
763
+
764
+ # Pad the shorter cache
719
765
  if past_seq_len != new_seq_len:
720
- padding_target = new_cache if past_seq_len > new_seq_len else past_cache
721
- padding_len = abs(past_seq_len - new_seq_len)
766
+ if past_seq_len > new_seq_len:
767
+ padding_target = new_cache
768
+ padding_len = past_seq_len - new_seq_len
769
+ else:
770
+ padding_target = past_cache
771
+ padding_len = new_seq_len - past_seq_len
772
+
722
773
  pad_param = _get_pad_param(seq_len_idx, padding_len)
723
774
  for idx in range(len(padding_target)):
724
775
  k = padding_target.key_cache[idx]
725
776
  v = padding_target.value_cache[idx]
726
- _k = pad(k, pad_param)
727
- _v = pad(v, pad_param)
728
- padding_target.key_cache[idx] = _k
729
- padding_target.value_cache[idx] = _v
777
+ if k is not None and v is not None:
778
+ padding_target.key_cache[idx] = pad(k, pad_param)
779
+ padding_target.value_cache[idx] = pad(v, pad_param)
730
780
 
781
+ # Merge caches
731
782
  ret_kv = DynamicCache()
732
- for idx in range(len(past_cache)):
733
- k1, k2 = new_cache.key_cache[idx], past_cache.key_cache[idx]
734
- v1, v2 = new_cache.value_cache[idx], past_cache.value_cache[idx]
735
- ret_kv.update(
736
- torch.cat((k1, k2), 0).contiguous(),
737
- torch.cat((v1, v2), 0).contiguous(),
738
- idx,
739
- )
783
+ max_layers = max(len(past_cache), len(new_cache))
784
+
785
+ for idx in range(max_layers):
786
+ past_k = past_cache.key_cache[idx] if idx < len(past_cache) else None
787
+ past_v = past_cache.value_cache[idx] if idx < len(past_cache) else None
788
+ new_k = new_cache.key_cache[idx] if idx < len(new_cache) else None
789
+ new_v = new_cache.value_cache[idx] if idx < len(new_cache) else None
790
+
791
+ if past_k is not None and new_k is not None:
792
+ # Both layers exist - validate tensor dimensions before concatenation
793
+ if past_k.dim() != new_k.dim():
794
+ logger.error(
795
+ f"KV cache tensor dimension mismatch at layer {idx}: "
796
+ f"past_k.dim()={past_k.dim()}, new_k.dim()={new_k.dim()}"
797
+ )
798
+ # Use the cache with higher batch size
799
+ if past_k.shape[0] >= new_k.shape[0]:
800
+ ret_kv.update(past_k, past_v, idx)
801
+ else:
802
+ ret_kv.update(new_k, new_v, idx)
803
+ continue
804
+
805
+ if past_k.shape[1:] == new_k.shape[1:]:
806
+ # Shapes are compatible, concatenate along batch dimension
807
+ ret_kv.update(
808
+ torch.cat((new_k, past_k), 0).contiguous(),
809
+ torch.cat((new_v, past_v), 0).contiguous(),
810
+ idx,
811
+ )
812
+ else:
813
+ # Detailed logging for shape mismatch
814
+ logger.warning(
815
+ f"KV cache shape mismatch at layer {idx}: "
816
+ f"past_k.shape={past_k.shape}, new_k.shape={new_k.shape}. "
817
+ f"This may be due to inconsistent batch sizes in continuous batching."
818
+ )
819
+
820
+ # Choose the cache with larger batch size to preserve more data
821
+ if past_k.shape[0] >= new_k.shape[0]:
822
+ ret_kv.update(past_k, past_v, idx)
823
+ else:
824
+ ret_kv.update(new_k, new_v, idx)
825
+ elif past_k is not None:
826
+ ret_kv.update(past_k, past_v, idx)
827
+ elif new_k is not None:
828
+ ret_kv.update(new_k, new_v, idx)
829
+ else:
830
+ # both None, fill with None
831
+ ret_kv.update(None, None, idx)
832
+
740
833
  return ret_kv
741
834
 
742
835
  def prepare_batch_inference(self, req_list: List[InferenceRequest]):