xinference 1.10.0__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (317)
  1. xinference/_version.py +3 -3
  2. xinference/api/restful_api.py +11 -28
  3. xinference/client/restful/async_restful_client.py +20 -3
  4. xinference/client/restful/restful_client.py +20 -3
  5. xinference/core/supervisor.py +87 -53
  6. xinference/core/worker.py +10 -0
  7. xinference/deploy/cmdline.py +15 -0
  8. xinference/model/audio/core.py +21 -6
  9. xinference/model/audio/indextts2.py +166 -0
  10. xinference/model/audio/model_spec.json +38 -1
  11. xinference/model/image/model_spec.json +69 -0
  12. xinference/model/image/stable_diffusion/core.py +13 -4
  13. xinference/model/llm/__init__.py +4 -0
  14. xinference/model/llm/llm_family.json +464 -2
  15. xinference/model/llm/sglang/core.py +30 -11
  16. xinference/model/llm/tool_parsers/deepseek_r1_tool_parser.py +94 -32
  17. xinference/model/llm/transformers/multimodal/qwen2_vl.py +34 -8
  18. xinference/model/llm/utils.py +12 -9
  19. xinference/model/llm/vllm/core.py +93 -17
  20. xinference/thirdparty/audiotools/__init__.py +10 -0
  21. xinference/thirdparty/audiotools/core/__init__.py +4 -0
  22. xinference/thirdparty/audiotools/core/audio_signal.py +1682 -0
  23. xinference/thirdparty/audiotools/core/display.py +194 -0
  24. xinference/thirdparty/audiotools/core/dsp.py +390 -0
  25. xinference/thirdparty/audiotools/core/effects.py +647 -0
  26. xinference/thirdparty/audiotools/core/ffmpeg.py +211 -0
  27. xinference/thirdparty/audiotools/core/loudness.py +320 -0
  28. xinference/thirdparty/audiotools/core/playback.py +252 -0
  29. xinference/thirdparty/audiotools/core/templates/__init__.py +0 -0
  30. xinference/thirdparty/audiotools/core/templates/headers.html +322 -0
  31. xinference/thirdparty/audiotools/core/templates/pandoc.css +407 -0
  32. xinference/thirdparty/audiotools/core/templates/widget.html +52 -0
  33. xinference/thirdparty/audiotools/core/util.py +671 -0
  34. xinference/thirdparty/audiotools/core/whisper.py +97 -0
  35. xinference/thirdparty/audiotools/data/__init__.py +3 -0
  36. xinference/thirdparty/audiotools/data/datasets.py +517 -0
  37. xinference/thirdparty/audiotools/data/preprocess.py +81 -0
  38. xinference/thirdparty/audiotools/data/transforms.py +1592 -0
  39. xinference/thirdparty/audiotools/metrics/__init__.py +6 -0
  40. xinference/thirdparty/audiotools/metrics/distance.py +131 -0
  41. xinference/thirdparty/audiotools/metrics/quality.py +159 -0
  42. xinference/thirdparty/audiotools/metrics/spectral.py +247 -0
  43. xinference/thirdparty/audiotools/ml/__init__.py +5 -0
  44. xinference/thirdparty/audiotools/ml/accelerator.py +184 -0
  45. xinference/thirdparty/audiotools/ml/decorators.py +440 -0
  46. xinference/thirdparty/audiotools/ml/experiment.py +90 -0
  47. xinference/thirdparty/audiotools/ml/layers/__init__.py +2 -0
  48. xinference/thirdparty/audiotools/ml/layers/base.py +328 -0
  49. xinference/thirdparty/audiotools/ml/layers/spectral_gate.py +127 -0
  50. xinference/thirdparty/audiotools/post.py +140 -0
  51. xinference/thirdparty/audiotools/preference.py +600 -0
  52. xinference/thirdparty/indextts/BigVGAN/ECAPA_TDNN.py +656 -0
  53. xinference/thirdparty/indextts/BigVGAN/__init__.py +0 -0
  54. xinference/thirdparty/indextts/BigVGAN/activations.py +122 -0
  55. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/__init__.py +0 -0
  56. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/.gitignore +1 -0
  57. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/__init__.py +0 -0
  58. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/activation1d.py +76 -0
  59. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  60. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu +256 -0
  61. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/compat.h +29 -0
  62. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/load.py +121 -0
  63. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/cuda/type_shim.h +92 -0
  64. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/__init__.py +6 -0
  65. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/act.py +31 -0
  66. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/filter.py +102 -0
  67. xinference/thirdparty/indextts/BigVGAN/alias_free_activation/torch/resample.py +58 -0
  68. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/__init__.py +6 -0
  69. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/act.py +29 -0
  70. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/filter.py +96 -0
  71. xinference/thirdparty/indextts/BigVGAN/alias_free_torch/resample.py +49 -0
  72. xinference/thirdparty/indextts/BigVGAN/bigvgan.py +534 -0
  73. xinference/thirdparty/indextts/BigVGAN/models.py +451 -0
  74. xinference/thirdparty/indextts/BigVGAN/nnet/CNN.py +546 -0
  75. xinference/thirdparty/indextts/BigVGAN/nnet/__init__.py +0 -0
  76. xinference/thirdparty/indextts/BigVGAN/nnet/linear.py +89 -0
  77. xinference/thirdparty/indextts/BigVGAN/nnet/normalization.py +670 -0
  78. xinference/thirdparty/indextts/BigVGAN/utils.py +101 -0
  79. xinference/thirdparty/indextts/__init__.py +0 -0
  80. xinference/thirdparty/indextts/cli.py +65 -0
  81. xinference/thirdparty/indextts/gpt/__init__.py +0 -0
  82. xinference/thirdparty/indextts/gpt/conformer/__init__.py +0 -0
  83. xinference/thirdparty/indextts/gpt/conformer/attention.py +312 -0
  84. xinference/thirdparty/indextts/gpt/conformer/embedding.py +163 -0
  85. xinference/thirdparty/indextts/gpt/conformer/subsampling.py +348 -0
  86. xinference/thirdparty/indextts/gpt/conformer_encoder.py +520 -0
  87. xinference/thirdparty/indextts/gpt/model.py +713 -0
  88. xinference/thirdparty/indextts/gpt/model_v2.py +747 -0
  89. xinference/thirdparty/indextts/gpt/perceiver.py +317 -0
  90. xinference/thirdparty/indextts/gpt/transformers_beam_search.py +1013 -0
  91. xinference/thirdparty/indextts/gpt/transformers_generation_utils.py +4747 -0
  92. xinference/thirdparty/indextts/gpt/transformers_gpt2.py +1878 -0
  93. xinference/thirdparty/indextts/gpt/transformers_modeling_utils.py +5525 -0
  94. xinference/thirdparty/indextts/infer.py +690 -0
  95. xinference/thirdparty/indextts/infer_v2.py +739 -0
  96. xinference/thirdparty/indextts/s2mel/dac/__init__.py +16 -0
  97. xinference/thirdparty/indextts/s2mel/dac/__main__.py +36 -0
  98. xinference/thirdparty/indextts/s2mel/dac/model/__init__.py +4 -0
  99. xinference/thirdparty/indextts/s2mel/dac/model/base.py +294 -0
  100. xinference/thirdparty/indextts/s2mel/dac/model/dac.py +400 -0
  101. xinference/thirdparty/indextts/s2mel/dac/model/discriminator.py +228 -0
  102. xinference/thirdparty/indextts/s2mel/dac/model/encodec.py +320 -0
  103. xinference/thirdparty/indextts/s2mel/dac/nn/__init__.py +3 -0
  104. xinference/thirdparty/indextts/s2mel/dac/nn/layers.py +33 -0
  105. xinference/thirdparty/indextts/s2mel/dac/nn/loss.py +368 -0
  106. xinference/thirdparty/indextts/s2mel/dac/nn/quantize.py +339 -0
  107. xinference/thirdparty/indextts/s2mel/dac/utils/__init__.py +123 -0
  108. xinference/thirdparty/indextts/s2mel/dac/utils/decode.py +95 -0
  109. xinference/thirdparty/indextts/s2mel/dac/utils/encode.py +94 -0
  110. xinference/thirdparty/indextts/s2mel/hf_utils.py +12 -0
  111. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/__init__.py +5 -0
  112. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/act.py +29 -0
  113. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/filter.py +96 -0
  114. xinference/thirdparty/indextts/s2mel/modules/alias_free_torch/resample.py +57 -0
  115. xinference/thirdparty/indextts/s2mel/modules/audio.py +82 -0
  116. xinference/thirdparty/indextts/s2mel/modules/bigvgan/activations.py +120 -0
  117. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/__init__.py +0 -0
  118. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/activation1d.py +77 -0
  119. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp +23 -0
  120. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu +246 -0
  121. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/compat.h +29 -0
  122. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/load.py +86 -0
  123. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/cuda/type_shim.h +92 -0
  124. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/__init__.py +6 -0
  125. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/act.py +30 -0
  126. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/filter.py +101 -0
  127. xinference/thirdparty/indextts/s2mel/modules/bigvgan/alias_free_activation/torch/resample.py +58 -0
  128. xinference/thirdparty/indextts/s2mel/modules/bigvgan/bigvgan.py +492 -0
  129. xinference/thirdparty/indextts/s2mel/modules/bigvgan/config.json +63 -0
  130. xinference/thirdparty/indextts/s2mel/modules/bigvgan/env.py +18 -0
  131. xinference/thirdparty/indextts/s2mel/modules/bigvgan/meldataset.py +354 -0
  132. xinference/thirdparty/indextts/s2mel/modules/bigvgan/utils.py +99 -0
  133. xinference/thirdparty/indextts/s2mel/modules/campplus/DTDNN.py +115 -0
  134. xinference/thirdparty/indextts/s2mel/modules/campplus/classifier.py +70 -0
  135. xinference/thirdparty/indextts/s2mel/modules/campplus/layers.py +253 -0
  136. xinference/thirdparty/indextts/s2mel/modules/commons.py +632 -0
  137. xinference/thirdparty/indextts/s2mel/modules/diffusion_transformer.py +257 -0
  138. xinference/thirdparty/indextts/s2mel/modules/encodec.py +292 -0
  139. xinference/thirdparty/indextts/s2mel/modules/flow_matching.py +171 -0
  140. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/generate.py +436 -0
  141. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/model.py +360 -0
  142. xinference/thirdparty/indextts/s2mel/modules/gpt_fast/quantize.py +622 -0
  143. xinference/thirdparty/indextts/s2mel/modules/hifigan/f0_predictor.py +55 -0
  144. xinference/thirdparty/indextts/s2mel/modules/hifigan/generator.py +454 -0
  145. xinference/thirdparty/indextts/s2mel/modules/layers.py +354 -0
  146. xinference/thirdparty/indextts/s2mel/modules/length_regulator.py +141 -0
  147. xinference/thirdparty/indextts/s2mel/modules/openvoice/__init__.py +0 -0
  148. xinference/thirdparty/indextts/s2mel/modules/openvoice/api.py +186 -0
  149. xinference/thirdparty/indextts/s2mel/modules/openvoice/attentions.py +465 -0
  150. xinference/thirdparty/indextts/s2mel/modules/openvoice/checkpoints_v2/converter/config.json +57 -0
  151. xinference/thirdparty/indextts/s2mel/modules/openvoice/commons.py +160 -0
  152. xinference/thirdparty/indextts/s2mel/modules/openvoice/mel_processing.py +183 -0
  153. xinference/thirdparty/indextts/s2mel/modules/openvoice/models.py +499 -0
  154. xinference/thirdparty/indextts/s2mel/modules/openvoice/modules.py +598 -0
  155. xinference/thirdparty/indextts/s2mel/modules/openvoice/openvoice_app.py +275 -0
  156. xinference/thirdparty/indextts/s2mel/modules/openvoice/se_extractor.py +153 -0
  157. xinference/thirdparty/indextts/s2mel/modules/openvoice/transforms.py +209 -0
  158. xinference/thirdparty/indextts/s2mel/modules/openvoice/utils.py +194 -0
  159. xinference/thirdparty/indextts/s2mel/modules/quantize.py +229 -0
  160. xinference/thirdparty/indextts/s2mel/modules/rmvpe.py +631 -0
  161. xinference/thirdparty/indextts/s2mel/modules/vocos/__init__.py +4 -0
  162. xinference/thirdparty/indextts/s2mel/modules/vocos/heads.py +164 -0
  163. xinference/thirdparty/indextts/s2mel/modules/vocos/helpers.py +71 -0
  164. xinference/thirdparty/indextts/s2mel/modules/vocos/loss.py +114 -0
  165. xinference/thirdparty/indextts/s2mel/modules/vocos/models.py +118 -0
  166. xinference/thirdparty/indextts/s2mel/modules/vocos/modules.py +213 -0
  167. xinference/thirdparty/indextts/s2mel/modules/vocos/pretrained.py +51 -0
  168. xinference/thirdparty/indextts/s2mel/modules/vocos/spectral_ops.py +192 -0
  169. xinference/thirdparty/indextts/s2mel/modules/wavenet.py +174 -0
  170. xinference/thirdparty/indextts/s2mel/optimizers.py +96 -0
  171. xinference/thirdparty/indextts/s2mel/wav2vecbert_extract.py +148 -0
  172. xinference/thirdparty/indextts/utils/__init__.py +0 -0
  173. xinference/thirdparty/indextts/utils/arch_util.py +120 -0
  174. xinference/thirdparty/indextts/utils/checkpoint.py +34 -0
  175. xinference/thirdparty/indextts/utils/common.py +121 -0
  176. xinference/thirdparty/indextts/utils/feature_extractors.py +50 -0
  177. xinference/thirdparty/indextts/utils/front.py +536 -0
  178. xinference/thirdparty/indextts/utils/maskgct/models/codec/__init__.py +0 -0
  179. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/codec.py +427 -0
  180. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/__init__.py +11 -0
  181. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/factorized_vector_quantize.py +150 -0
  182. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/lookup_free_quantize.py +77 -0
  183. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/residual_vq.py +177 -0
  184. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/quantize/vector_quantize.py +401 -0
  185. xinference/thirdparty/indextts/utils/maskgct/models/codec/amphion_codec/vocos.py +881 -0
  186. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_dataset.py +264 -0
  187. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_inference.py +515 -0
  188. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_sampler.py +126 -0
  189. xinference/thirdparty/indextts/utils/maskgct/models/codec/codec_trainer.py +166 -0
  190. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/__init__.py +0 -0
  191. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/__init__.py +5 -0
  192. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/act.py +29 -0
  193. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/filter.py +96 -0
  194. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/alias_free_torch/resample.py +57 -0
  195. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_dataset.py +98 -0
  196. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_inference.py +137 -0
  197. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/facodec_trainer.py +776 -0
  198. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/__init__.py +1 -0
  199. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +0 -0
  200. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/JDC/model.py +219 -0
  201. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/attentions.py +437 -0
  202. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/commons.py +331 -0
  203. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/gradient_reversal.py +35 -0
  204. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/layers.py +460 -0
  205. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/quantize.py +741 -0
  206. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/style_encoder.py +110 -0
  207. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/modules/wavenet.py +224 -0
  208. xinference/thirdparty/indextts/utils/maskgct/models/codec/facodec/optimizer.py +104 -0
  209. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/repcodec_model.py +210 -0
  210. xinference/thirdparty/indextts/utils/maskgct/models/codec/kmeans/vocos.py +850 -0
  211. xinference/thirdparty/indextts/utils/maskgct/models/codec/melvqgan/melspec.py +108 -0
  212. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/README.md +216 -0
  213. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/__init__.py +6 -0
  214. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/__init__.py +5 -0
  215. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/act.py +29 -0
  216. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
  217. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/alias_free_torch/resample.py +57 -0
  218. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/facodec.py +1222 -0
  219. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/gradient_reversal.py +35 -0
  220. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/melspec.py +102 -0
  221. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/__init__.py +7 -0
  222. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/fvq.py +116 -0
  223. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/quantize/rvq.py +87 -0
  224. xinference/thirdparty/indextts/utils/maskgct/models/codec/ns3_codec/transformer.py +234 -0
  225. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/model.py +184 -0
  226. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/__init__.py +27 -0
  227. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/conv.py +346 -0
  228. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/lstm.py +46 -0
  229. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/norm.py +37 -0
  230. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/__init__.py +14 -0
  231. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/ac.py +317 -0
  232. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/core_vq.py +388 -0
  233. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/distrib.py +135 -0
  234. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/quantization/vq.py +125 -0
  235. xinference/thirdparty/indextts/utils/maskgct/models/codec/speechtokenizer/modules/seanet.py +414 -0
  236. xinference/thirdparty/indextts/utils/maskgct/models/codec/vevo/vevo_repcodec.py +592 -0
  237. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/ckpt/wav2vec2bert_stats.pt +0 -0
  238. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/llama_nar.py +650 -0
  239. xinference/thirdparty/indextts/utils/maskgct/models/tts/maskgct/maskgct_s2a.py +503 -0
  240. xinference/thirdparty/indextts/utils/maskgct_utils.py +259 -0
  241. xinference/thirdparty/indextts/utils/text_utils.py +41 -0
  242. xinference/thirdparty/indextts/utils/typical_sampling.py +30 -0
  243. xinference/thirdparty/indextts/utils/utils.py +93 -0
  244. xinference/thirdparty/indextts/utils/webui_utils.py +42 -0
  245. xinference/thirdparty/indextts/utils/xtransformers.py +1247 -0
  246. xinference/thirdparty/indextts/vqvae/__init__.py +0 -0
  247. xinference/thirdparty/indextts/vqvae/xtts_dvae.py +395 -0
  248. xinference/ui/gradio/media_interface.py +66 -8
  249. xinference/ui/web/ui/build/asset-manifest.json +6 -6
  250. xinference/ui/web/ui/build/index.html +1 -1
  251. xinference/ui/web/ui/build/static/css/main.5ea97072.css +2 -0
  252. xinference/ui/web/ui/build/static/css/main.5ea97072.css.map +1 -0
  253. xinference/ui/web/ui/build/static/js/main.d192c4f3.js +3 -0
  254. xinference/ui/web/ui/build/static/js/{main.1086c759.js.LICENSE.txt → main.d192c4f3.js.LICENSE.txt} +0 -7
  255. xinference/ui/web/ui/build/static/js/main.d192c4f3.js.map +1 -0
  256. xinference/ui/web/ui/node_modules/.cache/babel-loader/089c38df5f52348d212ed868dda5c518a42e0c2762caed4175487c0405830c35.json +1 -0
  257. xinference/ui/web/ui/node_modules/.cache/babel-loader/2b6e3a5b6eb2c5c5f2d007e68cd46c372721cd52bf63508adcdb21ecf79241d8.json +1 -0
  258. xinference/ui/web/ui/node_modules/.cache/babel-loader/2d887825fd07a56f872eda4420da25fba0b5b62a23bdcc6c6da1a5281887f618.json +1 -0
  259. xinference/ui/web/ui/node_modules/.cache/babel-loader/4001f9c3e64e73a4f2158826650c174a59d5e3f89ddecddf17cbb6bb688cc4ca.json +1 -0
  260. xinference/ui/web/ui/node_modules/.cache/babel-loader/4a7018a69e6b7f90fc313248c2aa86f2a8f1eb1db120df586047a8023549b44b.json +1 -0
  261. xinference/ui/web/ui/node_modules/.cache/babel-loader/64b12aaa1c1d1bf53820ada8a63769067c0ccc5aab46b32348eb1917ae7f2a11.json +1 -0
  262. xinference/ui/web/ui/node_modules/.cache/babel-loader/7275b67c78ec76ce38a686bb8a576d8c9cecf54e1573614c84859d538efb9be5.json +1 -0
  263. xinference/ui/web/ui/node_modules/.cache/babel-loader/a68b6ee3b31eadc051fb95ce8f8ccb9c2e8b52c60f290dbab545a1917e065282.json +1 -0
  264. xinference/ui/web/ui/node_modules/.cache/babel-loader/ae8771cc37693feb160fa8727231312a0c54ef2d1d1ca893be568cd70016ca7e.json +1 -0
  265. xinference/ui/web/ui/node_modules/.cache/babel-loader/bb4e8722d2d41d87f1fce3661bc8937bffe9448e231fc5f0462630849e851592.json +1 -0
  266. xinference/ui/web/ui/node_modules/.cache/babel-loader/be6aada1ee4adc2bbf65dbe56d17db32bb3b5478be05d6b527805a8ba6cfb2b9.json +1 -0
  267. xinference/ui/web/ui/node_modules/.cache/babel-loader/de91c352653c233cf0cb6674e6e04049a44fd0e1156560de65d5c4620521391e.json +1 -0
  268. xinference/ui/web/ui/node_modules/.cache/babel-loader/e85f7002fc325c83b9c9cd8a1619e5b3ebc701d30e811afc284b88e6ae710cb5.json +1 -0
  269. xinference/ui/web/ui/node_modules/.cache/babel-loader/e8b603c78944bf3d213639078bfe155ff5c0dfa4048a93cbb967cad6a4eb4ff3.json +1 -0
  270. xinference/ui/web/ui/node_modules/.cache/babel-loader/f05535160a508b2a312de546a6de234776c613db276479ea4253c0b1bdeeb7d6.json +1 -0
  271. xinference/ui/web/ui/node_modules/.cache/babel-loader/f09ba9e11106bd59a0de10cc85c55084097729dcab575f43dfcf07375961ed87.json +1 -0
  272. xinference/ui/web/ui/node_modules/.cache/babel-loader/f995a2425dfb0822fd07127f66ffe9b026883bc156b402eb8bd0b83d52460a93.json +1 -0
  273. xinference/ui/web/ui/node_modules/.package-lock.json +0 -33
  274. xinference/ui/web/ui/package-lock.json +0 -34
  275. xinference/ui/web/ui/package.json +0 -1
  276. xinference/ui/web/ui/src/locales/en.json +9 -3
  277. xinference/ui/web/ui/src/locales/ja.json +9 -3
  278. xinference/ui/web/ui/src/locales/ko.json +9 -3
  279. xinference/ui/web/ui/src/locales/zh.json +9 -3
  280. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/METADATA +18 -2
  281. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/RECORD +285 -67
  282. xinference/ui/web/ui/build/static/css/main.013f296b.css +0 -2
  283. xinference/ui/web/ui/build/static/css/main.013f296b.css.map +0 -1
  284. xinference/ui/web/ui/build/static/js/main.1086c759.js +0 -3
  285. xinference/ui/web/ui/build/static/js/main.1086c759.js.map +0 -1
  286. xinference/ui/web/ui/node_modules/.cache/babel-loader/0b0f77000cc1b482ca091cfbcae511dfe02f08916971645fad21d0b1234d04a2.json +0 -1
  287. xinference/ui/web/ui/node_modules/.cache/babel-loader/1c5f8ff423a7c9202bea60b15680f04b1e9964b445b0da3f86c6ff70cf24e797.json +0 -1
  288. xinference/ui/web/ui/node_modules/.cache/babel-loader/44ce7993e344980e3ed4f13e8f69237d4a5dfc60e37ca6b54f51f8ee1357bd67.json +0 -1
  289. xinference/ui/web/ui/node_modules/.cache/babel-loader/4aec1cc414ac3ebb3481d3d915e4db597d9127de813291346eacb8554ab170d4.json +0 -1
  290. xinference/ui/web/ui/node_modules/.cache/babel-loader/644cfec52f3c57a6e222ce60f112237a1efefe9835efd9aad857a685f53d8eed.json +0 -1
  291. xinference/ui/web/ui/node_modules/.cache/babel-loader/663436f72af53fe0d72394f56d003fa4e0bba489e5bb4e483fd34b00f84637f7.json +0 -1
  292. xinference/ui/web/ui/node_modules/.cache/babel-loader/69db82ca9bfe27fe417cc6cf2b1716b09be9c6f0cd198530f12bfc60e801bbcf.json +0 -1
  293. xinference/ui/web/ui/node_modules/.cache/babel-loader/85087e27618d740c236bf159f30e0219db443ab55f0997388eed5fde6f9e90cc.json +0 -1
  294. xinference/ui/web/ui/node_modules/.cache/babel-loader/88b07838348864aa86c672be3bbca1e9f58f6f3a2881b32070ec27f4e7b449d1.json +0 -1
  295. xinference/ui/web/ui/node_modules/.cache/babel-loader/8b8cd408ccfbe115acef27ccfa5b233da8597131a2a5712add13e1e4d5d4504b.json +0 -1
  296. xinference/ui/web/ui/node_modules/.cache/babel-loader/a23824fe746b9c6ca5eee9159b5764d1ff1653c1d856288c0f75c742bbb0023b.json +0 -1
  297. xinference/ui/web/ui/node_modules/.cache/babel-loader/a3eb18af328280b139693c9092dff2a0ef8c9a967e6c8956ceee0996611f1984.json +0 -1
  298. xinference/ui/web/ui/node_modules/.cache/babel-loader/bc1aacc65a102db325ca61bcd2f681e1ae22c36a1f1d98a6ff5e4ad49dc7544f.json +0 -1
  299. xinference/ui/web/ui/node_modules/.cache/babel-loader/c682fd521747c19dae437d83ce3235a306ce6b68e24a117bc57c27ebb8d1f1ca.json +0 -1
  300. xinference/ui/web/ui/node_modules/.cache/babel-loader/d5c224be7081f18cba1678b7874a9782eba895df004874ff8f243f94ba79942a.json +0 -1
  301. xinference/ui/web/ui/node_modules/.cache/babel-loader/f7f18bfb539b036a6a342176dd98a85df5057a884a8da978d679f2a0264883d0.json +0 -1
  302. xinference/ui/web/ui/node_modules/clipboard/.babelrc.json +0 -11
  303. xinference/ui/web/ui/node_modules/clipboard/.eslintrc.json +0 -24
  304. xinference/ui/web/ui/node_modules/clipboard/.prettierrc.json +0 -9
  305. xinference/ui/web/ui/node_modules/clipboard/bower.json +0 -18
  306. xinference/ui/web/ui/node_modules/clipboard/composer.json +0 -25
  307. xinference/ui/web/ui/node_modules/clipboard/package.json +0 -63
  308. xinference/ui/web/ui/node_modules/delegate/package.json +0 -31
  309. xinference/ui/web/ui/node_modules/good-listener/bower.json +0 -11
  310. xinference/ui/web/ui/node_modules/good-listener/package.json +0 -35
  311. xinference/ui/web/ui/node_modules/select/bower.json +0 -13
  312. xinference/ui/web/ui/node_modules/select/package.json +0 -29
  313. xinference/ui/web/ui/node_modules/tiny-emitter/package.json +0 -53
  314. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/WHEEL +0 -0
  315. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/entry_points.txt +0 -0
  316. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/licenses/LICENSE +0 -0
  317. {xinference-1.10.0.dist-info → xinference-1.10.1.dist-info}/top_level.txt +0 -0
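
The headline change in this release is built-in support for the IndexTTS2 text-to-speech model: a new xinference/model/audio/indextts2.py wrapper, new entries in the audio and image model_spec.json files and llm_family.json, and the vendored audiotools and indextts packages listed above. The file reproduced in full below is the vendored xinference/thirdparty/indextts/infer_v2.py. As a rough sketch of how the new model would typically be driven through the xinference Python client once this wheel is installed, see the snippet that follows; the registered model name "IndexTTS-2" and the prompt_speech keyword for the reference voice are illustrative assumptions, not confirmed by this diff.

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
# model_name is assumed here; check the audio models registered by this release
model_uid = client.launch_model(model_name="IndexTTS-2", model_type="audio")
model = client.get_model(model_uid)

# reference voice used to clone the speaker (any short wav clip)
with open("examples/voice_01.wav", "rb") as f:
    reference_voice = f.read()

# speech() is the OpenAI-style TTS call; extra keyword arguments are forwarded
# to the model implementation, so prompt_speech below is a hypothetical kwarg.
audio_bytes = model.speech(
    "Hello, this is a quick IndexTTS2 smoke test.",
    prompt_speech=reference_voice,
)
with open("gen.wav", "wb") as out:
    out.write(audio_bytes)
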
xinference/thirdparty/indextts/infer_v2.py
@@ -0,0 +1,739 @@
+ import os
+ from subprocess import CalledProcessError
+
+ os.environ['HF_HUB_CACHE'] = './checkpoints/hf_cache'
+ import json
+ import re
+ import time
+ import librosa
+ import torch
+ import torchaudio
+ from torch.nn.utils.rnn import pad_sequence
+
+ import warnings
+
+ warnings.filterwarnings("ignore", category=FutureWarning)
+ warnings.filterwarnings("ignore", category=UserWarning)
+
+ from omegaconf import OmegaConf
+
+ from indextts.gpt.model_v2 import UnifiedVoice
+ from indextts.utils.maskgct_utils import build_semantic_model, build_semantic_codec
+ from indextts.utils.checkpoint import load_checkpoint
+ from indextts.utils.front import TextNormalizer, TextTokenizer
+
+ from indextts.s2mel.modules.commons import load_checkpoint2, MyModel
+ from indextts.s2mel.modules.bigvgan import bigvgan
+ from indextts.s2mel.modules.campplus.DTDNN import CAMPPlus
+ from indextts.s2mel.modules.audio import mel_spectrogram
+
+ from transformers import AutoTokenizer
+ from modelscope import AutoModelForCausalLM
+ from huggingface_hub import hf_hub_download
+ import safetensors
+ from transformers import SeamlessM4TFeatureExtractor
+ import random
+ import torch.nn.functional as F
+
+ class IndexTTS2:
+     def __init__(
+             self, cfg_path="checkpoints/config.yaml", model_dir="checkpoints", use_fp16=False, device=None,
+             use_cuda_kernel=None,use_deepspeed=False
+     ):
+         """
+         Args:
+             cfg_path (str): path to the config file.
+             model_dir (str): path to the model directory.
+             use_fp16 (bool): whether to use fp16.
+             device (str): device to use (e.g., 'cuda:0', 'cpu'). If None, it will be set automatically based on the availability of CUDA or MPS.
+             use_cuda_kernel (None | bool): whether to use BigVGan custom fused activation CUDA kernel, only for CUDA device.
+             use_deepspeed (bool): whether to use DeepSpeed or not.
+         """
+         if device is not None:
+             self.device = device
+             self.use_fp16 = False if device == "cpu" else use_fp16
+             self.use_cuda_kernel = use_cuda_kernel is not None and use_cuda_kernel and device.startswith("cuda")
+         elif torch.cuda.is_available():
+             self.device = "cuda:0"
+             self.use_fp16 = use_fp16
+             self.use_cuda_kernel = use_cuda_kernel is None or use_cuda_kernel
+         elif hasattr(torch, "xpu") and torch.xpu.is_available():
+             self.device = "xpu"
+             self.use_fp16 = use_fp16
+             self.use_cuda_kernel = False
+         elif hasattr(torch, "mps") and torch.backends.mps.is_available():
+             self.device = "mps"
+             self.use_fp16 = False  # Use float16 on MPS is overhead than float32
+             self.use_cuda_kernel = False
+         else:
+             self.device = "cpu"
+             self.use_fp16 = False
+             self.use_cuda_kernel = False
+             print(">> Be patient, it may take a while to run in CPU mode.")
+
+         self.cfg = OmegaConf.load(cfg_path)
+         self.model_dir = model_dir
+         self.dtype = torch.float16 if self.use_fp16 else None
+         self.stop_mel_token = self.cfg.gpt.stop_mel_token
+
+         self.qwen_emo = QwenEmotion(os.path.join(self.model_dir, self.cfg.qwen_emo_path))
+
+         self.gpt = UnifiedVoice(**self.cfg.gpt)
+         self.gpt_path = os.path.join(self.model_dir, self.cfg.gpt_checkpoint)
+         load_checkpoint(self.gpt, self.gpt_path)
+         self.gpt = self.gpt.to(self.device)
+         if self.use_fp16:
+             self.gpt.eval().half()
+         else:
+             self.gpt.eval()
+         print(">> GPT weights restored from:", self.gpt_path)
+
+         if use_deepspeed:
+             try:
+                 import deepspeed
+             except (ImportError, OSError, CalledProcessError) as e:
+                 use_deepspeed = False
+                 print(f">> Failed to load DeepSpeed. Falling back to normal inference. Error: {e}")
+
+         self.gpt.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=True, half=self.use_fp16)
+
+         if self.use_cuda_kernel:
+             # preload the CUDA kernel for BigVGAN
+             try:
+                 from indextts.s2mel.modules.bigvgan.alias_free_activation.cuda import activation1d
+
+                 print(">> Preload custom CUDA kernel for BigVGAN", activation1d.anti_alias_activation_cuda)
+             except Exception as e:
+                 print(">> Failed to load custom CUDA kernel for BigVGAN. Falling back to torch.")
+                 print(f"{e!r}")
+                 self.use_cuda_kernel = False
+
+         self.extract_features = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
+         self.semantic_model, self.semantic_mean, self.semantic_std = build_semantic_model(
+             os.path.join(self.model_dir, self.cfg.w2v_stat))
+         self.semantic_model = self.semantic_model.to(self.device)
+         self.semantic_model.eval()
+         self.semantic_mean = self.semantic_mean.to(self.device)
+         self.semantic_std = self.semantic_std.to(self.device)
+
+         semantic_codec = build_semantic_codec(self.cfg.semantic_codec)
+         semantic_code_ckpt = hf_hub_download("amphion/MaskGCT", filename="semantic_codec/model.safetensors")
+         safetensors.torch.load_model(semantic_codec, semantic_code_ckpt)
+         self.semantic_codec = semantic_codec.to(self.device)
+         self.semantic_codec.eval()
+         print('>> semantic_codec weights restored from: {}'.format(semantic_code_ckpt))
+
+         s2mel_path = os.path.join(self.model_dir, self.cfg.s2mel_checkpoint)
+         s2mel = MyModel(self.cfg.s2mel, use_gpt_latent=True)
+         s2mel, _, _, _ = load_checkpoint2(
+             s2mel,
+             None,
+             s2mel_path,
+             load_only_params=True,
+             ignore_modules=[],
+             is_distributed=False,
+         )
+         self.s2mel = s2mel.to(self.device)
+         self.s2mel.models['cfm'].estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
+         self.s2mel.eval()
+         print(">> s2mel weights restored from:", s2mel_path)
+
+         # load campplus_model
+         campplus_ckpt_path = hf_hub_download(
+             "funasr/campplus", filename="campplus_cn_common.bin"
+         )
+         campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
+         campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
+         self.campplus_model = campplus_model.to(self.device)
+         self.campplus_model.eval()
+         print(">> campplus_model weights restored from:", campplus_ckpt_path)
+
+         bigvgan_name = self.cfg.vocoder.name
+         self.bigvgan = bigvgan.BigVGAN.from_pretrained(bigvgan_name, use_cuda_kernel=self.use_cuda_kernel)
+         self.bigvgan = self.bigvgan.to(self.device)
+         self.bigvgan.remove_weight_norm()
+         self.bigvgan.eval()
+         print(">> bigvgan weights restored from:", bigvgan_name)
+
+         self.bpe_path = os.path.join(self.model_dir, self.cfg.dataset["bpe_model"])
+         self.normalizer = TextNormalizer()
+         self.normalizer.load()
+         print(">> TextNormalizer loaded")
+         self.tokenizer = TextTokenizer(self.bpe_path, self.normalizer)
+         print(">> bpe model loaded from:", self.bpe_path)
+
+         emo_matrix = torch.load(os.path.join(self.model_dir, self.cfg.emo_matrix))
+         self.emo_matrix = emo_matrix.to(self.device)
+         self.emo_num = list(self.cfg.emo_num)
+
+         spk_matrix = torch.load(os.path.join(self.model_dir, self.cfg.spk_matrix))
+         self.spk_matrix = spk_matrix.to(self.device)
+
+         self.emo_matrix = torch.split(self.emo_matrix, self.emo_num)
+         self.spk_matrix = torch.split(self.spk_matrix, self.emo_num)
+
+         mel_fn_args = {
+             "n_fft": self.cfg.s2mel['preprocess_params']['spect_params']['n_fft'],
+             "win_size": self.cfg.s2mel['preprocess_params']['spect_params']['win_length'],
+             "hop_size": self.cfg.s2mel['preprocess_params']['spect_params']['hop_length'],
+             "num_mels": self.cfg.s2mel['preprocess_params']['spect_params']['n_mels'],
+             "sampling_rate": self.cfg.s2mel["preprocess_params"]["sr"],
+             "fmin": self.cfg.s2mel['preprocess_params']['spect_params'].get('fmin', 0),
+             "fmax": None if self.cfg.s2mel['preprocess_params']['spect_params'].get('fmax', "None") == "None" else 8000,
+             "center": False
+         }
+         self.mel_fn = lambda x: mel_spectrogram(x, **mel_fn_args)
+
+         # cache for the speaker reference audio:
+         self.cache_spk_cond = None
+         self.cache_s2mel_style = None
+         self.cache_s2mel_prompt = None
+         self.cache_spk_audio_prompt = None
+         self.cache_emo_cond = None
+         self.cache_emo_audio_prompt = None
+         self.cache_mel = None
+
+         # progress display hook (optional)
+         self.gr_progress = None
+         self.model_version = self.cfg.version if hasattr(self.cfg, "version") else None
+
+     @torch.no_grad()
+     def get_emb(self, input_features, attention_mask):
+         vq_emb = self.semantic_model(
+             input_features=input_features,
+             attention_mask=attention_mask,
+             output_hidden_states=True,
+         )
+         feat = vq_emb.hidden_states[17]  # (B, T, C)
+         feat = (feat - self.semantic_mean) / self.semantic_std
+         return feat
+
+     def remove_long_silence(self, codes: torch.Tensor, silent_token=52, max_consecutive=30):
+         """
+         Shrink special tokens (silent_token and stop_mel_token) in codes
+         codes: [B, T]
+         """
+         code_lens = []
+         codes_list = []
+         device = codes.device
+         dtype = codes.dtype
+         isfix = False
+         for i in range(0, codes.shape[0]):
+             code = codes[i]
+             if not torch.any(code == self.stop_mel_token).item():
+                 len_ = code.size(0)
+             else:
+                 stop_mel_idx = (code == self.stop_mel_token).nonzero(as_tuple=False)
+                 len_ = stop_mel_idx[0].item() if len(stop_mel_idx) > 0 else code.size(0)
+
+             count = torch.sum(code == silent_token).item()
+             if count > max_consecutive:
+                 # code = code.cpu().tolist()
+                 ncode_idx = []
+                 n = 0
+                 for k in range(len_):
+                     assert code[
+                                k] != self.stop_mel_token, f"stop_mel_token {self.stop_mel_token} should be shrinked here"
+                     if code[k] != silent_token:
+                         ncode_idx.append(k)
+                         n = 0
+                     elif code[k] == silent_token and n < 10:
+                         ncode_idx.append(k)
+                         n += 1
+                     # if (k == 0 and code[k] == 52) or (code[k] == 52 and code[k-1] == 52):
+                     #     n += 1
+                 # new code
+                 len_ = len(ncode_idx)
+                 codes_list.append(code[ncode_idx])
+                 isfix = True
+             else:
+                 # shrink to len_
+                 codes_list.append(code[:len_])
+             code_lens.append(len_)
+         if isfix:
+             if len(codes_list) > 1:
+                 codes = pad_sequence(codes_list, batch_first=True, padding_value=self.stop_mel_token)
+             else:
+                 codes = codes_list[0].unsqueeze(0)
+         else:
+             # unchanged
+             pass
+         # clip codes to max length
+         max_len = max(code_lens)
+         if max_len < codes.shape[1]:
+             codes = codes[:, :max_len]
+         code_lens = torch.tensor(code_lens, dtype=torch.long, device=device)
+         return codes, code_lens
+
+     def insert_interval_silence(self, wavs, sampling_rate=22050, interval_silence=200):
+         """
+         Insert silences between generated segments.
+         wavs: List[torch.tensor]
+         """
+
+         if not wavs or interval_silence <= 0:
+             return wavs
+
+         # get channel_size
+         channel_size = wavs[0].size(0)
+         # get silence tensor
+         sil_dur = int(sampling_rate * interval_silence / 1000.0)
+         sil_tensor = torch.zeros(channel_size, sil_dur)
+
+         wavs_list = []
+         for i, wav in enumerate(wavs):
+             wavs_list.append(wav)
+             if i < len(wavs) - 1:
+                 wavs_list.append(sil_tensor)
+
+         return wavs_list
+
+     def _set_gr_progress(self, value, desc):
+         if self.gr_progress is not None:
+             self.gr_progress(value, desc=desc)
+
+     def _load_and_cut_audio(self,audio_path,max_audio_length_seconds,verbose=False,sr=None):
+         if not sr:
+             audio, sr = librosa.load(audio_path)
+         else:
+             audio, _ = librosa.load(audio_path,sr=sr)
+         audio = torch.tensor(audio).unsqueeze(0)
+         max_audio_samples = int(max_audio_length_seconds * sr)
+
+         if audio.shape[1] > max_audio_samples:
+             if verbose:
+                 print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples")
+             audio = audio[:, :max_audio_samples]
+         return audio, sr
+
+     # original inference mode
+     def infer(self, spk_audio_prompt, text, output_path,
+               emo_audio_prompt=None, emo_alpha=1.0,
+               emo_vector=None,
+               use_emo_text=False, emo_text=None, use_random=False, interval_silence=200,
+               verbose=False, max_text_tokens_per_segment=120, **generation_kwargs):
+         print(">> starting inference...")
+         self._set_gr_progress(0, "starting inference...")
+         #if verbose:
+         print(f"origin text:{text}, spk_audio_prompt:{spk_audio_prompt}, "
+               f"emo_audio_prompt:{emo_audio_prompt}, emo_alpha:{emo_alpha}, "
+               f"emo_vector:{emo_vector}, use_emo_text:{use_emo_text}, "
+               f"emo_text:{emo_text}")
+         start_time = time.perf_counter()
+
+         if use_emo_text or emo_vector is not None:
+             # we're using a text or emotion vector guidance; so we must remove
+             # "emotion reference voice", to ensure we use correct emotion mixing!
+             emo_audio_prompt = None
+
+         if use_emo_text:
+             # automatically generate emotion vectors from text prompt
+             if emo_text is None:
+                 emo_text = text  # use main text prompt
+             emo_dict = self.qwen_emo.inference(emo_text)
+             print(f"detected emotion vectors from text: {emo_dict}")
+             # convert ordered dict to list of vectors; the order is VERY important!
+             emo_vector = list(emo_dict.values())
+
+         if emo_vector is not None:
+             # we have emotion vectors; they can't be blended via alpha mixing
+             # in the main inference process later, so we must pre-calculate
+             # their new strengths here based on the alpha instead!
+             emo_vector_scale = max(0.0, min(1.0, emo_alpha))
+             if emo_vector_scale != 1.0:
+                 # scale each vector and truncate to 4 decimals (for nicer printing)
+                 emo_vector = [int(x * emo_vector_scale * 10000) / 10000 for x in emo_vector]
+                 print(f"scaled emotion vectors to {emo_vector_scale}x: {emo_vector}")
+
+         if emo_audio_prompt is None:
+             # we are not using any external "emotion reference voice"; use
+             # speaker's voice as the main emotion reference audio.
+             emo_audio_prompt = spk_audio_prompt
+             # must always use alpha=1.0 when we don't have an external reference voice
+             emo_alpha = 1.0
+
+         # only re-extract the reference features when the reference audio changed, to speed things up
+         if self.cache_spk_cond is None or self.cache_spk_audio_prompt != spk_audio_prompt:
+             audio,sr = self._load_and_cut_audio(spk_audio_prompt,15,verbose)
+             audio_22k = torchaudio.transforms.Resample(sr, 22050)(audio)
+             audio_16k = torchaudio.transforms.Resample(sr, 16000)(audio)
+
+             inputs = self.extract_features(audio_16k, sampling_rate=16000, return_tensors="pt")
+             input_features = inputs["input_features"]
+             attention_mask = inputs["attention_mask"]
+             input_features = input_features.to(self.device)
+             attention_mask = attention_mask.to(self.device)
+             spk_cond_emb = self.get_emb(input_features, attention_mask)
+
+             _, S_ref = self.semantic_codec.quantize(spk_cond_emb)
+             ref_mel = self.mel_fn(audio_22k.to(spk_cond_emb.device).float())
+             ref_target_lengths = torch.LongTensor([ref_mel.size(2)]).to(ref_mel.device)
+             feat = torchaudio.compliance.kaldi.fbank(audio_16k.to(ref_mel.device),
+                                                      num_mel_bins=80,
+                                                      dither=0,
+                                                      sample_frequency=16000)
+             feat = feat - feat.mean(dim=0, keepdim=True)  # feat2: another set of filterbank energy features [922, 80]
+             style = self.campplus_model(feat.unsqueeze(0))  # style2: global style vector of the reference audio [1, 192]
+
+             prompt_condition = self.s2mel.models['length_regulator'](S_ref,
+                                                                      ylens=ref_target_lengths,
+                                                                      n_quantizers=3,
+                                                                      f0=None)[0]
+
+             self.cache_spk_cond = spk_cond_emb
+             self.cache_s2mel_style = style
+             self.cache_s2mel_prompt = prompt_condition
+             self.cache_spk_audio_prompt = spk_audio_prompt
+             self.cache_mel = ref_mel
+         else:
+             style = self.cache_s2mel_style
+             prompt_condition = self.cache_s2mel_prompt
+             spk_cond_emb = self.cache_spk_cond
+             ref_mel = self.cache_mel
+
+         if emo_vector is not None:
+             weight_vector = torch.tensor(emo_vector).to(self.device)
+             if use_random:
+                 random_index = [random.randint(0, x - 1) for x in self.emo_num]
+             else:
+                 random_index = [find_most_similar_cosine(style, tmp) for tmp in self.spk_matrix]
+
+             emo_matrix = [tmp[index].unsqueeze(0) for index, tmp in zip(random_index, self.emo_matrix)]
+             emo_matrix = torch.cat(emo_matrix, 0)
+             emovec_mat = weight_vector.unsqueeze(1) * emo_matrix
+             emovec_mat = torch.sum(emovec_mat, 0)
+             emovec_mat = emovec_mat.unsqueeze(0)
+
+         if self.cache_emo_cond is None or self.cache_emo_audio_prompt != emo_audio_prompt:
+             emo_audio, _ = self._load_and_cut_audio(emo_audio_prompt,15,verbose,sr=16000)
+             emo_inputs = self.extract_features(emo_audio, sampling_rate=16000, return_tensors="pt")
+             emo_input_features = emo_inputs["input_features"]
+             emo_attention_mask = emo_inputs["attention_mask"]
+             emo_input_features = emo_input_features.to(self.device)
+             emo_attention_mask = emo_attention_mask.to(self.device)
+             emo_cond_emb = self.get_emb(emo_input_features, emo_attention_mask)
+
+             self.cache_emo_cond = emo_cond_emb
+             self.cache_emo_audio_prompt = emo_audio_prompt
+         else:
+             emo_cond_emb = self.cache_emo_cond
+
+         self._set_gr_progress(0.1, "text processing...")
+         text_tokens_list = self.tokenizer.tokenize(text)
+         segments = self.tokenizer.split_segments(text_tokens_list, max_text_tokens_per_segment)
+         segments_count = len(segments)
+         if verbose:
+             print("text_tokens_list:", text_tokens_list)
+             print("segments count:", segments_count)
+             print("max_text_tokens_per_segment:", max_text_tokens_per_segment)
+             print(*segments, sep="\n")
+         do_sample = generation_kwargs.pop("do_sample", True)
+         top_p = generation_kwargs.pop("top_p", 0.8)
+         top_k = generation_kwargs.pop("top_k", 30)
+         temperature = generation_kwargs.pop("temperature", 0.8)
+         autoregressive_batch_size = 1
+         length_penalty = generation_kwargs.pop("length_penalty", 0.0)
+         num_beams = generation_kwargs.pop("num_beams", 3)
+         repetition_penalty = generation_kwargs.pop("repetition_penalty", 10.0)
+         max_mel_tokens = generation_kwargs.pop("max_mel_tokens", 1500)
+         sampling_rate = 22050
+
+         wavs = []
+         gpt_gen_time = 0
+         gpt_forward_time = 0
+         s2mel_time = 0
+         bigvgan_time = 0
+         has_warned = False
+         for seg_idx, sent in enumerate(segments):
+             self._set_gr_progress(0.2 + 0.7 * seg_idx / segments_count,
+                                   f"speech synthesis {seg_idx + 1}/{segments_count}...")
+
+             text_tokens = self.tokenizer.convert_tokens_to_ids(sent)
+             text_tokens = torch.tensor(text_tokens, dtype=torch.int32, device=self.device).unsqueeze(0)
+             if verbose:
+                 print(text_tokens)
+                 print(f"text_tokens shape: {text_tokens.shape}, text_tokens type: {text_tokens.dtype}")
+                 # debug tokenizer
+                 text_token_syms = self.tokenizer.convert_ids_to_tokens(text_tokens[0].tolist())
+                 print("text_token_syms is same as segment tokens", text_token_syms == sent)
+
+             m_start_time = time.perf_counter()
+             with torch.no_grad():
+                 with torch.amp.autocast(text_tokens.device.type, enabled=self.dtype is not None, dtype=self.dtype):
+                     emovec = self.gpt.merge_emovec(
+                         spk_cond_emb,
+                         emo_cond_emb,
+                         torch.tensor([spk_cond_emb.shape[-1]], device=text_tokens.device),
+                         torch.tensor([emo_cond_emb.shape[-1]], device=text_tokens.device),
+                         alpha=emo_alpha
+                     )
+
+                     if emo_vector is not None:
+                         emovec = emovec_mat + (1 - torch.sum(weight_vector)) * emovec
+                         # emovec = emovec_mat
+
+                     codes, speech_conditioning_latent = self.gpt.inference_speech(
+                         spk_cond_emb,
+                         text_tokens,
+                         emo_cond_emb,
+                         cond_lengths=torch.tensor([spk_cond_emb.shape[-1]], device=text_tokens.device),
+                         emo_cond_lengths=torch.tensor([emo_cond_emb.shape[-1]], device=text_tokens.device),
+                         emo_vec=emovec,
+                         do_sample=True,
+                         top_p=top_p,
+                         top_k=top_k,
+                         temperature=temperature,
+                         num_return_sequences=autoregressive_batch_size,
+                         length_penalty=length_penalty,
+                         num_beams=num_beams,
+                         repetition_penalty=repetition_penalty,
+                         max_generate_length=max_mel_tokens,
+                         **generation_kwargs
+                     )
+
+                 gpt_gen_time += time.perf_counter() - m_start_time
+                 if not has_warned and (codes[:, -1] != self.stop_mel_token).any():
+                     warnings.warn(
+                         f"WARN: generation stopped due to exceeding `max_mel_tokens` ({max_mel_tokens}). "
+                         f"Input text tokens: {text_tokens.shape[1]}. "
+                         f"Consider reducing `max_text_tokens_per_segment`({max_text_tokens_per_segment}) or increasing `max_mel_tokens`.",
+                         category=RuntimeWarning
+                     )
+                     has_warned = True
+
+                 code_lens = torch.tensor([codes.shape[-1]], device=codes.device, dtype=codes.dtype)
+                 # if verbose:
+                 # print(codes, type(codes))
+                 # print(f"codes shape: {codes.shape}, codes type: {codes.dtype}")
+                 # print(f"code len: {code_lens}")
+
+                 code_lens = []
+                 for code in codes:
+                     if self.stop_mel_token not in code:
+                         code_lens.append(len(code))
+                         code_len = len(code)
+                     else:
+                         len_ = (code == self.stop_mel_token).nonzero(as_tuple=False)[0] + 1
+                         code_len = len_ - 1
+                         code_lens.append(code_len)
+                 codes = codes[:, :code_len]
+                 code_lens = torch.LongTensor(code_lens)
+                 code_lens = code_lens.to(self.device)
+                 if verbose:
+                     print(codes, type(codes))
+                     print(f"fix codes shape: {codes.shape}, codes type: {codes.dtype}")
+                     print(f"code len: {code_lens}")
+
+                 m_start_time = time.perf_counter()
+                 use_speed = torch.zeros(spk_cond_emb.size(0)).to(spk_cond_emb.device).long()
+                 with torch.amp.autocast(text_tokens.device.type, enabled=self.dtype is not None, dtype=self.dtype):
+                     latent = self.gpt(
+                         speech_conditioning_latent,
+                         text_tokens,
+                         torch.tensor([text_tokens.shape[-1]], device=text_tokens.device),
+                         codes,
+                         torch.tensor([codes.shape[-1]], device=text_tokens.device),
+                         emo_cond_emb,
+                         cond_mel_lengths=torch.tensor([spk_cond_emb.shape[-1]], device=text_tokens.device),
+                         emo_cond_mel_lengths=torch.tensor([emo_cond_emb.shape[-1]], device=text_tokens.device),
+                         emo_vec=emovec,
+                         use_speed=use_speed,
+                     )
+                     gpt_forward_time += time.perf_counter() - m_start_time
+
+                 dtype = None
+                 with torch.amp.autocast(text_tokens.device.type, enabled=dtype is not None, dtype=dtype):
+                     m_start_time = time.perf_counter()
+                     diffusion_steps = 25
+                     inference_cfg_rate = 0.7
+                     latent = self.s2mel.models['gpt_layer'](latent)
+                     S_infer = self.semantic_codec.quantizer.vq2emb(codes.unsqueeze(1))
+                     S_infer = S_infer.transpose(1, 2)
+                     S_infer = S_infer + latent
+                     target_lengths = (code_lens * 1.72).long()
+
+                     cond = self.s2mel.models['length_regulator'](S_infer,
+                                                                  ylens=target_lengths,
+                                                                  n_quantizers=3,
+                                                                  f0=None)[0]
+                     cat_condition = torch.cat([prompt_condition, cond], dim=1)
+                     vc_target = self.s2mel.models['cfm'].inference(cat_condition,
+                                                                    torch.LongTensor([cat_condition.size(1)]).to(
+                                                                        cond.device),
+                                                                    ref_mel, style, None, diffusion_steps,
+                                                                    inference_cfg_rate=inference_cfg_rate)
+                     vc_target = vc_target[:, :, ref_mel.size(-1):]
+                     s2mel_time += time.perf_counter() - m_start_time
+
+                 m_start_time = time.perf_counter()
+                 wav = self.bigvgan(vc_target.float()).squeeze().unsqueeze(0)
+                 print(wav.shape)
+                 bigvgan_time += time.perf_counter() - m_start_time
+                 wav = wav.squeeze(1)
+
+             wav = torch.clamp(32767 * wav, -32767.0, 32767.0)
+             if verbose:
+                 print(f"wav shape: {wav.shape}", "min:", wav.min(), "max:", wav.max())
+             # wavs.append(wav[:, :-512])
+             wavs.append(wav.cpu())  # to cpu before saving
+         end_time = time.perf_counter()
+
+         self._set_gr_progress(0.9, "saving audio...")
+         wavs = self.insert_interval_silence(wavs, sampling_rate=sampling_rate, interval_silence=interval_silence)
+         wav = torch.cat(wavs, dim=1)
+         wav_length = wav.shape[-1] / sampling_rate
+         print(f">> gpt_gen_time: {gpt_gen_time:.2f} seconds")
+         print(f">> gpt_forward_time: {gpt_forward_time:.2f} seconds")
+         print(f">> s2mel_time: {s2mel_time:.2f} seconds")
+         print(f">> bigvgan_time: {bigvgan_time:.2f} seconds")
+         print(f">> Total inference time: {end_time - start_time:.2f} seconds")
+         print(f">> Generated audio length: {wav_length:.2f} seconds")
+         print(f">> RTF: {(end_time - start_time) / wav_length:.4f}")
+
+         # save audio
+         wav = wav.cpu()  # to cpu
+         if output_path:
+             # save the audio directly to the given output path
+             if os.path.isfile(output_path):
+                 os.remove(output_path)
+                 print(">> remove old wav file:", output_path)
+             if os.path.dirname(output_path) != "":
+                 os.makedirs(os.path.dirname(output_path), exist_ok=True)
+             torchaudio.save(output_path, wav.type(torch.int16), sampling_rate)
+             print(">> wav file saved to:", output_path)
+             return output_path
+         else:
+             # return in the format Gradio expects
+             wav_data = wav.type(torch.int16)
+             wav_data = wav_data.numpy().T
+             return (sampling_rate, wav_data)
+
+
+ def find_most_similar_cosine(query_vector, matrix):
+     query_vector = query_vector.float()
+     matrix = matrix.float()
+
+     similarities = F.cosine_similarity(query_vector, matrix, dim=1)
+     most_similar_index = torch.argmax(similarities)
+     return most_similar_index
+
+ class QwenEmotion:
+     def __init__(self, model_dir):
+         self.model_dir = model_dir
+         self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
+         self.model = AutoModelForCausalLM.from_pretrained(
+             self.model_dir,
+             torch_dtype="float16",  # "auto"
+             device_map="auto"
+         )
+         self.prompt = "文本情感分类"
+         self.cn_key_to_en = {
+             "高兴": "happy",
+             "愤怒": "angry",
+             "悲伤": "sad",
+             "恐惧": "afraid",
+             "反感": "disgusted",
+             # TODO: the "低落" (melancholic) emotion will always be mapped to
+             # "悲伤" (sad) by QwenEmotion's text analysis. it doesn't know the
+             # difference between those emotions even if user writes exact words.
+             # SEE: `self.melancholic_words` for current workaround.
+             "低落": "melancholic",
+             "惊讶": "surprised",
+             "自然": "calm",
+         }
+         self.desired_vector_order = ["高兴", "愤怒", "悲伤", "恐惧", "反感", "低落", "惊讶", "自然"]
+         self.melancholic_words = {
+             # emotion text phrases that will force QwenEmotion's "悲伤" (sad) detection
+             # to become "低落" (melancholic) instead, to fix limitations mentioned above.
+             "低落",
+             "melancholy",
+             "melancholic",
+             "depression",
+             "depressed",
+             "gloomy",
+         }
+         self.max_score = 1.2
+         self.min_score = 0.0
+
+     def clamp_score(self, value):
+         return max(self.min_score, min(self.max_score, value))
+
+     def convert(self, content):
+         # generate emotion vector dictionary:
+         # - insert values in desired order (Python 3.7+ `dict` remembers insertion order)
+         # - convert Chinese keys to English
+         # - clamp all values to the allowed min/max range
+         # - use 0.0 for any values that were missing in `content`
+         emotion_dict = {
+             self.cn_key_to_en[cn_key]: self.clamp_score(content.get(cn_key, 0.0))
+             for cn_key in self.desired_vector_order
+         }
+
+         # default to a calm/neutral voice if all emotion vectors were empty
+         if all(val <= 0.0 for val in emotion_dict.values()):
+             print(">> no emotions detected; using default calm/neutral voice")
+             emotion_dict["calm"] = 1.0
+
+         return emotion_dict
+
+     def inference(self, text_input):
+         start = time.time()
+         messages = [
+             {"role": "system", "content": f"{self.prompt}"},
+             {"role": "user", "content": f"{text_input}"}
+         ]
+         text = self.tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True,
+             enable_thinking=False,
+         )
+         model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
+
+         # conduct text completion
+         generated_ids = self.model.generate(
+             **model_inputs,
+             max_new_tokens=32768,
+             pad_token_id=self.tokenizer.eos_token_id
+         )
+         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+         # parsing thinking content
+         try:
+             # rindex finding 151668 (</think>)
+             index = len(output_ids) - output_ids[::-1].index(151668)
+         except ValueError:
+             index = 0
+
+         content = self.tokenizer.decode(output_ids[index:], skip_special_tokens=True)
+
+         # decode the JSON emotion detections as a dictionary
+         try:
+             content = json.loads(content)
+         except json.decoder.JSONDecodeError:
+             # invalid JSON; fallback to manual string parsing
+             # print(">> parsing QwenEmotion response", content)
+             content = {
+                 m.group(1): float(m.group(2))
+                 for m in re.finditer(r'([^\s":.,]+?)"?\s*:\s*([\d.]+)', content)
+             }
+             # print(">> dict result", content)
+
+         # workaround for QwenEmotion's inability to distinguish "悲伤" (sad) vs "低落" (melancholic).
+         # if we detect any of the IndexTTS "melancholic" words, we swap those vectors
+         # to encode the "sad" emotion as "melancholic" (instead of sadness).
+         text_input_lower = text_input.lower()
+         if any(word in text_input_lower for word in self.melancholic_words):
+             # print(">> before vec swap", content)
+             content["悲伤"], content["低落"] = content.get("低落", 0.0), content.get("悲伤", 0.0)
+             # print(">> after vec swap", content)
+
+         return self.convert(content)
+
+
+ if __name__ == "__main__":
+     prompt_wav = "examples/voice_01.wav"
+     text = '欢迎大家来体验indextts2,并给予我们意见与反馈,谢谢大家。'
+
+     tts = IndexTTS2(cfg_path="checkpoints/config.yaml", model_dir="checkpoints", use_cuda_kernel=False)
+     tts.infer(spk_audio_prompt=prompt_wav, text=text, output_path="gen.wav", verbose=True)